270 lines
10 KiB
Nim
270 lines
10 KiB
Nim
import httpclient
|
|
import strtabs # To access XmlAttributes
|
|
import os # To use splitFile
|
|
import strutils # To use cmpIgnoreCase
|
|
import tables
|
|
import json
|
|
import htmlparser
|
|
import std/xmltree
|
|
import std/jsonutils
|
|
import threadpool
|
|
# Normally I like to use this order:
# templates
# iterators
# functions
# classes
# vars
# but I want these allocation vars to be easy to find because they will need to be edited constantly.
|
|
|
|
|
|
# THIS CODE MAY EVENTUALLY FIZZLE OUT, STALL AND THEN DIE. THIS IS BECAUSE OF A BUG IN HTTPCLIENT.CLOSE() THAT SHOWS UP AFTER MANY CLOSES.
# There aren't many workarounds. Not closing the clients leads to kernel issues caused by having too many open files.
# I tried closing them after 10 URLs, and keeping a list of HTTP clients to be called upon, but it doesn't work as nicely as you'd think.
# Using async to close them isn't really effective either and leads to other problems.
|
|
|
|
|
|
type
  fancyamount* = ref object of RootObj
    ## A quantity split into its amount text and the unit it is measured in.
    amountnum*: string ## The amount as text, e.g. "1/2".
    measure*: string   ## The unit word for the amount.
|
|
|
|
type
  Recipeline* = ref object of RootObj
    ## One ingredient row of a recipe.
    fancy*: fancyamount  ## Amount and unit for this row.
    ingredients*: string ## Ingredient text for this row.
|
|
|
|
type
  Recipe* = ref object
    ## A scraped recipe; this is the record that ends up in the JSON output.
    name*: string          ## Recipe name field.
    author*: string        ## Author field.
    url*: string           ## Path the recipe was fetched from.
    lines*: seq[Recipeline] ## Ingredient rows.
    img*: string           ## Image source URL.
|
|
|
|
|
|
|
|
# This list holds the measurements that can be detected. I can really only use imperial units because metric opens a whole other can of worms and isn't used in NYTC.
# Allocating these vars in shared memory lets threads access them.
# ! Normally this doesn't work because each thread has its own heap, but because these are read-only it seems to be fine?!
|
|
# Unit words that are searched for inside an ingredient description.
# `createShared(T, size)` takes an element COUNT (default 1), not a byte
# size: passing `sizeof(seq[string])` allocated several arrays although only
# the first one is ever used. One array is all that is needed.
var measures* = createShared(array[11, string])

measures[] = ["handful", "cup", "cups", "pound", "pounds", "ounce", "ounces", "tablespoons", "tablespoon", "teaspoon", "teaspoons"]
|
|
# Lookup table that rewrites fraction glyphs into plain "n/m" text.
# I remember there being a module that does this, but I can't find it, so
# this is the manual version.
#
# `createShared(T, size)` takes an element COUNT: the previous value of 200
# reserved 200 TableRef slots although only the first is ever assigned.
#
# NOTE(review): the only lookup visible in this file is made with strings
# whose first character is '&'; none of the glyph keys below start with '&',
# so those lookups cannot hit. Confirm whether the keys were meant to be HTML
# entities instead.
var htmlconversion* = createShared(TableRef[string, string])

htmlconversion[] = {
  "⅛": "1/8", "¼": "1/4",
  "½": "1/2", "¾": "3/4",
  "⅓": "1/3", "⅔": "2/3",
  "⅕": "1/5", "⅖": "2/5",
  "⅗": "3/5",
  "⅘": "4/5", "⅙": "1/6",
  "⅚": "5/6", "⅜": "3/8",
  "⅝": "5/8", "⅞": "7/8"
}.newTable()
|
|
|
|
# Queues shared between the main thread and the spawned tasks.
# Keep in mind channels are not stable with threadpool, though it seems to be
# fine here.
var recipeMaster = createShared(Channel[Recipe], 1)
var urlChannel = createShared(Channel[string], 1)

recipeMaster[].open()
urlChannel[].open()

# Seed the crawl with the four seasonal tag pages.
for seed in ["tag/spring", "tag/summer", "tag/winter", "tag/fall"]:
  urlChannel[].send(seed)
|
|
|
|
iterator `...`*[T](a: T, b: T): T =
  ## Yields every value from `a` up to and including `b`, stepping with
  ## `inc`. Yields nothing when `a` is already greater than `b`.
  var cursor: T = T(a)
  while cursor <= b:
    yield cursor
    inc cursor
|
|
|
|
proc initFancyAmount(a: string, b: string): fancyamount =
  ## Builds a `fancyamount` whose amount text is `a` and whose unit is `b`.
  fancyamount(amountnum: a, measure: b)
|
|
|
|
|
|
|
|
iterator rec(a: ptr Channel[string]): string =
  ## Yields messages from the channel `a` as they become available.
  ##
  ## While the channel reports nothing to read, the iterator sleeps 10 ms per
  ## poll and counts the poll; once 30000 polls have been counted it stops.
  ## The counter is never reset, so the budget covers the whole run rather
  ## than a single quiet stretch.
  var idlePolls = 0
  var keepGoing = true
  while keepGoing:
    if a[].peek == 0:
      if idlePolls >= 30000:
        keepGoing = false
      sleep 10
      inc idlePolls
    else:
      yield a[].tryRecv().msg
|
|
|
|
|
|
proc newRecipe(name: string, author: string, url: string, lines: seq[RecipeLine], img: string): Recipe =
  ## Builds a `Recipe` from its five fields, copied over one-to-one.
  result = Recipe(
    name: name,
    author: author,
    url: url,
    lines: lines,
    img: img)
|
|
|
|
proc findsubstring(input: seq[string], substring: string): seq[int] =
  ## Returns the indices of every entry of `input` that contains `substring`.
  ##
  ## When nothing matches, the result is the single-element sequence `@[-1]`
  ## (callers must treat a negative index as "not found").
  for index, line in input:
    if line.contains(substring):
      result.add(index)
  if result.len == 0:
    result.add(-1)
|
|
|
|
proc detectMeasure(input: string): string =
  ## Returns the longest entry of `measures` contained in the normalized
  ## `input`, or "none" when no entry matches.
  ##
  ## The longest match is used because the list holds both singular and
  ## plural forms; stopping at the first hit can never report a plural that is
  ## listed after its singular (e.g. "cups" after "cup").
  result = "none"
  let haystack = normalize(input)
  var bestLen = 0
  for unit in measures[]:
    if unit.len > bestLen and haystack.contains(unit):
      result = unit
      bestLen = unit.len

proc fracEntityToText(quantity: string): string =
  ## Rewrites a quantity containing an HTML `&fracNM;` style entity as "N/M".
  ##
  ## When the text does not start with '&', the first space-separated word is
  ## kept in front of the fraction ("1 &frac12;" becomes "1 1/2").
  ## If fewer than three characters follow "frac" (two digits plus the
  ## closing character), the input is returned unchanged instead of raising.
  let parts = quantity.split("frac")
  if parts.len < 2 or parts[1].len < 3:
    return quantity
  let fraction = parts[1][0] & "/" & parts[1][1]
  if quantity[0] == '&':
    return fraction
  return quantity.split(" ")[0] & " " & fraction

proc printrecipes(url : string, ssplit: seq[string], client : HttpClient) : Recipe =
  ## Builds a `Recipe` from the lines of a recipe page.
  ##
  ## `url` is stored on the result as-is, `ssplit` holds the page split into
  ## lines, and `client` is accepted for interface compatibility but not used.
  ## Returns nil when no byline can be found on the page.
  ##
  ## NOTE(review): the byline text is passed as the recipe *name* and `author`
  ## is never filled in, so `Recipe.author` is always empty. Left unchanged
  ## because consumers of the JSON output may rely on it - confirm.
  let html = parseHtml(ssplit.join(""))
  let quantityLines = findsubstring(ssplit, """<span class="quantity">""")
  let ingredientLines = findsubstring(ssplit, """<span class="ingredient-name">""")
  var img: string
  var author: string
  var name = "placeholder"
  var outputrecipe: seq[Recipeline]

  # Image: the last <img> inside a <picture> that carries a src wins.
  # `attr` returns "" for a missing attribute instead of raising.
  for picture in html.findAll("picture"):
    for image in picture.findAll("img"):
      let src = image.attr("src")
      if src.len > 0:
        img = src

  # htmlparser gives no class lookup, so search the serialized node text.
  for node in html.findAll("p"):
    if ($node).contains("card-byline"):
      # Only the first child of the byline paragraph is used.
      for child in node.items:
        name = $child
        break

  if name == "placeholder":
    # Fallback: the byline is also present in the page's bootstrap script.
    for line in ssplit:
      if line.contains("bootstrap.recipe"):
        let assignment = line.split("= ")
        if assignment.len < 2:
          continue
        name = parseJson(assignment[1].split(";")[0])["byline"].getStr
        break
    if name == "placeholder":
      echo "Yea we tried everything but we cant find the author name"
      return nil

  # The text of a quantity / ingredient sits on the line after its <span>.
  for i, quantityIdx in quantityLines:
    if i >= ingredientLines.len:
      break
    let ingredientIdx = ingredientLines[i]
    # findsubstring reports "no match" as -1; also guard the +1 lookups so a
    # marker on the last line cannot index past the end of the page.
    if quantityIdx < 0 or ingredientIdx < 0 or
        quantityIdx + 1 >= ssplit.len or ingredientIdx + 1 >= ssplit.len:
      continue
    let quantityText = ssplit[quantityIdx + 1].strip()
    let ingredients = ssplit[ingredientIdx + 1].strip()

    if quantityText.len == 0:
      outputrecipe.add(Recipeline(
        fancy: initFancyAmount("Unspecified Amount of", "none"),
        ingredients: ingredients))
      continue

    var amount: string
    if quantityText.contains("frac"):
      amount = fracEntityToText(quantityText)
    elif quantityText[0] == '&':
      # Unknown entities are skipped, as before, but without a bare `except`.
      if quantityText notin htmlconversion[]:
        continue
      amount = htmlconversion[][quantityText]
    else:
      amount = quantityText
    outputrecipe.add(Recipeline(
      fancy: initFancyAmount(amount, detectMeasure(ingredients)),
      ingredients: ingredients))

  return newRecipe(name, author, url, outputrecipe, img)
|
|
|
|
proc geturl(input : string) : seq[string] =
  ## Parses the HTML in `input` and returns the site-relative links found:
  ## first every `<a href>` starting with "/tag/", then every
  ## `<article data-url>` that starts with "/recipes/" and has something after
  ## that prefix.
  ##
  ## The parsed tree is held in an ordinary managed reference; the previous
  ## manual `create`/`dealloc` of a slot holding a GC'd ref was not needed.
  let page = parseHtml(input)

  # Tag pages. `attr` returns "" when the attribute (or the whole attribute
  # table) is missing, so no exception handling is needed.
  for anchor in page.findAll("a"):
    let href = anchor.attr("href")
    if href.startsWith("/tag/"):
      result.add(href)

  # Recipe pages. A bare "/recipes/" with nothing after it is skipped, as
  # before; matching is now anchored at the start of the value, where the old
  # ten-character window could also match one character in.
  for article in page.findAll("article"):
    let target = article.attr("data-url")
    if target.len > "/recipes/".len and target.startsWith("/recipes/"):
      result.add(target)
|
|
proc recursive(uwu : string) {.thread.} =
  ## Fetches the page at the site-relative path `uwu`, queues every link found
  ## on it, and - when the path contains "recipes/" - parses the page into a
  ## recipe and queues that as well.
  ##
  ## Paths containing '%' are ignored. Failures are reported with `echo` and
  ## otherwise swallowed so one bad page does not end the crawl.
  if uwu.contains("%"):
    # Checked before the client exists so there is nothing to clean up.
    return

  let client = newHttpClient()
  try:
    let webread = client.getContent("https://cooking.nytimes.com/" & uwu)
    let url = geturl(webread)

    if uwu.contains("recipes/"):
      let recipe = printrecipes(uwu, webread.split("\n"), client)
      # printrecipes returns nil when it cannot find a byline; do not queue
      # that, or it ends up as a null entry in the output.
      if not recipe.isNil:
        recipeMaster[].send(recipe)

    for urlx in url:
      urlChannel[].send(urlx)
  except Exception as e:
    # Deliberately broad: the page parsing can fail with index errors and
    # this task must not take the worker thread down.
    echo e.msg
  finally:
    # Runs on every path, so the client is always closed.
    client.close()
|
|
#the channels to communicate stuff
|
|
|
|
proc scanwebsite() =
  ## Drives the crawl: every path received from `urlChannel` is handed to a
  ## spawned `recursive` task, which in turn may queue further paths, so the
  ## loop keeps running while new work keeps arriving.
  ##
  ## `rec` does not raise at the end; it simply stops once its idle-poll
  ## budget is used up. After the loop, `sync()` waits for the tasks that are
  ## still running so the recipe channel is complete before it is written out.
  var counter = 0
  try:
    for url in rec(urlChannel):
      spawn recursive(url)
      inc counter
      echo counter
  except CatchableError as e:
    echo "uwu: ", e.msg
  # Barrier: without it, the caller can start writing while tasks still send.
  sync()
  echo "writing now"
|
|
|
|
proc writeToFile() =
  ## Drains `recipeMaster` into a sequence and writes it to "recipes.json".
  ##
  ## A channel cannot be serialised directly, so the recipes are collected
  ## first. `tryRecv` can report that nothing was received; such attempts are
  ## retried instead of being stored as an empty recipe.
  var collected: seq[Recipe]
  while recipeMaster[].peek > 0:
    let received = recipeMaster[].tryRecv()
    if received.dataAvailable:
      collected.add(received.msg)
  writeFile("recipes.json", $toJson(collected))
|
|
proc main() =
  ## Program entry point: crawl the site first, then dump what was collected.
  scanwebsite()
  writeToFile()
|
|
# Only run the scraper when this file is compiled as the main module.
when isMainModule:
  main()
|