Electron-Svelte-Recipe-Planner/NimRecipe/NimRecipe.nim
Ialbassort 6f36c5fd71 asd
2021-12-16 04:03:57 -05:00

270 lines
10 KiB
Nim

import httpclient
import strtabs # To access XmlAttributes
import os # To use splitFile
import strutils # To use cmpIgnoreCase
import tables
import json
import htmlparser
import std/xmltree
import std/jsonutils
import threadpool
#Normally i like do this order:
#template
#iterators
#functions
#classes
#vars
#but I want these allocation vars to be easy to find because they will need to be edited constantly.
#WARNING: THIS CODE MAY EVENTUALLY FIZZLE OUT, STALL AND THEN DIE. THIS IS BECAUSE OF A BUG IN HTTPCLIENT.CLOSE() THAT SHOWS UP AFTER MANY CLOSES.
#There aren't many workarounds. Not closing the clients leads to kernel issues caused by having too many open files.
#I tried closing them after every 10 urls, and keeping a list of HTTP clients to be called upon, but it doesn't work as nicely as you'd think.
#Using async to close them isn't really effective either and leads to other problems.
type
  # Amount part of one ingredient line: the amount as text plus the name of
  # the measure that goes with it.
  fancyamount* = ref object of RootObj
    amountnum*: string
    measure*: string

  # One ingredient line: its amount and the ingredient text.
  Recipeline* = ref object of RootObj
    fancy*: fancyamount
    ingredients*: string

  # A whole recipe: descriptive strings plus its ingredient lines.
  Recipe* = ref object
    name*: string
    author*: string
    url*: string
    lines*: seq[Recipeline]
    img*: string
# Names of the measures that can be recognised in an ingredient line. Only
# imperial units are listed: metric is a whole other can of worms and isn't
# used in NYTC.
# The array lives in shared memory so that code running on worker threads can
# read it.
# ! Normally this doesn't work because each thread has its own heap, but these
# values are only ever read after this point, so it seems to be fine.
# createShared's second argument is an element COUNT (default 1), not a byte
# size, so a single array is all that has to be requested.
# NOTE(review): "cup" comes before "cups" (likewise pound/ounce/teaspoon), so a
# first-match search over this list never reports those plurals - confirm
# whether that is intended before reordering.
var measures* = createShared(array[11, string])
measures[] = ["handful", "cup", "cups", "pound", "pounds", "ounce", "ounces", "tablespoons", "tablespoon", "teaspoon", "teaspoons"]
# Lookup from a fraction symbol to its plain "numerator/denominator" text.
# I remember there being a module that does this automatically, but I can't
# find it, so the table is maintained by hand.
# Only one TableRef slot is needed in shared memory: createShared's second
# argument is an element count, not a byte size.
# NOTE(review): printrecipes only consults this table for quantity strings
# that start with '&', while the keys below are literal glyphs - confirm the
# keys were not meant to be HTML entities.
var htmlconversion* = createShared(TableRef[string, string])
htmlconversion[] = {
  "⅛": "1/8", "¼": "1/4",
  "½": "1/2", "¾": "3/4",
  "⅓": "1/3", "⅔": "2/3",
  "⅕": "1/5", "⅖": "2/5",
  "⅗": "3/5",
  "⅘": "4/5", "⅙": "1/6",
  "⅚": "5/6", "⅜": "3/8",
  "⅝": "5/8", "⅞": "7/8"
}.newTable()
# Channels are not considered stable together with threadpool, though it
# appears to be fine for this use.
# recipeMaster carries finished recipes; urlChannel carries the paths that
# still have to be visited.
var recipeMaster = createShared(Channel[Recipe])
var urlChannel = createShared(Channel[string])
open(recipeMaster[])
open(urlChannel[])

# Seed the url queue with the four seasonal tag pages.
for seed in ["tag/spring", "tag/summer", "tag/winter", "tag/fall"]:
  send(urlChannel[], seed)
iterator `...`*[T](a: T, b: T): T =
  ## Yields `a`, then each following value obtained with `inc`, up to and
  ## including `b`. Nothing is yielded when `a` is already greater than `b`.
  var cursor: T = a
  while true:
    if not (cursor <= b):
      break
    yield cursor
    inc cursor
proc initFancyAmount(a: string, b: string): fancyamount =
  ## Creates a `fancyamount` whose `amountnum` is `a` and whose `measure`
  ## is `b`.
  result = fancyamount(amountnum: a, measure: b)
iterator rec(a: ptr Channel[string], maxIdlePolls = 30000,
             pollIntervalMs = 10): string =
  ## Drains the channel `a`, yielding every message that was really received.
  ##
  ## Whenever the channel has nothing to offer the iterator sleeps for
  ## `pollIntervalMs` milliseconds and counts one idle poll. The count is
  ## cumulative (it is never reset by a later message); the iterator ends
  ## after the idle poll that finds the count already at `maxIdlePolls`.
  ## The defaults reproduce the previous hard-coded 30000 polls of 10 ms.
  var idlePolls = 0
  var running = true
  while running:
    # `peek` reports -1 for a channel that is not open; treating only a
    # positive count as "data ready" keeps that case on the idle path so the
    # loop still terminates instead of spinning.
    if a[].peek > 0:
      let received = a[].tryRecv()
      # tryRecv can come back empty-handed; only hand out real messages
      # rather than the default empty string.
      if received.dataAvailable:
        yield received.msg
    else:
      if idlePolls >= maxIdlePolls:
        running = false
      sleep pollIntervalMs
      inc idlePolls
proc newRecipe(name: string, author: string, url: string, lines: seq[RecipeLine], img: string): Recipe =
  ## Creates a `Recipe` with every field set from the argument of the same
  ## name.
  result = Recipe(
    name: name,
    author: author,
    url: url,
    lines: lines,
    img: img
  )
proc findsubstring(input: seq[string], substring: string): seq[int] =
  ## Returns the indices (0-based, ascending) of every entry of `input` that
  ## contains `substring`.
  ##
  ## When no entry matches, the result is the single-element seq `@[-1]`.
  for idx, text in input:
    if substring in text:
      result.add(idx)
  # -1 is the "nothing found" marker.
  if result.len == 0:
    result.add(-1)
proc printrecipes(url : string, ssplit: seq[string], client : HttpClient) : Recipe =
  ## Builds a `Recipe` from one recipe page.
  ##
  ## * `url`    - stored unchanged in the returned recipe.
  ## * `ssplit` - the page's HTML split into lines; it is scanned both as a
  ##   parsed tree and line by line.
  ## * `client` - not used by this proc; kept so existing callers still
  ##   compile.
  ##
  ## Returns `nil` (after printing a message) when no byline can be found.
  ## Raises `ValueError` when a quantity span has no matching ingredient span.
  ## Malformed markup can still raise from the indexing and JSON parsing
  ## below, so callers should be prepared for exceptions.
  const
    quantityMarker = """<span class="quantity">"""
    ingredientMarker = """<span class="ingredient-name">"""
    unknownName = "placeholder"

  proc detectMeasure(input: string): string =
    # First entry of `measures` found in the normalized text, else "none".
    let normalized = normalize(input)
    for unit in measures[]:
      if normalized.contains(unit):
        return unit
    return "none"

  proc fractionText(quantity: string): string =
    # Takes what follows "frac", drops its last character and turns the two
    # leading characters into "a/b" (e.g. "&frac12;" gives "1/2").
    let digits = quantity.split("frac")[1][0 .. ^2]
    return digits[0] & "/" & digits[1]

  let html = parseHtml(ssplit.join(""))

  # Image: the last <img> with a non-empty src inside a <picture> wins.
  # `attr` gives "" for a missing attribute, so an <img> without src is
  # skipped instead of aborting the whole recipe.
  var img: string
  for picture in html.findAll("picture"):
    for image in picture.findAll("img"):
      let src = image.attr("src")
      if src.len > 0:
        img = src

  # Byline: htmlparser gives no class lookup, so the rendered <p> is searched
  # as text; the first child of a matching paragraph is used.
  var name = unknownName
  for paragraph in html.findAll("p"):
    if ($paragraph).contains("card-byline"):
      for child in paragraph.items:
        name = $child
        break
  if name == unknownName:
    # Fallback: the byline is also present in the page's bootstrap JSON.
    for line in ssplit:
      if line.contains("bootstrap.recipe"):
        name = parseJson(line.split("= ")[1].split(";")[0])["byline"].getStr
        break
  if name == unknownName:
    echo "Yea we tried everything but we cant find the author name"
    return nil

  # NOTE(review): the byline is stored in `name` and `author` is never filled
  # in; kept as-is because the written JSON may be consumed in this shape -
  # confirm which field the byline belongs in.
  let author = ""

  let quantityLines = findsubstring(ssplit, quantityMarker)
  let ingredientLines = findsubstring(ssplit, ingredientMarker)
  var recipeLines: seq[Recipeline]
  for i, quantityLine in quantityLines:
    # findsubstring reports "no match" as -1; indexing with it would read
    # ssplit[0] and produce a bogus ingredient line.
    if quantityLine < 0:
      continue
    if i >= ingredientLines.len or ingredientLines[i] < 0:
      raise newException(ValueError,
        "quantity span without a matching ingredient span in " & url)

    # The value of a span sits on the line after its opening tag.
    let quantity = ssplit[quantityLine + 1].strip()
    let ingredients = ssplit[ingredientLines[i] + 1].strip()

    if quantity.len == 0:
      recipeLines.add(Recipeline(
        fancy: initFancyAmount("Unspecified Amount of", "none"),
        ingredients: ingredients))
      continue

    var amount: string
    if quantity.contains("frac"):
      if quantity[0] == '&':
        # Bare fraction entity.
        amount = fractionText(quantity)
      else:
        # Whole number, a space, then a fraction entity.
        amount = quantity.split(" ")[0] & " " & fractionText(quantity)
    elif quantity[0] == '&':
      # Any other entity is translated through the lookup table; entries that
      # are not in the table are skipped, as before, but without relying on a
      # catch-all exception handler.
      if not htmlconversion[].hasKey(quantity):
        continue
      amount = htmlconversion[][quantity]
    else:
      amount = quantity

    recipeLines.add(Recipeline(
      fancy: initFancyAmount(amount, detectMeasure(ingredients)),
      ingredients: ingredients))

  return newRecipe(name, author, url, recipeLines, img)
proc geturl(input : string) : seq[string] =
  ## Extracts the links worth following from the HTML text `input`.
  ##
  ## The result holds, in this order:
  ## * the `href` of every `<a>` whose value starts with "/tag/";
  ## * the `data-url` of every `<article>` whose value is at least 10
  ##   characters long and has "/recipes/" within its first 10 characters.
  ##
  ## Elements without the attribute in question are ignored.
  # The parsed tree is an ordinary managed ref; holding it in manually
  # allocated memory and calling dealloc never released the tree itself.
  let page = parseHtml(input)
  # Only element nodes can be searched with findAll.
  if page.kind != xnElement:
    return

  # Tag pages. `attr` yields "" when the attribute (or the whole attribute
  # table) is missing, so no exception handling is needed.
  for anchor in page.findAll("a"):
    let href = anchor.attr("href")
    if href.startsWith("/tag/"):
      result.add(href)

  # Recipe pages. Same 10-character window as the original slice test.
  for article in page.findAll("article"):
    let target = article.attr("data-url")
    if target.len >= 10 and "/recipes/" in target[0 .. 9]:
      result.add(target)
proc recursive(uwu : string) {.thread.} =
  ## Worker for one path: downloads "https://cooking.nytimes.com/" & `uwu`,
  ## queues every link found on the page on `urlChannel` and, when the path
  ## contains "recipes/", sends the parsed recipe to `recipeMaster`.
  ##
  ## Paths containing '%' are skipped. Any exception raised while fetching or
  ## parsing is printed and otherwise ignored.
  if uwu.contains("%"):
    # Nothing was opened yet, so there is nothing to close.
    return
  let client = newHttpClient()
  try:
    let webread = client.getContent("https://cooking.nytimes.com/" & uwu)
    let links = geturl(webread)
    if uwu.contains("recipes/"):
      let recipe = printrecipes(uwu, webread.split("\n"), client)
      # printrecipes gives nil when it cannot build a recipe; do not queue it.
      if recipe != nil:
        recipeMaster[].send(recipe)
    for link in links:
      urlChannel[].send(link)
  except Exception as e:
    # Deliberately broad: page parsing signals bad markup with index errors
    # too, and one bad page must not take the worker down.
    echo e.msg
  finally:
    # Runs on every path out of the try block, so the client is always closed.
    client.close()
#the channels to communicate stuff
proc scanwebsite() =
  ## Spawns one `recursive` task for every path delivered by `rec` over
  ## `urlChannel`, printing a running count after each spawn.
  ##
  ## Tasks push newly found paths back onto the same channel, which is what
  ## keeps the loop going; it ends when `rec` stops yielding. Any exception
  ## is answered by printing "uwu".
  var spawned = 0
  try:
    for path in rec(urlChannel):
      spawn recursive(path)
      spawned += 1
      echo spawned
    echo "writing now"
  except:
    echo "uwu"
proc writeToFile() =
  ## Moves the recipes currently waiting in `recipeMaster` into a seq and
  ## writes that seq as JSON to "recipes.json".
  # A channel cannot be serialised directly, hence the intermediate seq.
  var collected: seq[Recipe]
  # The number of waiting messages is read once, up front.
  let pending = recipeMaster[].peek
  for _ in 1 .. pending:
    let received = recipeMaster[].tryRecv()
    # Skip failed receives and nil recipes so the file only holds real
    # entries.
    if received.dataAvailable and received.msg != nil:
      collected.add(received.msg)
  writeFile("recipes.json", $toJson(collected))
proc main() =
  ## Runs the crawl, then writes the collected recipes to disk.
  scanwebsite()
  writeToFile()

# `when` is resolved at compile time, so the call is only compiled in when
# this file is the main module.
when isMainModule:
  main()