334 lines
11 KiB
Nim
334 lines
11 KiB
Nim
|
import httpclient
|
||
|
import strtabs # To access XmlAttributes
|
||
|
import os # To use splitFile
|
||
|
import strutils # To use cmpIgnoreCase
|
||
|
import tables
|
||
|
import json
|
||
|
import htmlparser
|
||
|
import std/xmltree
|
||
|
import std/jsonutils
|
||
|
import asyncdispatch
|
||
|
import threadpool
|
||
|
import random
|
||
|
randomize()
|
||
|
# Normally I like to use this order:
#   templates
#   iterators
#   functions
#   classes
#   vars
# but I want these allocation vars to be easy to find because of the editing
# that will constantly need to be done.
|
||
|
|
||
|
|
||
|
|
||
|
# This is probably invalid code. It works for a while, but that doesn't change
# the fact that Nim doesn't really work well without a GC.
# This was an attempt at multithreading, and all my other attempts have failed.
# It has some cool ideas that I wish to archive in case I need to come back
# to it.
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Registry of manually allocated pointers, grouped by a float block key.
# `createShared`'s second argument is an element COUNT (default 1), not a byte
# size, so a single slot is requested here.
var pointerBlocks* = createShared(TableRef[float, seq[pointer]])
# The freshly allocated slot is zeroed, i.e. a nil TableRef; it has to be
# given a real table before anything indexes `pointerBlocks[]`.
pointerBlocks[] = newTable[float, seq[pointer]]()
|
||
|
|
||
|
|
||
|
template organizedPtr(x: untyped, a: untyped, c: float) =
  ## Declares a variable `x` in the caller's scope holding the value of the
  ## expression `a`, and registers a heap slot containing that value under
  ## block key `c` in `pointerBlocks`.
  ##
  ## `x` is a copy of the slot's content, not a pointer to it.
  let slot = create(typeof(a)) # one element; the count defaults to 1
  # Store the value: without this the slot (and therefore `x`) would only
  # ever hold the zero value of the type.
  slot[] = a
  # mgetOrPut creates the block on first use instead of raising KeyError.
  pointerBlocks[].mgetOrPut(c, newSeq[pointer]()).add(pointer(slot))
  var x = slot[]
|
||
|
|
||
|
|
||
|
|
||
|
template organizedPtr(x: untyped, a: typedesc, c: float) =
  ## Declares `x` in the caller's scope as a `ptr a` pointing at one freshly
  ## allocated, zero-initialised `a`, and registers that pointer under block
  ## key `c` in `pointerBlocks`.
  var x = create(a) # one element; `create`'s second argument is a count
  # mgetOrPut creates the block on first use instead of raising KeyError.
  pointerBlocks[].mgetOrPut(c, newSeq[pointer]()).add(pointer(x))
|
||
|
|
||
|
# You can probably do this automatically with macros when a proc executes, but
# for now it just exists on its own.
# This creates an ID block, so you can free all of its pointers with a single
# command: create an ID, look it up, then free it, etc.
|
||
|
proc cdbs(): float =
  ## Creates a new pointer block in `pointerBlocks` and returns its key.
  ##
  ## The key is a random float; the allocation that holds the key is the first
  ## pointer registered in the block.
  ##
  ## The return type is `float`: the value returned is the float key, and the
  ## callers pass it on to `float` parameters, so the former `int` annotation
  ## could not type-check.
  let id = create(float) # plain `create`, matching the `dealloc` used by `!`
  id[] = rand(100000000.0)
  # Re-roll on the (rare) collision so an existing block is never overwritten.
  while id[] in pointerBlocks[]:
    id[] = rand(100000000.0)
  # `[]=` creates the entry; `pointerBlocks[][key].add(...)` raises KeyError
  # for a key that is not in the table yet.
  pointerBlocks[][id[]] = @[pointer(id)]
  result = id[]
|
||
|
|
||
|
|
||
|
template `!`(a: ptr) =
  ## Prefix shorthand: releases the allocation behind `a` with `dealloc`.
  dealloc(a)
|
||
|
|
||
|
proc fcdbs(x: float) =
  ## Frees every pointer registered under block key `x` in `pointerBlocks`
  ## and removes the block. Does nothing when the key is unknown, so calling
  ## it twice for the same key is harmless.
  if x notin pointerBlocks[]:
    return
  for p in pointerBlocks[][x]:
    # Free the stored allocation itself. Taking `unsafeAddr` of the loop
    # variable would pass `dealloc` the address of the seq element instead.
    if p != nil:
      dealloc(p)
  # Drop the entry so the freed pointers cannot be released a second time.
  pointerBlocks[].del(x)
|
||
|
|
||
|
|
||
|
# So this idea is a bit weird:
# I'm going to make memory blocks of pointers, using random numbers as keys to
# stop race conditions, except in very rare situations.
|
||
|
|
||
|
|
||
|
type
  fancyamount* = ref object of RootObj
    ## Quantity of one ingredient line.
    amountnum*: string ## amount text
    measure*: string   ## measure word

  Recipeline* = ref object of RootObj
    ## One line of a recipe: a quantity plus the ingredient text.
    fancy*: fancyamount
    ingredients*: string

  Recipe* = ref object
    ## A recipe together with its ingredient lines.
    name*: string
    author*: string
    url*: string
    lines*: seq[Recipeline]
    img*: string
|
||
|
|
||
|
|
||
|
|
||
|
# These are the measurements that can be detected. I can really only use
# imperial, because metric opens a whole other can of worms and isn't used in
# NYTC.
# Allocating these vars this way allows them to be used on threads.
|
||
|
# NOTE: `createShared(T, n)` allocates `n` ELEMENTS of `T`. Each global below
# needs exactly one, so the count is left at its default of 1 rather than
# being given a byte size.

# Measure words looked for in ingredient text.
var measures* = createShared(seq[string])
measures[] = @["handful", "cup", "cups", "pound", "pounds", "ounce", "ounces",
               "tablespoons", "tablespoon", "teaspoon", "teaspoons"]

# Maps a vulgar-fraction character to its "a/b" spelling.
var htmlconversion* = createShared(TableRef[string, string])
htmlconversion[] = {
  "⅛": "1/8", "¼": "1/4",
  "½": "1/2", "¾": "3/4",
  "⅓": "1/3", "⅔": "2/3",
  "⅕": "1/5", "⅖": "2/5",
  "⅗": "3/5",
  "⅘": "4/5", "⅙": "1/6",
  "⅚": "5/6", "⅜": "3/8",
  "⅝": "5/8", "⅞": "7/8"
}.newTable()

# Work list of site paths, seeded with the seasonal tag pages.
var urlMaster* = createShared(seq[string])
urlMaster[] = @["tag/spring", "tag/summer", "tag/winter", "tag/fall"]
|
||
|
# Shared list collecting every scraped recipe. One element is enough:
# `createShared`'s second argument is an element count, not a byte size.
var recipeMaster = createShared(seq[Recipe])
# Start-up diagnostic. `urlMaster` is a pointer, so this prints the size of
# the pointer itself, not the number of URLs.
echo urlMaster.sizeOf()
|
||
|
|
||
|
proc bootlegGC() {.thread.} =
  ## Endless monitor loop: every 100 ms prints `sizeOf` of the seq stored in
  ## `urlMaster`. Never returns.
  while true:
    let reported = sizeOf(urlMaster[])
    echo reported
    sleep(100)
|
||
|
|
||
|
iterator `...`*[T](a: T, b: T): T =
  ## Yields every value from `a` up to and including `b`, stepping with `inc`.
  ## Yields nothing when `a` is greater than `b`.
  var current = a
  while true:
    if not (current <= b):
      break
    yield current
    inc current
|
||
|
|
||
|
|
||
|
|
||
|
proc initFancyAmount(a: string, b: string): fancyamount =
  ## Builds a `fancyamount` with `amountnum` set to `a` and `measure` set
  ## to `b`.
  new(result)
  result.amountnum = a
  result.measure = b
|
||
|
|
||
|
|
||
|
|
||
|
iterator rec(a: ptr seq[string], idleLimit = 30000,
             pollMs = 10): ptr string =
  ## Walks the list behind `a` from the front while the list may still be
  ## growing, yielding a pointer to a private copy of each entry.
  ##
  ## * `a`         - list to follow; it is re-read on every poll.
  ## * `idleLimit` - number of consecutive polls without a new entry after
  ##                 which the iterator stops (default 30000, as before).
  ## * `pollMs`    - milliseconds slept per poll, and once more per idle poll
  ##                 (default 10, as before).
  ##
  ## Every entry is yielded, including the last one: the loop compares the
  ## cursor against `len`, not `len - 1`, which also makes an empty list safe.
  ##
  ## The yielded pointer is allocated with `createShared` and is NOT freed
  ## here. `scanwebsite` passes it to a spawned thread that may still be
  ## reading it after the loop body has returned, so freeing it right after
  ## the `yield` would leave that thread with a dangling pointer. Whoever
  ## consumes the pointer should release it with `deallocShared` once it is
  ## no longer needed.
  var cursor = 0
  var idlePolls = 0
  while true:
    sleep pollMs
    # Index the shared list directly instead of copying it on every poll.
    if cursor < a[].len:
      idlePolls = 0
      let item = createShared(string)
      item[] = a[][cursor]
      inc cursor
      yield item
    else:
      if idlePolls >= idleLimit:
        break
      sleep pollMs
      inc idlePolls
|
||
|
|
||
|
|
||
|
proc newRecipe(name: string, author: string, url: string,
               lines: seq[RecipeLine], img: string): Recipe =
  ## Allocates a `Recipe` and fills each field from the parameter of the
  ## same name.
  new(result)
  result.name = name
  result.author = author
  result.url = url
  result.lines = lines
  result.img = img
|
||
|
|
||
|
proc findsubstring(input: seq[string], substring: string): seq[int] =
  ## Returns the indices of all entries of `input` that contain `substring`,
  ## in ascending order.
  ##
  ## When nothing matches (including for an empty `input`) the result is the
  ## single sentinel `@[-1]`, never an empty seq.
  # No pointer block is created here: nothing in this proc uses one, so the
  # earlier `cdbs()` call only produced an allocation that was never freed.
  for lineNo, line in input:
    if line.contains(substring):
      result.add(lineNo)
  if result.len == 0:
    result.add(-1)
|
||
|
# I remember there being a module to do this, but I can't find it, so we're
# stuck with the manual version.
|
||
|
|
||
|
proc printrecipes(url: ptr string, ssplit: ptr seq[string]): Recipe =
  ## Builds a `Recipe` from one downloaded recipe page.
  ##
  ## * `url`    - path of the page; stored in the result's `url` field.
  ## * `ssplit` - the page source split into lines.
  ##
  ## Returns `nil` (after printing a message) when no byline can be found,
  ## neither in a `<p>` whose markup contains "card-byline" nor in a line
  ## containing "bootstrap.recipe".
  ##
  ## Everything is held in ordinary locals. The previous version kept its
  ## values in manually allocated blocks and released them with `fcdbs`
  ## immediately BEFORE reading them for the return value.
  ##
  ## NOTE(review): the byline text ends up in the `name` field and `author`
  ## is always empty. That is what the code has always produced and it is
  ## kept as is, but it looks like the two fields are mixed up - confirm.

  proc detectMeasure(input: string): string =
    ## First entry of `measures` contained in the normalized `input`,
    ## or "none".
    result = "none"
    let normalized = normalize(input)
    for m in measures[]:
      if normalized.contains(m):
        return m

  let html = parseHtml(ssplit[].join(""))
  let quantityLines = findsubstring(ssplit[], """<span class="quantity">""")
  let ingredientLines =
    findsubstring(ssplit[], """<span class="ingredient-name">""")
  let author = "" # never filled in; see NOTE(review) above
  var
    img: string
    name = "placeholder"
    outputrecipe: seq[Recipeline]

  # Image: the last <img> with a `src` inside any <picture> wins.
  # `attr` returns "" for a missing attribute and for elements without any
  # attributes, where indexing `attrs` directly would fail.
  for picture in html.findAll("picture"):
    for image in picture.findAll("img"):
      let src = image.attr("src")
      if src.len > 0:
        img = src

  # Byline: htmlparser gives no class lookup, so search the rendered markup.
  for node in html.findAll("p"):
    if ($node).contains("card-byline"):
      # Only the first child is of interest.
      for child in node.items:
        name = $child
        break

  if name == "placeholder":
    # Sometimes the markup lookup fails, but the byline is also embedded in
    # the page's JavaScript as JSON, so parse the raw lines for it.
    for line in ssplit[]:
      if line.contains("bootstrap.recipe"):
        let parts = line.split("= ")
        if parts.len > 1:
          name = parseJson(parts[1].split(";")[0])["byline"].getStr
        break
    if name == "placeholder":
      echo "Yea we tried everything but we cant find the author name"
      return nil

  # `findsubstring` reports "no match" as the single sentinel -1, which must
  # not be used as a line index. Also never read past the shorter list.
  let pairCount =
    if quantityLines == @[-1] or ingredientLines == @[-1]: 0
    else: min(quantityLines.len, ingredientLines.len)

  for i in 0 ..< pairCount:
    # The text of interest sits on the line after the matching <span>.
    let quantityIdx = quantityLines[i] + 1
    let ingredientIdx = ingredientLines[i] + 1
    if quantityIdx >= ssplit[].len or ingredientIdx >= ssplit[].len:
      continue
    let quantity = ssplit[][quantityIdx].strip()
    let ingredients = ssplit[][ingredientIdx].strip()

    if quantity.len == 0:
      outputrecipe.add(Recipeline(
        fancy: initFancyAmount("Unspecified Amount of", "none"),
        ingredients: ingredients))
      continue

    var amount: string
    if quantity.contains("frac"):
      # e.g. "&frac12;" -> "12;" -> "1/2", or "1 &frac12;" -> "1 1/2".
      let tail = quantity.split("frac")[1]
      if tail.len < 3:
        continue # malformed fraction entity, nothing sensible to store
      let fraction = tail[0] & "/" & tail[1]
      amount =
        if quantity[0] == '&': fraction
        else: quantity.split(" ")[0] & " " & fraction
    elif quantity[0] == '&':
      # Unknown entries are skipped, as before; an explicit key check
      # replaces the former catch-all `except`.
      if not htmlconversion[].hasKey(quantity):
        continue
      amount = htmlconversion[][quantity]
    else:
      amount = quantity

    outputrecipe.add(Recipeline(
      fancy: initFancyAmount(amount, detectMeasure(ingredients)),
      ingredients: ingredients))

  return newRecipe(name, author, url[], outputrecipe, img)
|
||
|
|
||
|
proc geturl(input: string): seq[string] =
  ## Extracts the links of interest from the HTML text `input`.
  ##
  ## Returns, in document order per group:
  ## 1. the `href` of every `<a>` whose value starts with "/tag/";
  ## 2. the `data-url` of every `<article>` whose first ten characters
  ##    contain "/recipes/" (values shorter than ten characters are skipped,
  ##    as before).
  ##
  ## Elements without the attribute - or without any attributes at all - are
  ## skipped. `attr` returns "" in both cases, so no exception handling is
  ## needed and nothing depends on catching `IndexDefect`.
  # The parsed tree is an ordinary managed value; it is released
  # automatically when the proc returns.
  let page = parseHtml(input)

  # Tag pages.
  for node in page.findAll("a"):
    let href = node.attr("href")
    if href.startsWith("/tag/"):
      result.add(href)

  # Recipe pages.
  for node in page.findAll("article"):
    let dataUrl = node.attr("data-url")
    if dataUrl.len >= 10 and "/recipes/" in dataUrl[0 .. 9]:
      result.add(dataUrl)
|
||
|
proc recursive(uwu: ptr) {.thread.} =
  ## Worker for one site path: downloads
  ## "https://cooking.nytimes.com/" & `uwu[]`, records the recipe when the
  ## path contains "recipes/", and appends every link found on the page that
  ## is not yet in `urlMaster`.
  ##
  ## Paths containing '%' are ignored. Any exception is printed and
  ## swallowed, so one bad page never takes the worker down.
  ##
  ## NOTE(review): `recipeMaster` and `urlMaster` are modified here without
  ## any lock while several of these workers may run at once - confirm
  ## whether that is acceptable.
  let client = newHttpClient()
  # Close the client on every exit path: early return, success and failure.
  defer: client.close()

  try:
    if uwu[].contains("%"):
      return

    # Ordinary locals replace the manually allocated buffers, which were
    # never released when an exception skipped the frees at the end.
    var webread =
      client.getContent("https://cooking.nytimes.com/" & uwu[]).split("\n")

    if uwu[].contains("recipes/"):
      let recipe = printrecipes(uwu, addr webread)
      # printrecipes gives up with nil when it finds no byline; do not
      # store that in the result list.
      if recipe != nil:
        recipeMaster[].add(recipe)

    for urlx in geturl(webread.join("")):
      if not urlMaster[].contains(urlx):
        urlMaster[].add(urlx)
  except Exception as e:
    # Deliberately broad: this is a best-effort crawler.
    echo e.msg
|
||
|
|
||
|
#the channels to communicate stuff
|
||
|
|
||
|
proc scanwebsite() =
  ## Starts the `bootlegGC` monitor, then spawns one `recursive` worker for
  ## every entry that `rec` yields from `urlMaster`.
  ##
  ## `rec` keeps polling the list, so entries added by the workers are picked
  ## up as well; the loop ends when `rec` stops yielding. Any exception raised
  ## on the way is answered with a short message.
  spawn bootlegGC()
  echo "yea this should be going"
  try:
    for pending in rec(urlMaster):
      spawn recursive(pending)
      # echo GC_getStatistics()
    echo "writing now"
  except:
    echo "uwu"
|
||
|
|
||
|
proc writeToFile() =
  ## Writes the collected data to the working directory: "beta.txt" gets the
  ## entries of `urlMaster`, one per line, and "recipes.json" gets
  ## `recipeMaster` serialised with `toJson`.
  let urlDump = urlMaster[].join("\n")
  writeFile("beta.txt", urlDump)
  let recipeDump = $toJson(recipeMaster)
  writeFile("recipes.json", recipeDump)
|
||
|
proc main() =
  ## Program entry point: crawl first, then write the results.
  scanwebsite()
  writeToFile()

# Run only when this file is the main module, not when it is imported.
when isMainModule:
  main()
|