AutomatedTiktokScraperAndCo.../main.nim

import nimpy
import httpclient
import uri
import q
import os
import xmltree
import sugar
import sequtils
import strutils
import strformat
import std/db_sqlite
import std/jsonutils,json
import random
import math
import asyncdispatch
import schedules
import times
# Seed the default random number generator.
randomize()


# Start every run with an empty "processing" working directory.
os.removeDir("processing")
os.createDir("processing")
# "archive" is only created when it is missing; existing contents are kept.
discard os.existsOrCreateDir("archive")
# Module-level SQLite connection used by the database helpers below.
let db = open("tiktok.db", "", "", "")
# `videos` holds one row per stored video: a numeric id plus its metadata as JSON text.
db.exec(sql"CREATE TABLE IF NOT EXISTS videos (id INTEGER NOT NULL, json LONGTEXT NOT NULL)")

proc getTotalVideos(tag : string) : int =
    ## Returns the number of rows in the table named `tag`.
    ##
    ## `tag` is spliced into the statement text because SQLite cannot bind a
    ## table name as a parameter, so callers must pass a trusted identifier.
    ## A database error (for example an unknown table) propagates to the caller.
    # Let SQLite count the rows instead of fetching every row just to tally it.
    return parseInt(db.getValue(sql(fmt"SELECT COUNT(*) FROM {tag}")))

proc isVideoInDb(alt  : string) : bool =
    ## Returns true when a stored video already has `alt` as its `altText`.
    ##
    ## The comparison is made against the decoded `altText` field of each row's
    ## JSON document. Matching the raw JSON text instead would miss captions
    ## containing characters that JSON escapes (quotes, backslashes, newlines)
    ## and would wrongly match captions that are merely a substring of any field.
    # getAllRows reads the whole result set before the loop starts, so returning
    # early never abandons a half-consumed statement.
    for row in db.getAllRows(sql"SELECT json FROM videos"):
        var stored : string
        try:
            # `{}` yields nil for a missing key and getStr maps nil to "".
            stored = parseJson(row[0]){"altText"}.getStr
        except JsonParsingError:
            # A row that is not valid JSON cannot describe a stored video.
            continue
        if stored == alt:
            return true
    return false

proc inAnyInList(a : openArray[string], b : string) : bool =
    ## Reports whether any entry of `a` contains `b`, comparing both sides in
    ## their `normalize`d form.
    let needle = normalize(b)
    return a.anyIt(normalize(it).contains(needle))

proc getRights(a : (string, string, HttpClient, ptr Channel[(string, string, int, seq[string])], int)) {.thread.} =
    ## Thread entry point: looks up the rights holders of one song in the BMI
    ## repertoire search and sends the answer through the channel.
    ##
    ## Tuple layout of `a`:
    ## * `a[0]` song title ("N/A" means there is nothing to look up)
    ## * `a[1]` artist
    ## * `a[2]` caller's HTTP client; kept for interface compatibility, not used
    ## * `a[3]` channel receiving `(song, artist, index, rightsHolders)`
    ## * `a[4]` index that is echoed back unchanged
    ##
    ## Raises `OSError` when every download attempt failed.
    if a[0] == "N/A":
        a[3][].send((a[0], a[1], a[4], newSeq[string]()))
        return

    # Percent-encode the query values (spaces become '+') so titles containing
    # characters such as '&' or '#' cannot corrupt the query string.
    let songName = encodeUrl(a[0], usePlus = true)
    let artist = encodeUrl(a[1], usePlus = true)

    # Every lookup thread uses its own client: the caller hands the same client
    # to all threads, and mutating/using one client concurrently is a data race.
    let client = newHttpClient()
    defer: client.close()
    # The site offers no API; this cookie is what lets the search page load.
    client.headers = newHttpHeaders({"Cookie" : "disc=2100-02-28T08%3A23%3A00.199Z; RepInstance=instance=REP2&expires=2/28/2100 5:13:54 AM"})

    # The site fails intermittently, so retry with a growing pause.
    const maxAttempts = 11
    var page = ""
    var fetched = false
    for attempt in 1 .. maxAttempts:
        try:
            page = client.request(fmt"https://repertoire.bmi.com/Search/Search?SearchForm.View_Count=&SearchForm.Main_Search=Title&SearchForm.Main_Search_Text={songName}&SearchForm.Sub_Search=Performer&SearchForm.Sub_Search_Text={artist}&SearchForm.Search_Type=all").body
            fetched = true
            break
        except CatchableError:
            if attempt == maxAttempts:
                # Nothing left to retry; do not wait or announce another try.
                break
            # 500 ms after the first failure, 50 ms more after each further one.
            sleep(450 + 50 * attempt)
            echo "trying again..."
    if not fetched:
        raise newException(OSError, fmt"we where unable to get repertoire data for {a[0]} by {a[1]}")

    let doc = q(page)
    # Two page layouts exist: a list of expander links when the rights holders
    # are well documented, and a plain table otherwise.
    let standard = doc.select("a.expander").map(x=>innerText(x))
    let cells = doc.select("table.style-01 tbody tr td")
    # Fall back to the first table cell only when there are no expander links
    # and the table actually has a cell.
    if standard.len() == 0 and cells.len() > 0:
        a[3][].send((a[0], a[1], a[4], @[innerText(cells[0]).strip()]))
    else:
        a[3][].send((a[0], a[1], a[4], standard))

proc processVideo(a : (int, string, HttpClient)) {.thread.} =
    ## Thread entry point: downloads `a[1]` with the client `a[2]` and stores the
    ## response body as `processing/<a[0]>.mp4`.
    let targetPath = fmt"processing/{a[0]}.mp4"
    let payload = a[2].request(a[1]).body
    writeFile(targetPath, payload)

# Metadata describing one scraped video. getVideosWithSongs serializes it to
# JSON and stores it in the `videos` table.
type videoDetails = object of RootObj
    # artist of the identified song
    artist : string
    # title of the identified song ("N/A" is the marker used when there is none)
    song : string
    # flag filled in by makeVideoDetails from the song title
    hasSong : bool
    # address the video file was downloaded from
    url : string
    # third element of the scraped video tuple (presumably the uploader - confirm against sel.py)
    author : string
    # page URL the video was scraped from
    pageFound : string
    # caption text of the video; hashtags are extracted from it
    altText : string
    # hashtags from the caption, plus a "#topic..." tag derived from the page URL
    tags : seq[string]
    # rights holders reported by getRights
    copyright : seq[string]

# One scheduled job: how often it runs, what to fetch, what to generate and
# which videos to leave out.
type IntervalRoutine = object of RootObj
    # weeks/days/hours/minutes/seconds are added together in Main to form the
    # interval between two runs
    weeks : int
    days : int
    hours : int
    minutes : int
    seconds : int
    # when set, generateRandomVideos leaves out videos whose stored artist is not "N/A"
    restrictHasSong : bool
    # switches on filtering by rightsCriteria in generateRandomVideos
    restrictRights : bool
    # normalized words; a video whose metadata contains one of them is left out
    rightsCriteria : seq[string]
    # (page URL, target count) pairs handed to getVideosWithSongs
    fetch : seq[(string, int)]
    # (table name, amount) pairs handed to generateRandomVideos
    generate : seq[(string, int)]

proc makeRoutine(weeks, days, hours, minutes, seconds : int; fetch, generate : seq[(string, int)],
restrictHasSong, restrictRights : bool, rightsCriteria : seq[string]) : IntervalRoutine=
    ## Builds an IntervalRoutine from its parts.
    ##
    ## Every entry of `rightsCriteria` is stored in `normalize`d form so small
    ## spelling differences (case, underscores) do not get in the way of matching.
    result.weeks = weeks
    result.days = days
    result.hours = hours
    result.minutes = minutes
    result.seconds = seconds
    result.fetch = fetch
    result.generate = generate
    result.restrictHasSong = restrictHasSong
    result.restrictRights = restrictRights
    # An empty criteria list simply yields an empty list here.
    result.rightsCriteria = rightsCriteria.mapIt(normalize(it))

proc makeVideoDetails(videoIn : (string, string, string), song, author, inputUrl : string, copyright : seq[string]) : videoDetails =
    ## Assembles the metadata record for one video.
    ##
    ## * `videoIn` - scraped tuple: `[0]` video URL, `[1]` caption, `[2]` author
    ## * `song` - identified song title, or "N/A" when none was identified
    ## * `author` - artist of the identified song (stored in the `artist` field)
    ## * `inputUrl` - page the video was found on
    ## * `copyright` - rights holders of the song
    ##
    ## The tags are the '#'-prefixed, space-separated words of the caption plus
    ## one synthetic "#topic..." tag derived from `inputUrl`.
    var tags = videoIn[1].split(" ").filterIt(it.len() != 0 and it[0] == '#')

    if inputUrl.contains("topics"):
        # Only add a topic tag when something follows "topics/"; a URL that
        # merely mentions "topics" has no topic name to take.
        let parts = inputUrl.split("topics/")
        if parts.len() > 1:
            tags.add(fmt"#topic{parts[1]}")

    elif inputUrl == "https://tiktok.com/":
        tags.add("#topicmain")

    # "N/A" marks a video without an identified song, so hasSong is its negation.
    return (videoDetails(artist : author, song : song, copyright : copyright, url : videoIn[0], altText : videoIn[1], tags : tags, author : videoIn[2], pageFound : inputUrl, hasSong : song != "N/A"))

proc getVideosWithSongs(url : string, target : int) =
    ## Scrapes videos from `url`, downloads the ones not stored yet, identifies
    ## their songs and rights holders, and records each new video.
    ##
    ## Steps:
    ## 1. run `sel.py` and call its `getVideosOnPage(url, target)`
    ## 2. drop videos whose caption is already in the database
    ## 3. download every remaining video into `processing/` (one thread each)
    ## 4. call the Python `idSong` on `./processing/`
    ## 5. look up the rights holders of every song (one thread each)
    ## 6. insert each result into `videos`, add its id to one table per hashtag
    ##    and move the file to `archive/<id>.mp4`
    let module = readFile("sel.py")
    discard pyBuiltinsModule().exec(module)

    let getVidsOnPage = pyGlobals()["getVideosOnPage"].to(proc(a : string, b : int) : seq[(string, string, string)]  {.gcsafe.})
    let idSong = pyGlobals()["idSong"].to(proc(a : string) : seq[(string, string, int)] {.gcsafe.})

    let client = newHttpClient()
    defer: client.close()
    client.headers = newHttpHeaders({"Referer" : "https://www.tiktok.com/"})

    let videos = getVidsOnPage(url, target).filter(x=> not isVideoInDb(x[1]))
    if videos.len() == 0:
        return

    # One download thread per video. The seq must hold videos.len entries;
    # sizing it with `.high` silently dropped the last video.
    var thr = newSeq[Thread[(int, string, HttpClient)]](videos.len())
    var startedDownloads = 0
    for x in 0 .. videos.high:
        try:
            createThread(thr[x], processVideo, (x, videos[x][0], client))
            startedDownloads.inc()
        except CatchableError:
            # Could not start another thread; carry on with those running.
            break
    # Only join threads that were actually started.
    for x in 0 ..< startedDownloads:
        joinThread(thr[x])

    let songs = idSong("./processing/").map(x=>(x[0].replace("\\", ""), x[1].replace("\\", ""), x[2]))

    # A single shared channel collects the lookup results. It is closed and
    # freed on every exit path.
    let endChannel = createShared(Channel[(string, string, int, seq[string])])
    endChannel[].open()
    defer:
        endChannel[].close()
        deallocShared(endChannel)

    # One lookup thread per identified song (not per video).
    var endthr = newSeq[Thread[(string, string, HttpClient,
    ptr Channel[(string, string, int, seq[string])], int)]](songs.len())
    var startedLookups = 0
    for x in 0 .. songs.high:
        try:
            createThread(endthr[x], getRights, (songs[x][0], songs[x][1], client, endChannel, songs[x][2]))
            startedLookups.inc()
        except CatchableError:
            break
    for x in 0 ..< startedLookups:
        joinThread(endthr[x])

    # All senders have finished, so drain exactly what is queued. Reading one
    # message more than is queued would produce an all-default entry.
    var exit : seq[(string, string, int, seq[string])]
    while true:
        let received = endChannel[].tryRecv()
        if not received.dataAvailable:
            break
        exit.add(received.msg)

    # The next id is the current number of stored videos.
    var start = getTotalVideos("videos")

    for x in exit:
        let (song, artist, index, rights) = x
        # Ignore results that point at no scraped video.
        if index < 0 or index > videos.high:
            continue
        # The same caption may have been stored earlier in this very loop.
        if isVideoInDb(videos[index][1]):
           continue
        let video = makeVideoDetails(videos[index], song, artist, url, rights)
        db.exec(sql"""INSERT INTO videos (id, json)
                              VALUES (?, ?)""", start, $tojson(video))
        for tag in video.tags:
            # Strip the leading '#'; the rest is used as the table name.
            let tag = tag[1 .. ^1]
            try:
                db.exec(sql(fmt"CREATE TABLE IF NOT EXISTS {tag} (lookup INTEGER NOT NULL)"))
                db.exec(sql(fmt"INSERT INTO {tag} (lookup) VALUES (?)"), start)
            except CatchableError:
                # Best effort: a tag that is not a usable table name is only reported.
                echo tag
        moveFile(fmt"processing/{index}.mp4", fmt"archive/{start}.mp4")
        echo fmt"processing/{index}.mp4; archive/{start}.mp4"
        start.inc()
#for x in db.fastRows(sql(fmt"SELECT * FROM {tag} WHERE lookup LIKE {rand(total)}")):

proc generateRandomVideos(tag : string, amount : int, start = true, ending = true, routine : IntervalRoutine) : seq[string] =
    ## Picks up to `amount` distinct random videos from the table `tag`, appends
    ## them to `concat.txt` and moves their files from `archive/` to `processing/`.
    ##
    ## * `start` - true: `concat.txt` is overwritten; false: it is appended to
    ## * `ending` - true: ffmpeg concatenates `concat.txt` into `output.mp4` and
    ##   everything in `processing/` is moved back to `archive/`
    ## * `routine` - supplies the restrictions applied while choosing
    ##
    ## Returns the chosen `file processing/<id>.mp4` lines. Fewer than `amount`
    ## lines are returned (after a printed warning) when the restrictions leave
    ## too few candidates. Raises `Exception` when the table itself has fewer
    ## than `amount` rows.

    let total = getTotalVideos(tag)
    if amount > total:
        raise newException(Exception, fmt"Cannot provide enough unique videos for the amount given due to there not being enough videos.")
    # Ids of files currently in processing/; extractFilename keeps this
    # independent of the platform's path separator.
    let pending = collect(for entry in os.walkDir("processing/"): extractFilename(entry.path).split(".")[0])
    echo "going"
    var choices : seq[string]

    for x in db.fastRows(sql(fmt"SELECT * FROM {tag}")):
        # Skip videos whose file is already in processing/.
        if pending.contains(x[0]):
            continue
        let fileFormat = fmt"file processing/{x[0]}.mp4"
        if choices.contains(fileFormat):
            continue
        if routine.restrictHasSong or routine.restrictRights:
            # Load the metadata of this video; the id is bound as a parameter.
            let rows = db.getAllRows(sql"SELECT json FROM videos WHERE id = ?", x[0])
            if rows.len() == 0:
                # Without metadata the restrictions cannot be checked.
                continue
            let current = parseJson(rows[^1][0])
            # Leave out videos in which a song was found.
            if routine.restrictHasSong and current["artist"].getStr != "N/A":
                continue
            # Leave out videos whose metadata mentions a restricted word, but
            # only when the rights restriction is switched on.
            if routine.restrictRights:
                let normalizedMeta = normalize($current)
                if routine.rightsCriteria.anyIt(normalizedMeta.contains(it)):
                    continue
        choices.add(fileFormat)

    if choices.len() < amount:
        echo "NOT ENOUGH VIDEOS TO SUSTAIN >:O, PICK LESS VIDEOS OR LESS RESTRICTIONS"
    # Shuffling and taking a prefix yields distinct random picks, always
    # terminates and never indexes past the candidate list.
    shuffle(choices)
    result = choices[0 ..< max(0, min(amount, choices.len()))]

    let ids = result.map(x=>x.split("/")[1])
    if start:
        writeFile("concat.txt", result.join("\n"))
    else:
        let concat = readFile("concat.txt") & "\n"
        writeFile("concat.txt", concat & result.join("\n"))
    echo ids
    for x in ids:
        echo x
        os.moveFile(fmt"archive/{x}", fmt"processing/{x}")
    discard os.execShellCmd("./normalizeFramerate.sh normalize")
    if ending:
        discard os.execShellCmd("ffmpeg -f concat -i concat.txt -c copy -y output.mp4")
        # List the directory first so files are not moved while it is walked.
        let leftovers = toSeq(walkDir("processing/"))
        for entry in leftovers:
            let fileName = extractFilename(entry.path)
            moveFile(entry.path, fmt"archive/{fileName}")

proc doRoutine(routine : IntervalRoutine) {.gcsafe.} =
    ## Runs one pass of `routine`: fetches every configured page, then builds
    ## the compilation from the configured `generate` entries.
    for x in routine.fetch:
        getVideosWithSongs(x[0], x[1])
        echo x
    if routine.generate.len() == 1:
        discard generateRandomVideos(routine.generate[0][0], routine.generate[0][1], routine = routine)
    else:
        let ending = routine.generate.high
        for iteration in 0 .. ending:
            # Use the entry of this iteration; indexing with `ending` processed
            # the last entry over and over.
            let current = routine.generate[iteration]
                                    #start: true false
                                    #inbetween: false false
                                    #final: false true
            discard generateRandomVideos(current[0], current[1],
                                     0 == iteration, iteration == ending, routine)

            # NOTE(review): the callback runs after every iteration of this loop
            # and never in the single-entry branch above - confirm that is intended.
            discard os.execShellCmd("./callback.sh")
proc Main() =
    ## Entry point: loads `routine.json`, runs the routine once right away and
    ## then repeats it at the configured interval.
    echo "Doing what I must because I can :D"
    let routineJson = parseJson(readFile("routine.json"))
    echo "a"

    # Interval components.
    let weeks = routineJson["Weeks"].getInt
    let days = routineJson["Days"].getInt
    let hours = routineJson["Hours"].getInt
    let minutes = routineJson["Minutes"].getInt
    let seconds = routineJson["Seconds"].getInt

    # Work items: pairs of (string, int).
    let fetchJobs = collect:
        for entry in routineJson["Routine"]["Fetch"]:
            (entry[0].getStr, entry[1].getInt)
    let generateJobs = collect:
        for entry in routineJson["Routine"]["Generate"]:
            (entry[0].getStr, entry[1].getInt)

    # Restrictions.
    let limitHasSong = routineJson["Routine"]["Restrictions"]["RestrictHasSong"].getBool
    let limitRights = routineJson["Routine"]["Restrictions"]["RestrictRights"].getBool
    let criteria = collect:
        for entry in routineJson["Routine"]["Restrictions"]["RightsCriteria"]:
            entry.getStr

    let routine = makeRoutine(weeks, days, hours, minutes, seconds,
                              fetchJobs, generateJobs,
                              limitHasSong, limitRights, criteria)

    # The scheduler does not cope with interval parts that are 0, so the parts
    # are folded into a single duration and only its length in seconds is used.
    let gap = initDuration(weeks = routine.weeks, days = routine.days, hours = routine.hours,
                           minutes = routine.minutes, seconds = routine.seconds)
    let totalSeconds = gap.inSeconds()

    doRoutine(routine)
    schedules:
        every(seconds=int(totalSeconds), async=true):
           doRoutine(routine)

Main()