AutomatedTiktokScraperAndCo.../main.nim
2022-03-14 18:12:38 -04:00

305 lines
No EOL
12 KiB
Nim

import nimpy
import httpclient
import uri
import q
import os
import xmltree
import sugar
import sequtils
import strutils
import strformat
import std/db_sqlite
import std/jsonutils,json
import random
import math
import asyncdispatch
import schedules
import times
# --- Module start-up: RNG, working directories and the SQLite store ---
randomize()

# "processing" is scratch space: drop whatever is there and start empty.
removeDir("processing")
createDir("processing")
# "archive" is only created when it does not exist yet.
discard existsOrCreateDir("archive")

# Database handle shared by the procs below. `videos` holds one row per clip:
# a numeric id and the clip's metadata as JSON text.
let db = open("tiktok.db", "", "", "")
db.exec(sql"CREATE TABLE IF NOT EXISTS videos (id INTEGER NOT NULL, json LONGTEXT NOT NULL)")
proc getTotalVideos(tag : string) : int =
  ## Returns the number of rows stored in the table named `tag`.
  ##
  ## The counting is done by SQLite (`COUNT(*)`), so no rows are transferred
  ## just to be tallied.
  ##
  ## NOTE(review): `tag` is spliced into the SQL text because a table name
  ## cannot be bound as a parameter; only pass trusted table names.
  parseInt(db.getValue(sql(fmt"SELECT COUNT(*) FROM {tag}")))
proc isVideoInDb(alt : string) : bool =
  ## True when `alt` occurs as a substring of the second column (the JSON
  ## text) of any row in `videos`.
  # Every row is fetched up front, then the JSON column is scanned.
  let stored = db.getAllRows(sql"SELECT * FROM videos")
  result = stored.anyIt(it[1].contains(alt))
proc inAnyInList(a : openArray[string], b : string) : bool =
  ## Reports whether any entry of `a` contains `b`. Both sides are passed
  ## through `strutils.normalize` before the substring test.
  let needle = b.normalize
  result = a.anyIt(needle in it.normalize)
proc getRights(a : (string, string, HttpClient, ptr Channel[(string, string, int, seq[string])], int)) {.thread.} =
  ## Thread entry point: looks up the rights holders of one song on the BMI
  ## repertoire site and posts the answer on the result channel.
  ##
  ## `a` is `(song, artist, client, channel, index)`. One message
  ## `(song, artist, index, holders)` is sent on `a[3]`. A song of "N/A" is
  ## answered immediately with an empty holder list.
  ##
  ## Raises `OSError` (and sends nothing) when every request attempt fails.
  if a[0] == "N/A":
    a[3][].send((a[0], a[1], a[4], newSeq[string]()))
    return
  # Percent-encode the query values (spaces become '+') so that characters
  # such as '&', '#' or '=' in a title cannot corrupt the query string.
  let songName = encodeUrl(a[0], usePlus = true)
  let artist = encodeUrl(a[1], usePlus = true)
  # The site offers no API; this far-future cookie is sent with every request.
  a[2].headers = newHttpHeaders({"Cookie" : "disc=2100-02-28T08%3A23%3A00.199Z; RepInstance=instance=REP2&expires=2/28/2100 5:13:54 AM"})
  const lastAttempt = 10
  var page = ""
  var fetched = false
  # The site intermittently refuses requests, so retry with a growing pause.
  for attempt in 0 .. lastAttempt:
    try:
      page = a[2].request(fmt"https://repertoire.bmi.com/Search/Search?SearchForm.View_Count=&SearchForm.Main_Search=Title&SearchForm.Main_Search_Text={songName}&SearchForm.Sub_Search=Performer&SearchForm.Sub_Search_Text={artist}&SearchForm.Search_Type=all").body
      fetched = true
      break
    except CatchableError:
      if attempt == lastAttempt:
        sleep 2000
      else:
        # 500 ms scaled by 1.0, 1.1, 1.2, ... 1.9
        sleep int(math.floor(500.0 * (1.0 + attempt / 10)))
      echo "trying again..."
  if not fetched:
    raise newException(OSError, fmt"we where unable to get repertoire data for {a[0]} by {a[1]}")
  let doc = q(page)
  # Two page layouts exist: well documented works list their holders as
  # `a.expander` links; otherwise the first cell of the results table is used.
  let standard = doc.select("a.expander").map(x => innerText(x))
  let fallbackCells = doc.select("table.style-01 tbody tr td")
  let holders =
    if standard.len == 0 and fallbackCells.len > 0:
      @[innerText(fallbackCells[0]).strip()]
    else:
      standard
  a[3][].send((a[0], a[1], a[4], holders))
proc processVideo(a : (int, string, HttpClient)) {.thread.} =
  ## Thread entry point: requests the URL `a[1]` with client `a[2]` and writes
  ## the response body to `processing/<a[0]>.mp4`.
  let (index, videoUrl, client) = a
  let target = fmt"processing/{index}.mp4"
  let payload = client.request(videoUrl).body
  writeFile(target, payload)
type
  videoDetails = object of RootObj
    ## Metadata record kept for one scraped clip (stored as JSON).
    artist: string          ## artist of the identified song
    song: string            ## title of the identified song
    hasSong: bool           ## flag derived from `song` when the record is built
    url: string             ## download URL of the clip
    author: string          ## third element of the scraped video tuple
    pageFound: string       ## page URL the clip was scraped from
    altText: string         ## alt text of the clip
    tags: seq[string]       ## '#'-prefixed tags attached to the clip
    copyright: seq[string]  ## rights holders looked up for the song
type
  IntervalRoutine = object of RootObj
    ## One scheduled job: how often it runs, what it fetches and generates,
    ## and which clips are excluded.
    # Interval between runs, split into components.
    weeks: int
    days: int
    hours: int
    minutes: int
    seconds: int
    # Restrictions applied while picking clips.
    restrictHasSong: bool
    restrictRights: bool
    rightsCriteria: seq[string]
    # Work lists of (string, int) pairs.
    fetch: seq[(string, int)]
    generate: seq[(string, int)]
proc makeRoutine(weeks, days, hours, minutes, seconds : int; fetch, generate : seq[(string, int)],
                 restrictHasSong, restrictRights : bool, rightsCriteria : seq[string]) : IntervalRoutine =
  ## Bundles the interval, the work lists and the restriction settings into an
  ## `IntervalRoutine`.
  ##
  ## Each entry of `rightsCriteria` is stored in `strutils.normalize`d form so
  ## that minor typos do not interfere with matching.
  result.weeks = weeks
  result.days = days
  result.hours = hours
  result.minutes = minutes
  result.seconds = seconds
  result.fetch = fetch
  result.generate = generate
  result.restrictRights = restrictRights
  result.restrictHasSong = restrictHasSong
  result.rightsCriteria = rightsCriteria.mapIt(it.normalize)
proc makeVideoDetails(videoIn : (string, string, string), song, author, inputUrl : string, copyright : seq[string]) : videoDetails =
  ## Builds the metadata record for one scraped clip.
  ##
  ## * `videoIn`   - `(url, alt text, author)` of the clip.
  ## * `song`, `author` - identified song and its artist; `song` is "N/A"
  ##   when no song was identified.
  ## * `inputUrl`  - page the clip was found on.
  ## * `copyright` - rights holders looked up for the song.
  ##
  ## Tags are the '#'-prefixed words of the alt text, plus a synthetic
  ## `#topic...` tag derived from `inputUrl` when it names a topic page or is
  ## the main page.
  var tags = videoIn[1].split(" ").filterIt(it.startsWith('#'))
  if inputUrl.contains("topics"):
    let pieces = inputUrl.split("topics/")
    # A URL mentioning "topics" without a "topics/<name>" part has nothing to
    # name the tag after; skip the tag instead of indexing past the end.
    if pieces.len > 1:
      tags.add(fmt"#topic{pieces[1]}")
  elif inputUrl == "https://tiktok.com/":
    tags.add("#topicmain")
  result = videoDetails(
    artist : author,
    song : song,
    copyright : copyright,
    url : videoIn[0],
    altText : videoIn[1],
    tags : tags,
    author : videoIn[2],
    pageFound : inputUrl,
    # "N/A" is the marker for "no song identified".
    hasSong : song != "N/A")
proc getVideosWithSongs(url : string, target : int) =
  ## Scrapes clips from `url`, downloads the ones not yet in the database,
  ## identifies their songs, looks up the rights holders and records each clip:
  ## a row in `videos`, a row in one table per hashtag, and the file moved to
  ## `archive/<id>.mp4`.
  ##
  ## Scraping and song identification are delegated to the Python functions
  ## `getVideosOnPage` and `idSong` loaded from `sel.py`; `target` is passed
  ## through to `getVideosOnPage` unchanged.
  type RightsMsg = (string, string, int, seq[string])

  let module = readFile("sel.py")
  discard pyBuiltinsModule().exec(module)
  let getVidsOnPage = pyGlobals()["getVideosOnPage"].to(proc(a : string, b : int) : seq[(string, string, string)] {.gcsafe.})
  let idSong = pyGlobals()["idSong"].to(proc(a : string) : seq[(string, string, int)] {.gcsafe.})
  let client = newHttpClient()
  defer: client.close()
  client.headers = newHttpHeaders({"Referer" : "https://www.tiktok.com/"})
  let videos = getVidsOnPage(url, target).filterIt(not isVideoInDb(it[1]))
  if videos.len == 0:
    return

  # --- download every new clip, one thread per clip ---
  # The seq needs `videos.len` slots; with `videos.high` the last clip would
  # have no thread and would never be downloaded.
  var downloaders = newSeq[Thread[(int, string, HttpClient)]](videos.len)
  var started = 0
  for i in 0 .. videos.high:
    try:
      createThread(downloaders[i], processVideo, (i, videos[i][0], client))
      inc started
    except CatchableError:
      break
  # Join only the threads that were actually created.
  for i in 0 ..< started:
    joinThread(downloaders[i])

  let songs = idSong("./processing/").map(x => (x[0].replace("\\", ""), x[1].replace("\\", ""), x[2]))

  # --- look up the rights holders, one thread per identified song ---
  let endChannel = createShared(Channel[RightsMsg])
  endChannel[].open()
  defer:
    endChannel[].close()
    deallocShared(endChannel)
  var lookups = newSeq[Thread[(string, string, HttpClient, ptr Channel[RightsMsg], int)]](songs.len)
  started = 0
  for i in 0 .. songs.high:
    try:
      createThread(lookups[i], getRights, (songs[i][0], songs[i][1], client, endChannel, songs[i][2]))
      inc started
    except CatchableError:
      break
  for i in 0 ..< started:
    joinThread(lookups[i])

  # Drain exactly the messages that were sent; stop at the first empty read so
  # no default-valued phantom message is processed.
  var answers : seq[RightsMsg]
  while true:
    let received = endChannel[].tryRecv()
    if not received.dataAvailable:
      break
    answers.add(received.msg)

  # --- record every clip that got an answer ---
  var start = getTotalVideos("videos")
  for answer in answers:
    let (song, artist, index, holders) = answer
    if index < 0 or index > videos.high:
      continue
    var video : videoDetails
    try:
      video = makeVideoDetails(videos[index], song, artist, url, holders)
    except CatchableError, IndexDefect:
      # Best effort: a clip whose details cannot be built is skipped, the
      # rest of the batch is still recorded.
      continue
    if isVideoInDb(videos[index][1]):
      continue
    db.exec(sql"INSERT INTO videos (id, json) VALUES (?, ?)", start, $toJson(video))
    for tag in video.tags:
      # Drop the leading '#'; the remainder is used as the table name.
      # NOTE(review): the name is spliced into the SQL text unquoted, so tags
      # that are not valid SQL identifiers fail here and are only echoed.
      let tableName = tag[1 .. ^1]
      try:
        db.exec(sql(fmt"CREATE TABLE IF NOT EXISTS {tableName} (lookup INTEGER NOT NULL)"))
        db.exec(sql(fmt"INSERT INTO {tableName} (lookup) VALUES (?)"), start)
      except DbError:
        echo tableName
    moveFile(fmt"processing/{index}.mp4", fmt"archive/{start}.mp4")
    echo fmt"processing/{index}.mp4; archive/{start}.mp4"
    start.inc()
#for x in db.fastRows(sql(fmt"SELECT * FROM {tag} WHERE lookup LIKE {rand(total)}")):
proc generateRandomVideos(tag : string, amount : int, start = true, ending = true, routine : IntervalRoutine) : seq[string] =
  ## Picks up to `amount` distinct random clips listed in the table `tag`,
  ## queues them in `concat.txt` and moves their files from `archive/` into
  ## `processing/`.
  ##
  ## * `start`   - true: `concat.txt` is overwritten; false: appended to.
  ## * `ending`  - true: ffmpeg concatenates everything listed in `concat.txt`
  ##   into `output.mp4`, then all files in `processing/` go back to `archive/`.
  ## * `routine` - supplies the song / rights restrictions.
  ##
  ## Returns the chosen `file processing/<id>.mp4` lines; fewer than `amount`
  ## when the restrictions leave too few candidates. Raises `ValueError` when
  ## the table holds fewer than `amount` rows.
  let total = getTotalVideos(tag)
  if amount > total:
    raise newException(ValueError, "Cannot provide enough unique videos for the amount given due to there not being enough videos.")
  # Ids whose files are already staged in processing/ by an earlier call.
  let pending = collect(for entry in walkDir("processing/"): extractFilename(entry.path).split(".")[0])
  echo "going"

  # --- collect every clip that is allowed to be picked ---
  var choices : seq[string]
  for row in db.getAllRows(sql(fmt"SELECT * FROM {tag}")):
    let id = row[0]
    if id in pending:
      continue
    let fileFormat = fmt"file processing/{id}.mp4"
    if fileFormat in choices:
      continue
    if routine.restrictHasSong or routine.restrictRights:
      let stored = db.getAllRows(sql"SELECT json FROM videos WHERE id = ?", id)
      # Without metadata the restrictions cannot be checked; leave the clip out.
      if stored.len == 0:
        continue
      let current = parseJson(stored[^1][0])
      # A song was identified in this clip.
      if routine.restrictHasSong and current["artist"].getStr != "N/A":
        continue
      # The metadata mentions a restricted term.
      if routine.restrictRights:
        let normalized = normalize($current)
        if routine.rightsCriteria.anyIt(it in normalized):
          continue
    choices.add(fileFormat)

  # --- draw without replacement: shuffle, then take the first `amount` ---
  shuffle(choices)
  if choices.len < amount:
    echo "NOT ENOUGH VIDEOS TO SUSTAIN >:O, PICK LESS VIDEOS OR LESS RESTRICTIONS"
  let picked = choices[0 ..< max(0, min(amount, choices.len))]

  let ids = picked.mapIt(it.split("/")[1])
  if start:
    writeFile("concat.txt", picked.join("\n"))
  else:
    let concat = readFile("concat.txt") & "\n"
    writeFile("concat.txt", concat & picked.join("\n"))
  echo ids
  for x in ids:
    echo x
    moveFile(fmt"archive/{x}", fmt"processing/{x}")
  discard execShellCmd("./normalizeFramerate.sh normalize")
  if ending:
    discard execShellCmd("ffmpeg -f concat -i concat.txt -c copy -y output.mp4")
    # List the directory first so files are not moved while it is being walked.
    for entry in toSeq(walkDir("processing/")):
      moveFile(entry.path, "archive/" & extractFilename(entry.path))
  result = picked
proc doRoutine(routine : IntervalRoutine) {.gcsafe.} =
  ## Runs one full cycle: fetch every configured page, build the compilation
  ## from every configured generate entry, then run `./callback.sh`.
  ##
  ## The generate entries share one `concat.txt`:
  ## * first entry  - starts the list (`start = true`)
  ## * in between   - only appends
  ## * last entry   - renders the result (`ending = true`)
  ## A single entry is both first and last.
  for x in routine.fetch:
    getVideosWithSongs(x[0], x[1])
    echo x
  let last = routine.generate.high
  for iteration, current in routine.generate:
    # `current` is the entry for this iteration, so every configured tag
    # contributes its own clips.
    discard generateRandomVideos(current[0], current[1],
                                 iteration == 0, iteration == last, routine)
  discard execShellCmd("./callback.sh")
proc Main() =
  ## Entry point: loads `routine.json`, runs the routine once immediately and
  ## then hands it to the scheduler to repeat on the configured interval.
  echo "Doing what I must because I can :D"
  let config = parseJson(readFile("routine.json"))
  echo "a"
  # Interval components first, then the work lists, then the restrictions.
  let weekCount = config["Weeks"].getInt
  let dayCount = config["Days"].getInt
  let hourCount = config["Hours"].getInt
  let minuteCount = config["Minutes"].getInt
  let secondCount = config["Seconds"].getInt
  let fetchList = collect(for entry in config["Routine"]["Fetch"] : (entry[0].getStr, entry[1].getInt))
  let generateList = collect(for entry in config["Routine"]["Generate"] : (entry[0].getStr, entry[1].getInt))
  let hasSongLimit = config["Routine"]["Restrictions"]["RestrictHasSong"].getBool
  let rightsLimit = config["Routine"]["Restrictions"]["RestrictRights"].getBool
  let criteria = collect(for entry in config["Routine"]["Restrictions"]["RightsCriteria"] : entry.getStr)
  let routine = makeRoutine(weekCount, dayCount, hourCount, minuteCount, secondCount,
                            fetchList, generateList, hasSongLimit, rightsLimit, criteria)
  # The scheduler does not guard against interval components that are 0, so
  # the whole interval is collapsed into a single number of seconds and only
  # that is handed over.
  let period = initDuration(weeks = routine.weeks, days = routine.days,
                            hours = routine.hours, minutes = routine.minutes,
                            seconds = routine.seconds)
  let periodSecs = period.inSeconds()
  doRoutine(routine)
  schedules:
    every(seconds=int(periodSecs), async=true):
      doRoutine(routine)

Main()