Pre-Documentation!

This commit is contained in:
user 2024-11-23 05:01:29 -05:00
parent 9c893059e5
commit 2c8dffa147
21 changed files with 110 additions and 1327 deletions

@ -0,0 +1,4 @@
#!/bin/bash
nimble install nimquery itertools
nim c -d:ssl -d:danger --out:scrape ../src/nim/scraping/scrape.nim
nim c -d:ssl -d:danger --out:server ../src/nim/searching/server

@ -5,35 +5,25 @@ proxyConfig:
ddosPreventionConfig:
# 0 for no ddos-prevention, use at your own moral and legal risk
# The scraper works according to the diagram in the project's README.
# Each worker thread downloads one forum thread at a time, page by page.
# So, if you have 16 threads with a page wait of 1000 ms, it will download roughly 16 pages every second.
# That is, assuming the workers stay in sync, which they will not.
timeBetweenForumThreadsMs: 1000
timeBetweenPagesMs: 200
threadingConfig:
enabled : true
# Just use how many threads your cpu has.
enabled: true
# Just use how many threads your cpu has.
# If your cpu has 12 cores and 24 threads available, it will use 24 threads
# See "Bottlenecks & Optimization" in the README for a guide on setting these vars if unsure.
useWorkerPerThread : true
useWorkerPerThread: true
# Ignored if useWorkerPerThread is true.
workerCount : -1
workerCount: -1
databaseConnectionsConfig:
# Either 'SQLLite' or 'PostgresSql'
databaseType : 'PostgresSql'
postgresConn:
host : "127.0.0.1"
port : 12345
user : ""
password : ""
database : ""
sqliteConn:
filepath : "./ArlongPark.db"
user : ""
password : ""
database : ""
databaseConfig:
filepath: "./ArlongPark.db"
user: ""
password: ""
database: ""

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,20 @@
UPDATE subpost
SET hasreply = 'false';
UPDATE subpost
SET hasreply = 'true'
WHERE postid IN (SELECT DISTINCT replypostid
FROM subpost
WHERE isreply = 'true');
UPDATE SubPost SET CreationTime = strftime('%s', CreationTime);
UPDATE MacroPost SET CreationTime = strftime('%s', CreationTime);
CREATE VIRTUAL TABLE TextSearch USING fts5(
PostId,
NonQuotedText
);
INSERT INTO TextSearch SELECT PostId, NonQuotedText FROM SubPost;
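
Once TextSearch is populated, lookups go through fts5's MATCH operator. A minimal Nim sketch of such a query (the database path and search term are illustrative only), using the same db_connector/db_sqlite bindings the project already imports:

import db_connector/db_sqlite

let db = open("ArlongPark.db", "", "", "")
# MATCH returns the rows whose NonQuotedText contains the term;
# the PostId column links each hit back to SubPost.
for row in db.fastRows(
    sql"SELECT PostId FROM TextSearch WHERE TextSearch MATCH ?", "nakama"):
  echo row[0]
db.close()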

execenv/refineDb.sh Executable file

@ -0,0 +1,4 @@
#!/bin/bash
sqlite3 "$(yq -r .databaseConfig.filepath config.yaml)" < refineDatabase.sql
nim c -r -d:ssl -d:danger ../src/nim/scraping/organizeByChapters.nim

BIN
execenv/scrape Executable file

Binary file not shown.

BIN
execenv/server Executable file

Binary file not shown.

@ -13,7 +13,7 @@ using System.Reflection;
[assembly: System.Reflection.AssemblyCompanyAttribute("c#")]
[assembly: System.Reflection.AssemblyConfigurationAttribute("Debug")]
[assembly: System.Reflection.AssemblyFileVersionAttribute("1.0.0.0")]
[assembly: System.Reflection.AssemblyInformationalVersionAttribute("1.0.0+0f22caaf9017f951f1792ff84989edac394c32ec")]
[assembly: System.Reflection.AssemblyInformationalVersionAttribute("1.0.0+9c893059e5b41c00de07a5f0ea31dbdda8f77a8c")]
[assembly: System.Reflection.AssemblyProductAttribute("c#")]
[assembly: System.Reflection.AssemblyTitleAttribute("c#")]
[assembly: System.Reflection.AssemblyVersionAttribute("1.0.0.0")]

@ -1 +1 @@
814961062ebbe87d1f28b738cbff65402953509f5edff671e61947676b9a4dc8
d31dc13bf6f786eb359f92856930ccd0a0169f5f3d3ecb37c8b21c76fc24ff73

@ -38,6 +38,10 @@ CREATE TABLE SubPost(
FOREIGN KEY(ReplyPostId) REFERENCES SubPost(PostId)
);
CREATE INDEX idx_usernames
ON SubPost (submitter);
@ -52,10 +56,3 @@ ON SubPost (CreationTime);
CREATE INDEX idx_subpost_Submitter
ON SubPost (Submitter);
CREATE VIRTUAL TABLE NonQuotedText USING fts5(
PostId String not null,
NonQuotedText String not null
);
INSERT INTO TextSearch select PostId, NonQuotedText from Subpost

@ -14,6 +14,10 @@
<body>
<main>
<div id="top">
<div hidden id="warning">
<p>There was an exception in contacting / parsing the output of the api.</p>
<p id="specific-warning"></p>
</div>
<form onsubmit="processForm(event)">
<select id="search-type">
<option value="sub">Substring In Post Body</option>
@ -318,7 +322,6 @@
}
.user-result table {
align-items: center;
text-align: center;
@ -355,6 +358,10 @@
margin-left : 4vw;
color: grey;
}
#warning {
background-color: red;
color: white;
}
</style>
@ -364,7 +371,6 @@
<script id="chart-lib">
let chart = null;
function downSample(list, interval) {
let i = 0;
let counter = interval;
@ -525,6 +531,7 @@
</script>
<script id="lib-layout">
const endPoint = "http://localhost:5000";
let order = "Oldest";
function setUrl(params) {
const url = new URL(window.location.href);
@ -538,15 +545,24 @@
}
}
async function doApiCall(path, params) {
const apiLocation = new URL("http://localhost:5000");
Object.entries(params).forEach(([key, value]) => {
apiLocation.searchParams.set(key, value);
});
apiLocation.pathname = path;
const queryResponse = await fetch(apiLocation);
const qObject = await queryResponse.json();
responseObject = qObject;
return qObject;
try {
    const apiLocation = new URL(endPoint);
    Object.entries(params).forEach(([key, value]) => {
        apiLocation.searchParams.set(key, value);
    });
    apiLocation.pathname = path;
    const queryResponse = await fetch(apiLocation);
    const qObject = await queryResponse.json();
    responseObject = qObject;
    return qObject;
}
catch (exception) {
    // Surface the hidden warning banner with the failure details.
    const warning = document.querySelector("#warning");
    const specificWarning = document.querySelector("#specific-warning");
    warning.removeAttribute("hidden");
    specificWarning.textContent = exception;
}
}
function setToggle(option, input) {
@ -581,7 +597,7 @@
const limit = document.querySelector("#max-results");
orderType.addEventListener("change", (event) => {
const val = event.target.value;
page = 1;
changePage(1);
if (val == order) {
return;
@ -793,7 +809,7 @@
const totalPages = document.querySelector("#num-of-pages");
totalPages.textContent = numOfPages;
if (params != undefined && params.page == null) {
if (params != undefined && params.page == null && setPageTo1 == true) {
changePage(1, false);
}
else if (
@ -810,7 +826,6 @@
queryBody.subposts = queryBody.subposts.reverse();
order = "Newest";
}
console.log(queryBody)
queryBody.subposts
.slice((page - 1) * pageSize, page * pageSize)
.forEach((x) => {
@ -826,7 +841,6 @@
const viewMetaButton = cloned.querySelector("#view-meta");
let postidData = document.createElement("td")
console.log(x)
postidData.textContent = x.postId
let quotedLength = document.createElement("td")
quotedLength.textContent = x.quotedText.length
@ -934,8 +948,9 @@
cloned.setAttribute("id", `id-${x.postId}`);
if (x.hasReply || x.isReply) {
const button = document.createElement("button");
button.addEventListener("click", (x) => {
button.addEventListener("click", (event) => {
const obj = {
queryType: "replyChain",
postId: x.postId,
@ -1149,6 +1164,8 @@
break;
}
} else {
document.querySelector("#left-side").removeAttribute('hidden')
document.querySelector("#right-side").removeAttribute("hidden")
defaultSearch();
}
}

@ -9,35 +9,22 @@ type
timeBetweenForumThreadsMs = 0
timeBetweenPagesMs = 0
Threading* = object
enabled* : bool
useWorkerPerThread* : bool
workerCount* : int
DatabaseType = enum
SQLLite, PostgresSql
PostgresConn* = object
host* : string
port* : int
user* : string
password* : string
database* : string
SQLLiteConn* = object
filepath* : string
user* : string
password* : string
database* : string
DatabaseConnConfig* = object
databaseType : DatabaseType
postgresConn* : PostgresConn
sqliteConn* : SQLLiteConn
ScraperConfig* = object
proxyConfig* : ProxyConfig
ddosPreventionConfig* : DDOSPrevention
threadingConfig* : Threading
databaseConnectionsConfig* : DatabaseConnConfig
databaseConfig* : SQLLiteConn
proc initConfig*(path : string) : ScraperConfig =
doAssert(fileExists(path), "config does not exist")
let configRaw = readFile(path)
load(configRaw, result)
print initConfig("./config.yaml")
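
For reference, a minimal sketch (not from the commit) of how the loaded config is consumed, using only the exported fields declared above:

# Load once, then read plain fields; NimYAML's `load` fills the
# ScraperConfig object straight from the YAML document.
let cfg = initConfig("./config.yaml")
echo cfg.databaseConfig.filepath            # "./ArlongPark.db" by default
echo cfg.threadingConfig.useWorkerPerThread # drives the worker count in scrape.nim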

Binary file not shown.

@ -1,66 +0,0 @@
import sequtils
import db_connector/db_sqlite
import nimlevenshtein
import strutils
import tables
import algorithm
let db = open("../../../execenv/ArlongPark.db", "", "", "")
proc getReplyToStart(db : DbConn, a : var seq[string]) =
var last = a[^1]
let parent = db.getRow(sql"""select postId, replyPostId, isReply from SubPost where PostId = ?""", last)
if parent[0] == "":
return
let isReply = parseBool parent[2]
if isReply:
a.add(parent[1])
getReplyToStart(db, a)
proc getReplyToEnd(db : DbConn, a : var seq[string]) =
var last = a[^1]
let parent = db.getRow(sql"""select postId, replyPostId, hasReply from SubPost where replyPostId = ?""", last)
if parent[0] == "":
return
let hasReply = parseBool parent[2]
a.add(parent[0])
if hasReply:
getReplyToEnd(db, a)
var result = newTable[string, seq[string]]()
for x in db.fastRows(sql"select postId, replyPostId, isReply, hasReply from SubPost where isReply = 'true' or hasReply = 'true'"):
let isReply = parseBool x[2]
let hasReply = parseBool x[3]
var originChain : seq[string]
var childChain : seq[string]
if isReply:
var chain = @[x[1]]
getReplyToStart(db, chain)
originChain = chain
originChain.reverse()
if hasReply:
var chain = @[x[0]]
getReplyToEnd(db, chain)
childChain = chain
let origin =
if isReply:
originChain[0]
else:
x[0]
let endPostId =
if hasReply:
childChain[^1]
else:
x[0]
echo (origin, endPostId)
echo "=="

Binary file not shown.

@ -8,6 +8,8 @@ import times
import strutils
import algorithm
import strformat
import config
import db_connector/db_sqlite
var chapterDate : Table[int, int64]
var dateChapter : Table[int64, int]
@ -27,7 +29,11 @@ for x in 1 .. one.high:
releaseDates.add(relaseDateTime)
releaseDates.sort()
let db = open("../../../execenv/ArlongPark.db", "", "", "")
var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
scraperConfig[] = initConfig("./config.yaml")
let db = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database )
proc getChapter(time : int) : int =
for x in 0 .. releaseDates.high:
@ -37,15 +43,17 @@ proc getChapter(time : int) : int =
raise new Exception
db.exec(sql"BEGIN TRANSACTION")
for x in db.fastRows(sql"select PostId, unixepoch(CreationTime), Chapter from SubPost;"):
for x in db.fastRows(sql"select PostId, CreationTime, Chapter from SubPost;"):
let rowid = x[0]
let time = parseInt(x[1])
echo (time, rowid)
let chapter = getChapter(time)
db.exec(sql"UPDATE SubPost SET CHAPTER = ? WHERE PostId = ?;", chapter, rowid)
db.exec(sql"commit")
db.exec(sql"BEGIN TRANSACTION")
for x in db.fastRows(sql"select PostId, unixepoch(CreationTime), Chapter from MacroPost;"):
for x in db.fastRows(sql"select PostId, CreationTime, Chapter from MacroPost;"):
let rowid = x[0]
let time = parseInt(x[1])
let chapter = getChapter(time)
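
The context lines above elide most of getChapter's body. A hypothetical reconstruction of the lookup it appears to perform (assuming releaseDates is the sorted seq of release timestamps and dateChapter the timestamp-to-chapter table built earlier in the file; this is a sketch, not the commit's exact code):

# Walk release dates newest-first and return the chapter of the
# latest release at or before the post's timestamp.
proc getChapterSketch(time: int): int =
  for x in countdown(releaseDates.high, 0):
    if releaseDates[x] <= time:
      return dateChapter[releaseDates[x]]
  raise newException(ValueError, "time predates the first known chapter")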

@ -3,18 +3,18 @@ import itertools
import nimquery
import db_connector/[db_sqlite, db_postgres]
import config
import std/cpuinfo
randomize()
let writeDb = db_sqlite.open("ArlongPark.db", "", "", "")
let readDb = db_sqlite.open("ArlongPark.db", "", "", "")
var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
scraperConfig[] = initConfig("./config.yaml")
let writeDb = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database )
let readDb = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database )
writeDb.exec(sql"PRAGMA journal_mode=WAL;")
var dbLock : Lock
initLock(dbLock)
let postgresChannel = createShared(Channel[SqlQuery], sizeof(Channel[SqlQuery]))
var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
scraperConfig[] = initConfig("./config.yaml")
type
Stats = object
@ -44,17 +44,6 @@ type
Inaccessible = ref object of HttpRequestError
FailedDownload = ref object of HttpRequestError
proc postgresWorker() {.thread.} =
let databaseConfig = scraperConfig[].databaseConnectionsConfig.postgresConn
let db = db_postgres.open("", databaseConfig.user, databaseConfig.password,
&"host={databaseConfig.host} port={databaseConfig.port} dbname={databaseConfig.database}")
while true:
let peek = postgresChannel[].peek
doAssert(peek != -1, "Somehow channel closed. This is not expected behavior.")
for x in 0 .. peek:
let toExecute = postgresChannel[].recv()
db.exec(toExecute)
sleep 10
proc getRandomProxy() : Proxy =
let proxies = readFile("proxies.txt").split("\n").map(x => x.split(":"))
@ -282,11 +271,14 @@ proc downloadPagesWrapper(input : seq[string]) =
proc downloadFromDatabase*() =
let perSeconds = 0
echo "1!"
let cpus = 2 #countProcessors()
let cpus =
if scraperConfig[].threadingConfig.useWorkerPerThread:
countProcessors()
else:
scraperConfig[].threadingConfig.workerCount
doAssert(cpus >= 1, "Work cannot get done with less than 1 worker")
var threads = newSeq[Thread[seq[string]]](cpus)
let toDownload = readDb.getAllRows(sql"select * from ToDownload where Completed = false and inaccessible = false ORDER BY RANDOM()").map(x=> x[0])
echo "2!"
let threadedChunks = toDownload.distribute(cpus)
for i in 0 .. threads.high:
echo i
@ -294,4 +286,5 @@ proc downloadFromDatabase*() =
sleep perSeconds*1000
joinThreads(threads)
generateToDownload()
downloadFromDatabase()
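
The fan-out above leans on sequtils.distribute to split the pending URL list into one chunk per worker thread. A standalone Nim illustration of how distribute divides work (the values are made up):

import sequtils

# distribute splits a seq into n roughly equal, ordered sub-seqs,
# which is how each Thread[seq[string]] receives its share of pages.
let pending = @["p1", "p2", "p3", "p4", "p5"]
echo pending.distribute(2)   # @[@["p1", "p2", "p3"], @["p4", "p5"]]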

@ -14,6 +14,9 @@ import std/monotimes
import sugar
import strformat
import nimlevenshtein
import ../scraping/config
var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
scraperConfig[] = initConfig("./config.yaml")
type SubPostResponse = object
queryTime : int
@ -121,8 +124,8 @@ proc escapeString(a : string) : string =
result = result.replace(""""""", """""""")
proc controllerInit*() : ControllerData =
result.db = open("../../../execenv/ArlongPark.db", "", "", "")
result.db.exec(sql"PRAGMA query_only = boolean;")
result.db = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database )
result.db.exec(sql"PRAGMA query_only = true;")
var headers = newTable[string, string]()
headers["Content-Type"] = "application/json"
headers["Access-Control-Allow-Origin"] = "*"