Pre-Documentation!
parent: 9c893059e5
commit: 2c8dffa147

21 changed files with 110 additions and 1327 deletions
Changed directories: execenv, src, c#/obj/Debug/net8.0, database/schema, frontend, nim

execenv/compilescrapeandserver.sh (new executable file, +4)
@@ -0,0 +1,4 @@
+#!/bin/bash
+nimble install nimquery itertools
+nim c -d:ssl -d:danger --out:scrape ../src/nim/scraping/scrape.nim
+nim c -d:ssl -d:danger --out:server ../src/nim/searching/server
@@ -5,35 +5,25 @@ proxyConfig:
 
 ddosPreventionConfig:
     # 0 for no ddos-prevention, use at your own moral and legal risk
 
     # The scraper works according to the diagram in the project's README.
    # For each thread, it will simultaneously download, by each forum thread.
     # So, if you have 16 threads with a page wait of 1000, every second it will download 16 pages.
     # That is, assuming it is in sync, which it will not be.
     timeBetweenForumThreadsMs: 1000
     timeBetweenPagesMs: 200
 
 threadingConfig:
-    enabled : true
+    enabled: true
     # Just use how many threads your cpu has.
+    # If your cpu has 12 cores and 24 threads available, it will use 24 threads
+    # See "Bottlenecks & Optimization" in the README for a guide on setting these vars if unsure.
-    useWorkerPerThread : true
+    useWorkerPerThread: true
     # Ignored if useWorkerPerThread is true.
-    workerCount : -1
+    workerCount: -1
 
-databaseConnectionsConfig:
-    # Either 'SQLLite' or "PostgresSql"
-    databaseType : 'PostgresSql'
-    postgresConn:
-        host : "127.0.0.1"
-        port : 12345
-        user : ""
-        password : ""
-        database : ""
-
-    sqliteConn:
-        filepath : "./ArlongPark.db"
-        user : ""
-        password : ""
-        database : ""
+databaseConfig:
+    filepath: "./ArlongPark.db"
+    user: ""
+    password: ""
+    database: ""
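
The throttling comment above implies a simple rate calculation. A minimal Nim sketch of that arithmetic, using the example values from the comment (the variable names are illustrative, not the scraper's own):

    # Illustration of the comment above: 16 worker threads, each waiting
    # 1000 ms between pages, gives roughly 16 page downloads per second.
    let workerThreads = 16          # hypothetical thread count
    let timeBetweenPagesMs = 1000   # per-thread wait between page fetches
    let pagesPerSecond = workerThreads * 1000 div timeBetweenPagesMs
    echo pagesPerSecond             # 16, assuming the threads stay in sync (they won't)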

execenv/output (587 changes): file diff suppressed because one or more lines are too long

execenv/output2 (587 changes): file diff suppressed because one or more lines are too long

execenv/refineDatabase.sql (new file, +20)
@@ -0,0 +1,20 @@
+UPDATE subpost
+SET hasreply = 'false';
+
+UPDATE subpost
+SET hasreply = 'true'
+WHERE postid IN (SELECT DISTINCT replypostid
+                 FROM subpost
+                 WHERE isreply = 'true');
+
+
+UPDATE SubPost SET CreationTime = strftime('%s', CreationTime);
+
+UPDATE MacroPost SET CreationTime = strftime('%s', CreationTime);
+
+CREATE VIRTUAL TABLE TextSearch USING fts5(
+    PostId,
+    NonQuotedText
+);
+
+INSERT INTO TextSearch select PostId, NonQuotedText from Subpost
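
Once the TextSearch table is built, it can be queried with FTS5 MATCH. A minimal sketch using db_connector/db_sqlite (the database path matches the default in config.yaml; the search term is made up):

    import db_connector/db_sqlite

    # Full-text search against the TextSearch table created by refineDatabase.sql.
    let db = open("./ArlongPark.db", "", "", "")
    for row in db.fastRows(sql"SELECT PostId FROM TextSearch WHERE TextSearch MATCH ?", "grand line"):
      echo row[0]
    db.close()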

execenv/refineDb.sh (new executable file, +4)
@@ -0,0 +1,4 @@
+#!/bin/bash
+cat refineDatabase.sql | sqlite3 $(cat config.yaml | yq -r .databaseConfig.filepath)
+
+nim c -r -d:ssl -d:danger ../src/nim/scraping/organizeByChapters.nim

BIN execenv/scrape (new executable file, binary file not shown)

BIN execenv/server (new executable file, binary file not shown)
@@ -13,7 +13,7 @@ using System.Reflection;
 [assembly: System.Reflection.AssemblyCompanyAttribute("c#")]
 [assembly: System.Reflection.AssemblyConfigurationAttribute("Debug")]
 [assembly: System.Reflection.AssemblyFileVersionAttribute("1.0.0.0")]
-[assembly: System.Reflection.AssemblyInformationalVersionAttribute("1.0.0+0f22caaf9017f951f1792ff84989edac394c32ec")]
+[assembly: System.Reflection.AssemblyInformationalVersionAttribute("1.0.0+9c893059e5b41c00de07a5f0ea31dbdda8f77a8c")]
 [assembly: System.Reflection.AssemblyProductAttribute("c#")]
 [assembly: System.Reflection.AssemblyTitleAttribute("c#")]
 [assembly: System.Reflection.AssemblyVersionAttribute("1.0.0.0")]
@@ -1 +1 @@
-814961062ebbe87d1f28b738cbff65402953509f5edff671e61947676b9a4dc8
+d31dc13bf6f786eb359f92856930ccd0a0169f5f3d3ecb37c8b21c76fc24ff73
@@ -38,6 +38,10 @@ CREATE TABLE SubPost(
 FOREIGN KEY(ReplyPostId) REFERENCES SubPost(PostId)
 );
 
 
+
+
+CREATE INDEX idx_usernames
+ON SubPost (submitter);
 
 

@@ -52,10 +56,3 @@ ON SubPost (CreationTime);
 
 CREATE INDEX idx_subpost_Submitter
 ON SubPost (Submitter);
-
-CREATE VIRTUAL TABLE NonQuotedText USING fts5(
-    PostId String not null,
-    NonQuotedText String not null
-);
-
-INSERT INTO TextSearch select PostId, NonQuotedText from Subpost
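
As a sanity check that the new index is actually picked up, SQLite can be asked for its query plan. A small sketch (the database path and username are placeholders):

    import db_connector/db_sqlite

    # Expect the plan to mention idx_usernames (or the older idx_subpost_Submitter).
    let db = open("./ArlongPark.db", "", "", "")
    for row in db.fastRows(sql"EXPLAIN QUERY PLAN SELECT * FROM SubPost WHERE Submitter = ?", "some_user"):
      echo row
    db.close()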
@@ -14,6 +14,10 @@
 <body>
     <main>
         <div id="top">
+            <div hidden id="warning" >
+                <p>There was an exception in contacting / parsing the output of the api.</p>
+                <p id="specific-warning"></p>
+            </div>
             <form onsubmit="processForm(event)">
                 <select id="search-type">
                     <option value="sub">Substring In Post Body</option>
@@ -318,7 +322,6 @@
 
 }
 
-
 .user-result table {
     align-items: center;
     text-align: center;
@@ -355,6 +358,10 @@
     margin-left : 4vw;
     color: grey;
 }
+#warning {
+    background-color: red;
+    color:white
+}
 
 </style>
 
@@ -364,7 +371,6 @@
 
 <script id="chart-lib">
     let chart = null;
-
     function downSample(list, interval) {
         let i = 0;
         let counter = interval;
@@ -525,6 +531,7 @@
 </script>
 
 <script id="lib-layout">
+    const endPoint = "http://localhost:5000"
     let order = "Oldest";
     function setUrl(params) {
         const url = new URL(window.location.href);
@@ -538,15 +545,24 @@
         }
     }
     async function doApiCall(path, params) {
-        const apiLocation = new URL("http://localhost:5000");
-        Object.entries(params).forEach(([key, value]) => {
-            apiLocation.searchParams.set(key, value);
-        });
-        apiLocation.pathname = path;
-        const queryResponse = await fetch(apiLocation);
-        const qObject = await queryResponse.json();
-        responseObject = qObject;
-        return qObject;
+        try{
+            const apiLocation = new URL(endPoint);
+            Object.entries(params).forEach(([key, value]) => {
+                apiLocation.searchParams.set(key, value);
+            });
+            apiLocation.pathname = path;
+            const queryResponse = await fetch(apiLocation);
+            const qObject = await queryResponse.json();
+            responseObject = qObject;
+            return qObject;
+        }
+        catch(exception){
+            const error = document.querySelector("#warning");
+            const specificWarning = document.querySelector("#specific-warning");
+
+            warning.removeAttribute("hidden")
+            specificWarning.textContent = exception
+        }
     }
 
     function setToggle(option, input) {
@@ -581,7 +597,7 @@
     const limit = document.querySelector("#max-results");
     orderType.addEventListener("change", (event) => {
         const val = event.target.value;
-        page = 1;
+        changePage(1)
 
         if (val == order) {
             return;
@@ -793,7 +809,7 @@
     const totalPages = document.querySelector("#num-of-pages");
     totalPages.textContent = numOfPages;
 
-    if (params != undefined && params.page == null) {
+    if (params != undefined && params.page == null && setPageTo1 == true) {
         changePage(1, false);
     }
     else if (
@@ -810,7 +826,6 @@
         queryBody.subposts = queryBody.subposts.reverse();
         order = "Newest";
     }
-    console.log(queryBody)
     queryBody.subposts
         .slice((page - 1) * pageSize, page * pageSize)
         .forEach((x) => {
@@ -826,7 +841,6 @@
     const viewMetaButton = cloned.querySelector("#view-meta");
 
     let postidData = document.createElement("td")
-    console.log(x)
     postidData.textContent = x.postId
     let quotedLength = document.createElement("td")
     quotedLength.textContent = x.quotedText.length
@@ -934,8 +948,9 @@
 
     cloned.setAttribute("id", `id-${x.postId}`);
     if (x.hasReply || x.isReply) {
+
         const button = document.createElement("button");
-        button.addEventListener("click", (x) => {
+        button.addEventListener("click", (event) => {
             const obj = {
                 queryType: "replyChain",
                 postId: x.postId,
@@ -1149,6 +1164,8 @@
             break;
         }
     } else {
+        document.querySelector("#left-side").removeAttribute('hidden')
+        document.querySelector("#right-side").removeAttribute("hidden")
         defaultSearch();
     }
 }
@@ -9,35 +9,22 @@ type
         timeBetweenForumThreadsMs = 0
         timeBetweenPagesMs = 0
     Threading* = object
         enabled* : bool
         useWorkerPerThread* : bool
         workerCount* : int
-    DatabaseType = enum
-        SQLLite, PostgresSql
-    PostgresConn* = object
-        host* : string
-        port* : int
-        user* : string
-        password* : string
-        database* : string
     SQLLiteConn* = object
         filepath* : string
         user* : string
         password* : string
         database* : string
-    DatabaseConnConfig* = object
-        databaseType : DatabaseType
-        postgresConn* : PostgresConn
-        sqliteConn* : SQLLiteConn
 
     ScraperConfig* = object
         proxyConfig* : ProxyConfig
         ddosPreventionConfig* : DDOSPrevention
         threadingConfig* : Threading
-        databaseConnectionsConfig* : DatabaseConnConfig
+        databaseConfig* : SQLLiteConn
 
 proc initConfig*(path : string) : ScraperConfig =
     doAssert(fileExists(path), "config does not exist")
     let configRaw = readFile(path)
     load(configRaw, result)
 print initConfig("./config.yaml")
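
For reference, a minimal usage sketch of the loader above (assuming the module is importable as config and that ./config.yaml matches the shape shown earlier):

    import config

    # Load the YAML config and read a couple of fields from the typed object.
    let cfg = initConfig("./config.yaml")
    echo cfg.threadingConfig.useWorkerPerThread
    echo cfg.databaseConfig.filepath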
BIN (binary file not shown)

@@ -1,66 +0,0 @@
-import sequtils
-import db_connector/db_sqlite
-import nimlevenshtein
-import strutils
-import tables
-import algorithm
-let db = open("../../../execenv/ArlongPark.db", "", "", "")
-
-proc getReplyToStart(db : DbConn, a : var seq[string]) =
-    var last = a[^1]
-    let parent = db.getRow(sql"""select postId, replyPostId, isReply from SubPost where PostId = ?""", last)
-
-    if parent[0] == "":
-        return
-
-    let isReply = parseBool parent[2]
-
-    if isReply:
-        a.add(parent[1])
-        getReplyToStart(db, a)
-
-proc getReplyToEnd(db : DbConn, a : var seq[string]) =
-    var last = a[^1]
-    let parent = db.getRow(sql"""select postId, replyPostId, hasReply from SubPost where replyPostId = ?""", last)
-
-    if parent[0] == "":
-        return
-
-    let hasReply = parseBool parent[2]
-    a.add(parent[0])
-
-    if hasReply:
-        getReplyToEnd(db, a)
-
-var result = newTable[string, seq[string]]()
-for x in db.fastRows(sql"select postId, replyPostId, isReply, hasReply from SubPost where isReply = 'true' or hasReply = 'true'"):
-    let isReply = parseBool x[2]
-    let hasReply = parseBool x[3]
-
-    var originChain : seq[string]
-    var childChain : seq[string]
-    if isReply:
-        var chain = @[x[1]]
-        getReplyToStart(db, chain)
-        originChain = chain
-        originChain.reverse()
-    if hasReply:
-        var chain = @[x[0]]
-        getReplyToEnd(db, chain)
-        childChain = chain
-
-    let origin =
-        if isReply:
-            originChain[0]
-        else:
-            x[0]
-    let endPostId =
-        if hasReply:
-            childChain[^1]
-        else:
-            x[0]
-    echo (origin, endPostId)
-
-
-
-    echo "=="

BIN src/nim/scraping/organizeByChapters (new executable file, binary file not shown)
@@ -8,6 +8,8 @@ import times
 import strutils
 import algorithm
+import strformat
+import config
 
 import db_connector/db_sqlite
 var chapterDate : Table[int, int64]
 var dateChapter : Table[int64, int]
@@ -27,7 +29,11 @@ for x in 1 .. one.high:
     releaseDates.add(relaseDateTime)
 releaseDates.sort()
 
-let db = open("../../../execenv/ArlongPark.db", "", "", "")
+var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
+scraperConfig[] = initConfig("./config.yaml")
+
+let db = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database)
+
 
 proc getChapter(time : int) : int =
     for x in 0 .. releaseDates.high:
@@ -37,15 +43,17 @@ proc getChapter(time : int) : int =
     raise new Exception
 
 db.exec(sql"BEGIN TRANSACTION")
-for x in db.fastRows(sql"select PostId, unixepoch(CreationTime), Chapter from SubPost;"):
+for x in db.fastRows(sql"select PostId, CreationTime, Chapter from SubPost;"):
     let rowid = x[0]
     let time = parseInt(x[1])
+    echo (time, rowid)
+
     let chapter = getChapter(time)
     db.exec(sql"UPDATE SubPost SET CHAPTER = ? WHERE PostId = ?;", chapter, rowid)
 db.exec(sql"commit")
 
 db.exec(sql"BEGIN TRANSACTION")
-for x in db.fastRows(sql"select PostId, unixepoch(CreationTime), Chapter from MacroPost;"):
+for x in db.fastRows(sql"select PostId, CreationTime, Chapter from MacroPost;"):
     let rowid = x[0]
     let time = parseInt(x[1])
     let chapter = getChapter(time)
@@ -3,18 +3,18 @@ import itertools
 import nimquery
 import db_connector/[db_sqlite, db_postgres]
 import config
 import std/cpuinfo
 
 randomize()
 
-let writeDb = db_sqlite.open("ArlongPark.db", "", "", "")
-let readDb = db_sqlite.open("ArlongPark.db", "", "", "")
+var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
+scraperConfig[] = initConfig("./config.yaml")
+let writeDb = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database)
+let readDb = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database)
 writeDb.exec(sql"PRAGMA journal_mode=WAL;")
 
 var dbLock : Lock
 initLock(dbLock)
 
 let postgresChannel = createShared(Channel[SqlQuery], sizeof(Channel[SqlQuery]))
-var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
-scraperConfig[] = initConfig("./config.yaml")
 
 type
     Stats = object
@@ -44,17 +44,6 @@ type
     Inaccessible = ref object of HttpRequestError
     FailedDownload = ref object of HttpRequestError
 
-proc postgresWorker() {.thread.} =
-    let databaseConfig = scraperConfig[].databaseConnectionsConfig.postgresConn
-    let db = db_postgres.open("", databaseConfig.user, databaseConfig.password,
-        &"host={databaseConfig.host} port={databaseConfig.port} dbname={databaseConfig.database}")
-    while true:
-        let peek = postgresChannel[].peek
-        doAssert(peek != -1, "Somehow channel closed. This is not expected behavior.")
-        for x in 0 .. peek:
-            let toExecute = postgresChannel[].recv()
-            db.exec(toExecute)
-        sleep 10
-
 proc getRandomProxy() : Proxy =
     let proxies = readFile("proxies.txt").split("\n").map(x => x.split(":"))
@@ -282,11 +271,14 @@ proc downloadPagesWrapper(input : seq[string]) =
 
 proc downloadFromDatabase*() =
     let perSeconds = 0
-    echo "1!"
-    let cpus = 2 #countProcessors()
+    let cpus =
+        if scraperConfig[].threadingConfig.useWorkerPerThread:
+            countProcessors()
+        else:
+            scraperConfig[].threadingConfig.workerCount
+    doAssert(cpus >= 1, "Work cannot get done with less than 1 worker")
     var threads = newSeq[Thread[seq[string]]](cpus)
     let toDownload = readDb.getAllRows(sql"select * from ToDownload where Completed = false and inaccessible = false ORDER BY RANDOM()").map(x=> x[0])
-    echo "2!"
     let threadedChunks = toDownload.distribute(cpus)
     for i in 0 .. threads.high:
         echo i
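
The cpus branch above is the config-driven replacement for the hardcoded 2. In isolation it amounts to a small helper like this (a hypothetical function, not part of the commit):

    import std/cpuinfo

    # One worker per hardware thread when requested, otherwise the configured count.
    proc resolveWorkerCount(useWorkerPerThread: bool, workerCount: int): int =
      if useWorkerPerThread: countProcessors() else: workerCount

    doAssert resolveWorkerCount(false, 8) == 8
    echo resolveWorkerCount(true, -1)  # prints this machine's logical CPU count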
@@ -294,4 +286,5 @@ proc downloadFromDatabase*() =
         sleep perSeconds*1000
     joinThreads(threads)
 
+generateToDownload()
 downloadFromDatabase()
@@ -14,6 +14,9 @@ import std/monotimes
 import sugar
 import strformat
 import nimlevenshtein
+import ../scraping/config
+var scraperConfig = createShared(ScraperConfig, sizeof(ScraperConfig))
+scraperConfig[] = initConfig("./config.yaml")
 
 type SubPostResponse = object
     queryTime : int
@@ -121,8 +124,8 @@ proc escapeString(a : string) : string =
     result = result.replace(""""""", """""""")
 
 proc controllerInit*() : ControllerData =
-    result.db = open("../../../execenv/ArlongPark.db", "", "", "")
-    result.db.exec(sql"PRAGMA query_only = boolean;")
+    result.db = db_sqlite.open(scraperConfig[].databaseConfig.filepath, scraperConfig[].databaseConfig.user, scraperConfig[].databaseConfig.password, scraperConfig[].databaseConfig.database)
+    result.db.exec(sql"PRAGMA query_only = true;")
     var headers = newTable[string, string]()
     headers["Content-Type"] = "application/json"
     headers["Access-Control-Allow-Origin"] = "*"