# (file-viewer metadata stripped during extraction: 334 lines, 13 KiB, Nim)
import tiny_sqlite
|
|
import sequtils
|
|
import sugar
|
|
import tables
|
|
import times
|
|
import algorithm
|
|
import strutils
|
|
import stats
|
|
import sets
|
|
import std/enumerate
|
|
import nimSHA2
|
|
import ./main
|
|
import ./scoring
|
|
import strformat
|
|
import math
|
|
|
|
## A lot of this file is now deprecated due to a different algorithm being introduced.
## However, should the other algorithm be reintroduced, this will be beneficial,
## and so it will continue to be in the codebase.
|
|
|
|
type
  # A candidate model's heritable state: per-language fitness scores and
  # per-language vocabularies (Score comes from ./scoring).
  GeneticBase* = object
    scoreGene* : Table[string, Score]            # language -> Score for that language
    wordsGene* : Table[string, HashSet[string]]  # language -> set of words in the model

  # Wall-clock bounds of one training generation. int64 timestamps; the
  # exact unit/epoch is chosen by the caller -- not shown in this file.
  Generation* = object
    startTime* : int64
    endTime* : int64
|
|
|
|
proc wordsAndScoreToGene*(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
  ## Bundles a vocabulary table and a score table into a single GeneticBase.
  GeneticBase(wordsGene: a, scoreGene: b)
|
|
|
|
proc tallyScores*(a : MacroScore) : float =
  ## Collapses a MacroScore into a single fitness number: 80% failure
  ## statistics, 20% word-utilization statistics (weights on the last line).
  # Biggest portion, how successful it is
  var successScore = 0.0;
  successScore = a.faliureRates.mean + (a.faliureRates.max / 3)
  # Penalty from bad-utilization statistics...
  var wordScore = a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean + a.percentBadUtilizations.max
  # ...scaled down by the good-utilization percentages: this line computes
  # 1/(mean/100), i.e. 100/mean.
  wordScore *= (1/ (a.percentGoodUtilizations.mean / 100))
  # NOTE(review): parenthesization differs from the line above -- this is
  # (1/mean)/100, not 1/(mean/100); the two differ by a factor of 10000.
  # Possibly intended to mirror the previous line; confirm before changing.
  wordScore *= (1 / (a.usedWordPercentsGood.mean) / 100)
  result = (successScore*0.80)+(wordScore*0.20)
|
|
|
|
|
|
proc serializeAccuracy(db : DbConn, a : Accuracy, macroScoreId : int64, language : string) : int64 =
  ## Persists one Accuracy row plus its three CountTable batches
  ## (languages-confused-for, correct-word counts, incorrect-word counts).
  ## Returns the rowid of the inserted Accuracy row.
  db.exec("insert into Accuracy(MacroScoreId, Language, Correct, Incorrect, Faliures) VALUES (?, ?, ?, ?, ?)",
          toDbValues(macroScoreId, a.language, a.correct, a.incorrect, a.faliures))
  result = db.lastInsertRowId()
  let accuracyId = result

  let countyInsert = """INSERT INTO CountTable (MacroScoreId, AccuracyId, Language, StrKey, IntVal, Type)
VALUES (?, ?, ?, ?, ?, ?);"""

  # All three batches share the same row shape; only the Type tag differs.
  template batchRows(counts, tag): untyped =
    counts.pairs.toSeq().map(x => toDbValues(macroScoreId, accuracyId, language, x[0], x[1], tag))

  db.execMany(countyInsert, batchRows(a.languagesConfusedFor, "LanguagesConfusedFor"))
  db.execMany(countyInsert, batchRows(a.correctWordCounts, "correctWords"))
  db.execMany(countyInsert, batchRows(a.incorrectWordCounts, "incorrectWords"))
|
|
|
|
proc serializeScore(db : DbConn, a : Score, macroScoreId : int64, accuracyId : int64) =
  ## Inserts one per-language Score row, linked to its MacroScore and to the
  ## Accuracy row produced by serializeAccuracy. The positional values in
  ## `dbval` follow the column list of the INSERT statement below exactly.
  let dbval = toDbValues(macroScoreId, accuracyId, a.language, a.faliureRate, a.totalWordUtilization, a.utlizationPerWord, a.totalGoodWordUtilization, a.goodUtilizationPerWord, a.percentGoodUtilization, a.totalBadWordUtilization, a.badUtilizationPerWord, a.percentBadUtilization, a.usedWordPercentGood, a.usedWordPercentBad)

  let insertStatement = """
INSERT INTO Score (
  MacroScoreId,
  AccuracyRowId,
  Language,
  FaliureRate,
  TotalWordUtilization,
  UtlizationPerWord,
  TotalGoodWordUtilization,
  GoodUtilizationPerWord,
  PercentGoodUtilization,
  TotalBadWordUtilization,
  BadUtilizationPerWord,
  PercentBadUtilization,
  UsedWordPercentGood,
  UsedWordPercentBad
) VALUES (
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?,
  ?
);"""
  db.exec(insertStatement, dbval)
|
|
|
|
proc serializeRunningStat(db : DbConn, a : RunningStat, macroScoreId : int64) : int64 =
  ## Stores the summary statistics of `a` as one RunningStat row tied to
  ## `macroScoreId`; returns the new row's id.
  db.exec("""INSERT INTO RunningStat (MacroScoreId, Max, Min, Sum, Mean, StdDeviation) VALUES (?, ?, ?, ?, ?, ?)""",
          toDbValues(macroScoreId, a.max, a.min, a.sum, a.mean, a.standardDeviation))
  db.lastInsertRowId()
|
|
|
|
proc makeMacroScoreDb(db : DbConn, generationRowId : int64) : int64 =
  ## Creates a placeholder MacroScore row for the generation and returns its
  ## rowid. IsPopulated = 1 marks it pending; seralizeMacroScores later
  ## flips it to 0 once the row is fully filled in.
  db.exec("insert into MacroScore(IsPopulated, Generation) VALUES (1, ?)", generationRowId)
  result = db.lastInsertRowId()
|
|
|
|
proc seralizeMacroScores(db : DbConn, macroScoreId : int, a : MacroScore, foreginKeys : array[0 .. 10, int64]) =
  ## Fills in the placeholder MacroScore row with the tallied score, the word
  ## count and the eleven RunningStat row ids, then flips IsPopulated to 0
  ## ("fully populated" -- see makeMacroScoreDb / deserializeGeneration).
  ## `foreginKeys` must be ordered like the statsArray in serializeMacroScore:
  ## FaliureRates first through UsedWordPercentsGood last, matching the
  ## column order of the UPDATE below.
  ## NOTE(review): proc name is misspelled ("seralize") -- kept for callers.
  let score = tallyScores(a)
  let dbVals = toDbValues(score, a.wordCount,
    foreginKeys[0], foreginKeys[1], foreginKeys[2], foreginKeys[3],
    foreginKeys[4], foreginKeys[5], foreginKeys[6], foreginKeys[7],
    foreginKeys[8], foreginKeys[9], foreginKeys[10], macroScoreId
  )
  let updateStatement = """UPDATE MacroScore SET
Score = ?,
WordCount = ?,

FaliureRates = ?,
TotalWordUtilizations = ?,
UtlizationPerWords = ?,

TotalGoodWordUtilizations = ?,
GoodUtilizationPerWords = ?,
PercentGoodUtilizations = ?,

TotalBadWordUtilizations = ?,
BadUtilizationPerWords = ?,
PercentBadUtilizations = ?,

UsedWordPercentsBad = ?,
UsedWordPercentsGood = ?,

IsPopulated = 0 WHERE rowid = ?"""
  db.exec(updateStatement, dbVals)
|
|
|
|
proc wordsToString(a : Table[string, HashSet[string]]) : string =
  ## Canonical, order-independent textual form of a vocabulary table: the
  ## characters of `$a` sorted ascending. Table/HashSet iteration order is
  ## unstable, but the sorted character multiset is the same regardless of
  ## order, so this gives a stable input for hashing.
  var chars = toSeq($a)
  chars.sort()
  result = chars.join("")
|
|
|
|
|
|
proc vocabAlreadyExists*(trainingDb : DbConn, a : Table[string, HashSet[string]] ) : bool =
  ## True when WordSums already holds the SHA-256 fingerprint of this
  ## vocabulary, i.e. the identical word set was serialized before.
  let fingerprint = toHex($computeSHA256(wordsToString(a)))
  trainingDb.one("select * from WordSums where Sha256 = ?", fingerprint).isSome()
|
|
|
|
proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId : int) =
  ## Persists a complete MacroScore: its vocabulary fingerprint and words,
  ## the eleven RunningStat rows, the MacroScore row itself, and one
  ## Accuracy + Score pair per language. Does nothing when an identical
  ## vocabulary (by SHA-256) was already stored.
  if vocabAlreadyExists(trainingDb, a.words):
    return
  # Record the fingerprint so future runs can skip this vocabulary.
  let sha256 = toHex $computeSHA256(wordsToString(a.words))
  let vocabStmt = "INSERT INTO WordSums(Sha256) VALUES (?)"
  trainingDb.exec(vocabStmt, $sha256)

  # trainingDb.exec("PRAGMA synchronous = EXTRA")
  # trainingDb.exec("PRAGMA journal_mode = WAL")
  trainingDb.exec("PRAGMA foreign_keys = OFF")

  let macroDbId = makeMacroScoreDb(trainingDb, generationRowId)

  # Bulk-insert every word of every language, tagged with the new row id.
  let stmt = "INSERT INTO Words(StrKey, Language, MacroScoreId) VALUES (?, ?, ?)"
  for (language, vocab) in a.words.pairs:
    let rows = collect(for word in vocab: toDbValues(word, language, macroDbId))
    trainingDb.execMany(stmt, rows)

  # Order here must match the column order used by seralizeMacroScores.
  let statsArray = @[a.faliureRates, a.totalWordUtilizations, a.utlizationPerWords,
    a.totalGoodWordUtilizations, a.goodUtilizationPerWords, a.percentGoodUtilizations,
    a.totalBadWordUtilizations, a.badUtilizationPerWords, a.percentBadUtilizations,
    a.usedWordPercentsBad, a.usedWordPercentsGood]
  # Renamed from `stats`: that name shadowed the imported std `stats` module.
  var statRowIds : array[0..10, int64]
  for (i, st) in enumerate statsArray:
    statRowIds[i] = serializeRunningStat(trainingDb, st, macroDbId)

  seralizeMacroScores(trainingDb, macroDbId, a, statRowIds)
  # (An unused `tallyScores(a)` call that previously sat here was removed;
  # seralizeMacroScores computes the tally itself.)
  for (language, langScore) in a.scores.pairs:
    let accuracyId = serializeAccuracy(trainingDb, langScore.accuracy, macroDbId, language)
    serializeScore(trainingDb, langScore, macroDbId, accuracyId)
|
|
|
|
proc createWordAccuracyTable*(a : GeneticBase) : Table[string, Table[string, float]] =
  ## For every language in the gene, maps each vocabulary word to a penalty
  ## (higher = worse). Never-seen words get a huge fixed penalty; otherwise
  ## the penalty is a miss-rate scaled by a cubic normalization of how rarely
  ## the word appears overall.
  # Cubic easing curve used as the normalization factor. NOTE: the local `a`
  # below shadows the GeneticBase parameter inside this helper.
  proc cubic(x : float) : float =
    let h = -0.3
    let a = 2.5
    let k = 0.0
    return a*(x-h)^3+k
  for (language, score) in a.scoreGene.pairs:
    let accuracy = score.accuracy
    let iTable = accuracy.incorrectWordCounts
    let cTable = accuracy.correctWordCounts

    # All sightings of each word, right or wrong.
    var totalTable = accuracy.correctWordCounts
    totalTable.merge(accuracy.incorrectWordCounts)

    # Skip languages with no observations at all (empty count table).
    if totalTable.values.toSeq().high == -1: continue
    let totalFound = totalTable.values.toSeq().foldl(a+b)

    result[language] = initTable[string, float]()
    let weight = 1.0  # currently constant; kept as a tuning knob
    for word in a.wordsGene[language]:
      let percent =
        #if its never been found, its useless, so its 100% wrong
        if word notin cTable:
          500000.0
        #if its not found in iTable that means its 100% right
        elif word notin iTable:
          let step1 = weight
          let normalizationFactor = cubic(1-((cTable[word] / totalFound)+0.1))
          step1*normalizationFactor
        else:
          # Miss rate in percent, scaled by the rarity normalization.
          # NOTE(review): the grouping differs from the branch above --
          # here the +0.1 is added to (1 - c/total), i.e. 1-c/t+0.1, while
          # above it is 1-(c/t+0.1) = 1-c/t-0.1. Confirm this asymmetry is
          # intentional.
          let step1 = (iTable[word] / totalTable[word])*100
          let normalizationFactor = cubic((1-(cTable[word] / totalFound)+0.1))
          step1*normalizationFactor

      result[language][word] = percent
|
|
proc insertGeneration*(db : DbConn, a : Generation) : int64 =
  ## Stores a Generation's start/end times and returns the new row's id.
  db.exec("""INSERT INTO Generation(TimeStarted, TimeEnded) VALUES(?, ?);""", a.startTime, a.endTime)
  db.lastInsertRowId()
|
|
|
|
proc setupAccuracies(countTables : seq[(string, string, string, int64)], accuracies : seq[(string, int64, int64, int64)]) : (Table[string, Accuracy], Table[string, HashSet[string]]) =
  ## Reassembles per-language Accuracy objects and word dictionaries from raw
  ## DB tuples.
  ## countTables: (language, tableType, key, count) rows, where tableType is
  ##   "LanguagesConfusedFor", "incorrectWords" or "correctWords".
  ## accuracies: (language, correct, incorrect, faliures) rows.
  ## Returns (language -> Accuracy, language -> word set).
  let languages = toHashSet accuracies.map(x=>x[0])

  for lang in languages:
    var incorrectWords = initCountTable[string]()
    var LanguagesConfusedFor = initCountTable[string]()
    var correctWords = initCountTable[string]()
    var dictonary : HashSet[string]
    # O(languages * countTables): every row is rescanned once per language.
    for (language, typeOfTable, key, val) in countTables:
      if language == lang:
        # NOTE(review): this incl runs before the tableType check, so keys of
        # "LanguagesConfusedFor" rows (language names) also land in the word
        # dictionary -- verify that is intended.
        dictonary.incl(key)
        if typeOfTable == "LanguagesConfusedFor":
          LanguagesConfusedFor.inc(key, val)
        elif typeOfTable == "incorrectWords":
          incorrectWords.inc(key, val)
        elif typeOfTable == "correctWords":
          correctWords.inc(key, val)
    # Locate this language's accuracy counters; `languages` was built from
    # `accuracies`, so a match must exist (the doAssert guards it).
    var score : (string, int64, int64, int64)
    for scores in accuracies:
      if lang == scores[0]:
        score = scores
        break
    doAssert(score[0] != "")
    let correctCount = score[1]
    let incorrectCount = score[2]
    let failiureCount = score[3]
    var newAcc = Accuracy()
    newAcc.language = lang
    newAcc.correct = correctCount
    newAcc.incorrect = incorrectCount
    newAcc.faliures = failiureCount
    # NOTE(review): `wordCount` is not declared in this proc or its parameter
    # list -- presumably a module-level symbol exported by ./main or
    # ./scoring. Confirm it carries the value intended for this Accuracy.
    newAcc.wordCount = wordCount
    newAcc.languagesConfusedFor = LanguagesConfusedFor
    newAcc.correctWordCounts = correctWords
    newAcc.incorrectWordCounts = incorrectWords
    result[0][lang] = newAcc
    result[1][lang] = dictonary
|
|
|
|
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
  ## Rebuilds a MacroScore from the rows written by serializeMacroScore.
  ## `wordCount` is accepted for interface compatibility but is not used in
  ## this proc.
  # (Removed dead code: a `languages` deduplication and a Words-table query
  # whose results were never consumed -- both read-only, no behavior change.)
  let data = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))

  let accuracies = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))

  let (accuracy, dictonary) = setupAccuracies(data, accuracies)

  result = makeMacroScore(score(accuracy), dictonary)
|
|
|
|
proc deserializeGeneration*(db : DbConn, generationNumber : int64) : seq[MacroScore] =
  ## Loads every fully-populated (IsPopulated = 0) MacroScore that belongs to
  ## the given generation.
  let rows = db.iterate("select Rowid, WordCount from MacroScore where generation = ? and IsPopulated = 0;", generationNumber).toSeq().map(x=> (x[0].intval, x[1].intval))
  for (rowId, wordTotal) in rows:
    result.add(deserializeMacroScore(db, rowId, wordTotal))
|
|
|
|
proc createBestModel*(db : DbConn) : MacroScore =
  ## Assembles a "best of" model: for every language, picks the MacroScore
  ## with the lowest FaliureRate, then stitches the winners' Accuracy and
  ## CountTable rows into a single MacroScore.
  ## (Removed two debug `echo` statements; one of them ran an extra query
  ## purely to print its result.)
  # Temp table mapping each language to the macroscoreid that scored best
  # (lowest FaliureRate) for that language.
  db.execScript("""
DROP TABLE IF EXISTS tmptable;
CREATE temporary TABLE tmptable AS
SELECT DISTINCT( s.language ),
  (SELECT macroscoreid
   FROM score AS s1
   WHERE s1.language = s.language
   ORDER BY faliurerate ASC
   LIMIT 1) AS macroscoreidbest
FROM score AS s;
""")

  # Accuracy counters of each winning MacroScore, keyed by language.
  let languageToPerformance = db.iterate("""
SELECT tmptable.language,
  macroscoreidbest,
  correct,
  Faliures,
  incorrect FROM tmptable JOIN accuracy
ON accuracy.macroscoreid = macroscoreidbest
AND accuracy.language = tmptable.language;
""").toSeq().map(x=> (x[0].strval, x[2].intval, x[3].intval, x[4].intval))

  # Count-table rows of each winning MacroScore, keyed by language.
  let languageToCounttable = db.iterate(
    """ SELECT tmptable.language,
  macroscoreidbest,
  type,
  strkey,
  intval
FROM tmptable
JOIN counttable
ON counttable.macroscoreid = macroscoreidbest
AND counttable.language = tmptable.language;
""").toSeq().map(x=> (x[0].strval, x[2].strval, x[3].strval, x[4].intval))

  let (accuracy, dictionary) = setupAccuracies(languageToCounttable, languageToPerformance)
  return makeMacroScore(score(accuracy), dictionary)
|
|
|
|
proc createBestFromWordsBest*(db : DbConn, wordCount : int) : Table[string, HashSet[string]] =
  ## Builds a vocabulary from the WordScore table: the top `wordCount`
  ## highest-scoring words per language, keeping only languages that also
  ## appear in main.mostCommonWords.
  let topWords = db.iterate("""
WITH RankedWords AS (
    SELECT
        language,
        word,
        ROW_NUMBER() OVER (PARTITION BY language ORDER BY score DESC) AS rn
    FROM
        WordScore
)
SELECT
    language,
    word
FROM
    RankedWords
WHERE
    rn <= ?;
""", wordCount).toSeq().map(x=>(x[0].strval, x[1].strval))
  for (language, word) in topWords:
    if language notin main.mostCommonWords: continue
    # mgetOrPut creates the language's set on first sight, then appends.
    result.mgetOrPut(language, initHashSet[string]()).incl(word)
|