Zipfs-Law-Language-Detector/nim/geneticTools.nim
2024-10-13 15:18:16 -04:00

334 lines
13 KiB
Nim

import tiny_sqlite
import sequtils
import sugar
import tables
import times
import algorithm
import strutils
import stats
import sets
import std/enumerate
import nimSHA2
import ./main
import ./scoring
import strformat
import math
## A lot of this file is now deprecated because a different algorithm was introduced.
## However, should the original algorithm be reintroduced, this code will be useful again,
## so it remains in the codebase.
type
  GeneticBase* = object
    ## Bundle of the two per-language tables used together by
    ## `createWordAccuracyTable`; both are indexed by the same language key.
    scoreGene* : Table[string, Score]            ## language -> Score for that language
    wordsGene* : Table[string, HashSet[string]]  ## language -> set of words for that language
  Generation* = object
    ## Start/end stamps of one generation; stored by `insertGeneration` in the
    ## `Generation(TimeStarted, TimeEnded)` table.
    startTime* : int64  ## NOTE(review): unit/epoch not visible here — confirm with caller
    endTime* : int64    ## NOTE(review): unit/epoch not visible here — confirm with caller
proc wordsAndScoreToGene*(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
  ## Packs a words table (`a`) and a score table (`b`) into a `GeneticBase`:
  ## `a` becomes `wordsGene`, `b` becomes `scoreGene`.
  GeneticBase(scoreGene: b, wordsGene: a)
proc tallyScores*(a : MacroScore) : float =
  ## Collapses the running statistics of a `MacroScore` into one float:
  ## 80% of a failure-rate term plus 20% of a word-utilization term.
  ##
  ## The failure term is `mean + max/3` of `faliureRates`. The word term is the
  ## sum of the "bad utilization" figures, scaled by two reciprocal factors
  ## derived from the "good" figures.
  # Biggest portion, how successful it is
  let failureTerm = a.faliureRates.mean + (a.faliureRates.max / 3)
  let badUsage = a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean + a.percentBadUtilizations.max
  # Evaluates to 100 / mean.
  let goodUtilizationFactor = 1 / (a.percentGoodUtilizations.mean / 100)
  # Evaluates to 1 / (mean * 100).
  # NOTE(review): parenthesised differently from the factor above — confirm
  # whether `1 / (mean / 100)` was intended here as well.
  let goodWordFactor = 1 / (a.usedWordPercentsGood.mean) / 100
  let wordTerm = badUsage * goodUtilizationFactor * goodWordFactor
  (failureTerm*0.80)+(wordTerm*0.20)
proc serializeAccuracy(db : DbConn, a : Accuracy, macroScoreId : int64, language : string) : int64 =
  ## Inserts one `Accuracy` row for `a` and then one `CountTable` row per entry
  ## of its three count tables (confused languages, correct words, incorrect
  ## words). Returns the row id of the inserted `Accuracy` row.
  ##
  ## The `Accuracy` row stores `a.language`; the `CountTable` rows store the
  ## `language` parameter.
  db.exec("insert into Accuracy(MacroScoreId, Language, Correct, Incorrect, Faliures) VALUES (?, ?, ?, ?, ?)",
          toDbValues(macroScoreId, a.language, a.correct, a.incorrect, a.faliures))
  let accuracyId = db.lastInsertRowId()
  let countyInsert = """INSERT INTO CountTable (MacroScoreId, AccuracyId, Language, StrKey, IntVal, Type)
VALUES (?, ?, ?, ?, ?, ?);"""

  proc rowsFor(counts : CountTable[string], kind : string) : seq[seq[DbValue]] =
    # One parameter row per (key, count) pair, tagged with the table kind.
    for strKey, intVal in counts:
      result.add toDbValues(macroScoreId, accuracyId, language, strKey, intVal, kind)

  let confusedRows = rowsFor(a.languagesConfusedFor, "LanguagesConfusedFor")
  let correctRows = rowsFor(a.correctWordCounts, "correctWords")
  let incorrectRows = rowsFor(a.incorrectWordCounts, "incorrectWords")
  db.execMany(countyInsert, confusedRows)
  db.execMany(countyInsert, correctRows)
  db.execMany(countyInsert, incorrectRows)
  result = accuracyId
proc serializeScore(db : DbConn, a : Score, macroScoreId : int64, accuracyId : int64) =
  ## Inserts one row into the `Score` table for `a`, linked to the given
  ## `MacroScore` row and `Accuracy` row. The bound values are listed in the
  ## same order as the columns of the INSERT statement.
  const insertStatement = """
INSERT INTO Score (
MacroScoreId,
AccuracyRowId,
Language,
FaliureRate,
TotalWordUtilization,
UtlizationPerWord,
TotalGoodWordUtilization,
GoodUtilizationPerWord,
PercentGoodUtilization,
TotalBadWordUtilization,
BadUtilizationPerWord,
PercentBadUtilization,
UsedWordPercentGood,
UsedWordPercentBad
) VALUES (
?,
?,
?,
?,
?,
?,
?,
?,
?,
?,
?,
?,
?,
?
);"""
  let rowValues = toDbValues(
    macroScoreId,
    accuracyId,
    a.language,
    a.faliureRate,
    a.totalWordUtilization,
    a.utlizationPerWord,
    a.totalGoodWordUtilization,
    a.goodUtilizationPerWord,
    a.percentGoodUtilization,
    a.totalBadWordUtilization,
    a.badUtilizationPerWord,
    a.percentBadUtilization,
    a.usedWordPercentGood,
    a.usedWordPercentBad
  )
  db.exec(insertStatement, rowValues)
proc serializeRunningStat(db : DbConn, a : RunningStat, macroScoreId : int64) : int64 =
  ## Stores the max, min, sum, mean and standard deviation of `a` as one
  ## `RunningStat` row tied to `macroScoreId`; returns the new row id.
  db.exec("""INSERT INTO RunningStat (MacroScoreId, Max, Min, Sum, Mean, StdDeviation) VALUES (?, ?, ?, ?, ?, ?)""",
          toDbValues(macroScoreId, a.max, a.min, a.sum, a.mean, a.standardDeviation))
  result = db.lastInsertRowId()
proc makeMacroScoreDb(db : DbConn, generationRowId : int64) : int64 =
  ## Inserts a placeholder `MacroScore` row (`IsPopulated` = 1) for the given
  ## generation and returns its row id.
  db.exec("insert into MacroScore(IsPopulated, Generation) VALUES (1, ?)", generationRowId)
  result = db.lastInsertRowId()
proc seralizeMacroScores(db : DbConn, macroScoreId : int, a : MacroScore, foreginKeys : array[0 .. 10, int64]) =
  ## Fills in an existing `MacroScore` row: writes the tallied score, the word
  ## count and the eleven `RunningStat` row ids in `foreginKeys` (in column
  ## order), and sets `IsPopulated` to 0 for the row whose rowid is
  ## `macroScoreId`.
  let updateStatement = """UPDATE MacroScore SET
Score = ?,
WordCount = ?,
FaliureRates = ?,
TotalWordUtilizations = ?,
UtlizationPerWords = ?,
TotalGoodWordUtilizations = ?,
GoodUtilizationPerWords = ?,
PercentGoodUtilizations = ?,
TotalBadWordUtilizations = ?,
BadUtilizationPerWords = ?,
PercentBadUtilizations = ?,
UsedWordPercentsBad = ?,
UsedWordPercentsGood = ?,
IsPopulated = 0 WHERE rowid = ?"""
  # Parameter order: score, word count, the 11 foreign keys, then the rowid.
  var params = toDbValues(tallyScores(a), a.wordCount)
  for key in foreginKeys:
    params.add toDbValue(key)
  params.add toDbValue(macroScoreId)
  db.exec(updateStatement, params)
proc wordsToString(a : Table[string, HashSet[string]]) : string =
  ## Stringifies the table with `$`, then sorts the characters of that string
  ## and joins them back together. The result therefore depends only on which
  ## characters the rendering contains, not on their order.
  var chars = toSeq($a)
  chars.sort()
  result = chars.join("")
proc vocabAlreadyExists*(trainingDb : DbConn, a : Table[string, HashSet[string]] ) : bool =
  ## True when the `WordSums` table already holds the SHA-256 fingerprint of
  ## `a` (computed over `wordsToString(a)`).
  let digest = toHex($computeSHA256(wordsToString(a)))
  let existing = trainingDb.one("select * from WordSums where Sha256 = ?", $digest)
  result = existing.isSome()
proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId : int) =
  ## Persists a whole `MacroScore` for the given generation.
  ##
  ## Does nothing if the fingerprint of `a.words` is already in `WordSums`.
  ## Otherwise it records the fingerprint, creates a placeholder `MacroScore`
  ## row, stores every word, the eleven running statistics, the summary row,
  ## and finally one `Accuracy` + `Score` row per language in `a.scores`.
  if vocabAlreadyExists(trainingDb, a.words):
    return
  let sha256 = toHex $computeSHA256(wordsToString(a.words))
  let vocabStmt = "INSERT INTO WordSums(Sha256) VALUES (?)"
  trainingDb.exec(vocabStmt, $sha256)
  # trainingDb.exec("PRAGMA synchronous = EXTRA")
  # trainingDb.exec("PRAGMA journal_mode = WAL")
  trainingDb.exec("PRAGMA foreign_keys = OFF")
  let macroDbId = makeMacroScoreDb(trainingDb, generationRowId)
  let stmt = "INSERT INTO Words(StrKey, Language, MacroScoreId) VALUES (?, ?, ?)"
  # One batch insert per language: (word, language, macro score id).
  for (key,val) in a.words.pairs:
    let words = collect(for word in val: toDbValues(word, key, macroDbId))
    trainingDb.execMany(stmt, words)
  # Order must match the column order used by seralizeMacroScores.
  let statsArray = @[a.faliureRates, a.totalWordUtilizations, a.utlizationPerWords,
    a.totalGoodWordUtilizations, a.goodUtilizationPerWords, a.percentGoodUtilizations,
    a.totalBadWordUtilizations, a.badUtilizationPerWords, a.percentBadUtilizations,
    a.usedWordPercentsBad, a.usedWordPercentsGood]
  var stats : array[0..10, int64]
  for (i,x) in enumerate statsArray:
    stats[i] = serializeRunningStat(trainingDb, x, macroDbId)
  # The tallied score is computed inside seralizeMacroScores; no need to
  # compute it here as well.
  seralizeMacroScores(trainingDb, macroDbId, a, stats)
  for (key,val) in a.scores.pairs:
    # serializeAccuracy returns the Accuracy row id that the Score row links to.
    let accuracyId = serializeAccuracy(trainingDb, val.accuracy, macroDbId, key)
    serializeScore(trainingDb, val, macroDbId, accuracyId)
proc createWordAccuracyTable*(a : GeneticBase) : Table[string, Table[string, float]] =
  ## Builds, for every language in `a.scoreGene` that has at least one counted
  ## word, a table mapping each word of `a.wordsGene[language]` to a float
  ## derived from how often the word appears in the correct / incorrect counts.
  ##
  ## * word absent from the correct counts  -> fixed value 500000.0
  ## * word absent from the incorrect counts -> `cubic(...)` of its share
  ## * otherwise -> percentage of incorrect hits times `cubic(...)`
  ##
  ## Languages whose merged count table is empty are skipped.
  proc cubic(x : float) : float =
    # Shifted, scaled cubic: 2.5 * (x + 0.3)^3 + 0.0
    let shift = -0.3
    let scale = 2.5
    let offset = 0.0
    return scale*(x-shift)^3+offset

  for (language, score) in a.scoreGene.pairs:
    let wrongCounts = score.accuracy.incorrectWordCounts
    let rightCounts = score.accuracy.correctWordCounts
    # Merged counts: correct + incorrect occurrences per word.
    var allCounts = score.accuracy.correctWordCounts
    allCounts.merge(score.accuracy.incorrectWordCounts)
    let occurrences = toSeq(allCounts.values)
    if occurrences.len == 0: continue
    var totalFound = occurrences[0]
    for idx in 1 .. occurrences.high:
      totalFound = totalFound + occurrences[idx]
    let weight = 1.0
    var perWord = initTable[string, float]()
    for word in a.wordsGene[language]:
      let percent =
        #if its never been found, its useless, so its 100% wrong
        if word notin rightCounts:
          500000.0
        #if its not found in the incorrect counts that means its 100% right
        elif word notin wrongCounts:
          # Argument evaluates to 0.9 - share.
          weight * cubic(1-((rightCounts[word] / totalFound)+0.1))
        else:
          # Argument evaluates to 1.1 - share.
          # NOTE(review): parenthesised differently from the branch above —
          # confirm which form is intended.
          ((wrongCounts[word] / allCounts[word])*100) * cubic((1-(rightCounts[word] / totalFound)+0.1))
      perWord[word] = percent
    result[language] = perWord
proc insertGeneration*(db : DbConn, a : Generation) : int64 =
  ## Writes `a.startTime` / `a.endTime` as a new `Generation` row and returns
  ## the row id assigned to it.
  db.exec("""INSERT INTO Generation(TimeStarted, TimeEnded) VALUES(?, ?);""", a.startTime, a.endTime)
  result = db.lastInsertRowId()
proc setupAccuracies(countTables : seq[(string, string, string, int64)], accuracies : seq[(string, int64, int64, int64)]) : (Table[string, Accuracy], Table[string, HashSet[string]]) =
  ## Rebuilds per-language `Accuracy` objects and key sets from flat DB rows.
  ##
  ## `countTables` rows are `(language, type, key, value)`; `accuracies` rows
  ## are `(language, correct, incorrect, failures)`. For every language that
  ## appears in `accuracies` this returns, in tuple slot 0, its `Accuracy`, and
  ## in slot 1, the set of every key seen for that language in `countTables`
  ## (regardless of the row's type).
  for lang in toHashSet(accuracies.map(x=>x[0])):
    var
      wrongWords = initCountTable[string]()
      confusedWith = initCountTable[string]()
      rightWords = initCountTable[string]()
      seenKeys : HashSet[string]
    for (rowLanguage, typeOfTable, key, val) in countTables:
      if rowLanguage != lang: continue
      seenKeys.incl(key)
      case typeOfTable
      of "LanguagesConfusedFor":
        confusedWith.inc(key, val)
      of "incorrectWords":
        wrongWords.inc(key, val)
      of "correctWords":
        rightWords.inc(key, val)
      else:
        discard
    # First accuracy row for this language.
    var score : (string, int64, int64, int64)
    for candidate in accuracies:
      if candidate[0] == lang:
        score = candidate
        break
    doAssert(score[0] != "")
    var newAcc = Accuracy()
    newAcc.language = lang
    newAcc.correct = score[1]
    newAcc.incorrect = score[2]
    newAcc.faliures = score[3]
    # NOTE(review): `wordCount` is not declared in this proc — presumably it
    # resolves to a symbol from another module; confirm.
    newAcc.wordCount = wordCount
    newAcc.languagesConfusedFor = confusedWith
    newAcc.correctWordCounts = rightWords
    newAcc.incorrectWordCounts = wrongWords
    result[0][lang] = newAcc
    result[1][lang] = seenKeys
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
  ## Rebuilds a `MacroScore` from the `CountTable` and `Accuracy` rows that
  ## belong to `MacroScoreId`.
  ##
  ## `wordCount` is accepted for interface compatibility but is not used here.
  ## NOTE(review): the per-language word sets come from `CountTable` keys via
  ## `setupAccuracies`; the `Words` table is not read — confirm that is intended.
  # Rows: (language, type, key, value)
  let data = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))
  # Rows: (language, correct, incorrect, failures)
  let accuracies = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))
  let (accuracy, dictonary) = setupAccuracies(data, accuracies)
  result = makeMacroScore(score(accuracy), dictonary)
proc deserializeGeneration*(db : DbConn, generationNumber : int64) : seq[MacroScore] =
  ## Loads every `MacroScore` of the given generation whose `IsPopulated`
  ## column is 0, deserializing each one via `deserializeMacroScore`.
  let populated = db.iterate("select Rowid, WordCount from MacroScore where generation = ? and IsPopulated = 0;", generationNumber).toSeq().map(x=> (x[0].intval, x[1].intval))
  for row in populated:
    # row = (rowid, word count)
    result.add(deserializeMacroScore(db, row[0], row[1]))
proc createBestModel*(db : DbConn) : MacroScore =
  ## Assembles a `MacroScore` from the best stored run of each language.
  ##
  ## A temporary table maps every language in `score` to the `macroscoreid`
  ## with the lowest `faliurerate` for that language. The matching `accuracy`
  ## and `counttable` rows are then loaded and handed to `setupAccuracies`.
  ## Progress is echoed to stdout.
  db.execScript("""
DROP TABLE IF EXISTS tmptable;
CREATE temporary TABLE tmptable AS
SELECT DISTINCT( s.language ),
(SELECT macroscoreid
FROM score AS s1
WHERE s1.language = s.language
ORDER BY faliurerate ASC
LIMIT 1) AS macroscoreidbest
FROM score AS s;
""")
  echo db.iterate("""select language, macroscoreidbest from tmptable""").toSeq().map(x=> (x[0].strval, x[1].intval))
  # Result columns: 0 language, 1 macroscoreidbest, 2 correct, 3 Faliures,
  # 4 incorrect. setupAccuracies expects (language, correct, incorrect,
  # failures), so column 4 must come before column 3 in the tuple.
  let languageToPerformance = db.iterate("""
SELECT tmptable.language,
macroscoreidbest,
correct,
Faliures,
incorrect FROM tmptable JOIN accuracy
ON accuracy.macroscoreid = macroscoreidbest
AND accuracy.language = tmptable.language;
""").toSeq().map(x=> (x[0].strval, x[2].intval, x[4].intval, x[3].intval))
  echo "step2!"
  # Result columns: 0 language, 1 macroscoreidbest, 2 type, 3 strkey, 4 intval.
  let languageToCounttable = db.iterate(
    """ SELECT tmptable.language,
macroscoreidbest,
type,
strkey,
intval
FROM tmptable
JOIN counttable
ON counttable.macroscoreid = macroscoreidbest
AND counttable.language = tmptable.language;
""").toSeq().map(x=> (x[0].strval, x[2].strval, x[3].strval, x[4].intval))
  let (accuracy, dictionary) = setupAccuracies(languageToCounttable, languageToPerformance)
  return makeMacroScore(score(accuracy), dictionary)
proc createBestFromWordsBest*(db : DbConn, wordCount : int) : Table[string, HashSet[string]] =
  ## Returns, per language, the set of up to `wordCount` words with the highest
  ## `score` in the `WordScore` table. Languages that are not present in
  ## `main.mostCommonWords` are left out.
  let rankedRows = db.iterate("""
WITH RankedWords AS (
SELECT
language,
word,
ROW_NUMBER() OVER (PARTITION BY language ORDER BY score DESC) AS rn
FROM
WordScore
)
SELECT
language,
word
FROM
RankedWords
WHERE
rn <= ?;
""", wordCount).toSeq().map(x=>(x[0].strval, x[1].strval))
  for (language, word) in rankedRows:
    if language in main.mostCommonWords:
      # Creates the language's set on first sight, then adds the word.
      result.mgetOrPut(language, initHashSet[string]()).incl(word)