Restructure needed for genetic training, but here are some bug fixes

This commit is contained in:
user 2024-08-07 12:24:25 -04:00
parent 57463e47bf
commit a19a5d249c
5 changed files with 154 additions and 69 deletions

BIN
nim/geneticTools Executable file

Binary file not shown.

View file

@ -12,6 +12,8 @@ import sets
import std/enumerate
import nimSHA2
import ./main
import strformat
import math
type
GeneticBase* = object
@ -21,13 +23,17 @@ type
startTime* : int64
endTime* : int64
proc wordsAndScoreToGene*(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
  ## Bundles a per-language word table (`a`) and a per-language score table
  ## (`b`) into one `GeneticBase`. Only `wordsGene` and `scoreGene` are set;
  ## every other field keeps its default value.
  GeneticBase(wordsGene: a, scoreGene: b)
proc tallyScores*(a : MacroScore) : float =
  ## Collapses a `MacroScore` into a single number: 80% failure-rate term
  ## plus 20% word-utilization term.
  ##
  ## NOTE(review): callers sort these values ascending, so a lower score
  ## presumably means a better organism — confirm.
  ## NOTE(review): the two divisions below are unguarded; a "good" mean of 0
  ## produces an infinite (or NaN) score.

  # Biggest portion, how successful it is
  let successScore = a.faliureRates.mean + (a.faliureRates.max / 3)

  var wordScore = a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean + a.percentBadUtilizations.max
  # Scale the bad-utilization term by the reciprocal of each "good" mean,
  # expressed as a fraction (mean / 100). Both factors use the same form; the
  # second one was previously parenthesised as (1 / mean) / 100.
  wordScore *= 1 / (a.percentGoodUtilizations.mean / 100)
  wordScore *= 1 / (a.usedWordPercentsGood.mean / 100)

  result = (successScore*0.80)+(wordScore*0.20)
@ -135,10 +141,9 @@ proc vocabAlreadyExists*(trainingDb : DbConn, a : Table[string, HashSet[string]]
proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId : int) =
if vocabAlreadyExists(trainingDb, a.words):
return
echo (vocabAlreadyExists(trainingDb, a.words))
let sha256 = toHex $computeSHA256(wordsToString(a.words))
let vocabStmt = "INSERT INTO WordSums(Sha256) VALUES (?)"
trainingdb.exec(vocabStmt, sha256)
trainingdb.exec(vocabStmt, $sha256)
# trainingDb.exec("PRAGMA synchronous = EXTRA")
# trainingDb.exec("PRAGMA journal_mode = WAL")
@ -168,55 +173,62 @@ proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId :
serializeScore(trainingdb, val, macroDbId, score)
proc createWordAccuracyTable*(a : GeneticBase) : Table[string, Table[string, float]] =
  ## For every language in `a.scoreGene`, maps each word of
  ## `a.wordsGene[language]` to a penalty value. Languages whose correct and
  ## incorrect word counts are both empty are left out of the result.
  ##
  ## NOTE(review): larger values presumably mean "worse word"; 500000.0 is
  ## used as a sentinel for words with no correct hits — confirm against the
  ## consumer of this table.
  proc cubic(x : float) : float =
    # scale*(x-h)^3+k with fixed coefficients.
    const
      h = -0.3
      scale = 2.5
      k = 0.0
    scale*(x-h)^3+k

  for (language, score) in a.scoreGene.pairs:
    let accuracy = score.accuracy
    let iTable = accuracy.incorrectWordCounts
    let cTable = accuracy.correctWordCounts
    # Combined correct + incorrect counts per word.
    var totalTable = accuracy.correctWordCounts
    totalTable.merge(accuracy.incorrectWordCounts)
    # Nothing was counted for this language, so there is nothing to rate.
    if totalTable.len == 0: continue
    let totalFound = totalTable.values.toSeq().foldl(a+b)
    result[language] = initTable[string, float]()
    let weight = 1.0
    for word in a.wordsGene[language]:
      let percent =
        # No correct hits at all: treat the word as useless.
        # NOTE(review): this tests the correct-count table only, so a word
        # seen solely in incorrect matches also lands here — confirm whether
        # `totalTable` was intended.
        if word notin cTable:
          500000.0
        # Never counted as incorrect: penalty depends only on how small the
        # word's share of all counted hits is.
        elif word notin iTable:
          let normalizationFactor = cubic(1-((cTable[word] / totalFound)+0.1))
          weight*normalizationFactor
        # Seen both ways: incorrect percentage, scaled by the same kind of
        # share-based factor.
        # NOTE(review): this argument evaluates to 1.1 - share, whereas the
        # branch above uses 0.9 - share; confirm which one is intended.
        else:
          let step1 = (iTable[word] / totalTable[word])*100
          let normalizationFactor = cubic((1-(cTable[word] / totalFound)+0.1))
          step1*normalizationFactor
      result[language][word] = percent
proc insertGeneration*(db : DbConn, a : Generation) : int64 =
  ## Inserts one row into `Generation` holding `a.startTime` and `a.endTime`,
  ## then returns the value of `db.lastInsertRowId()`.
  const insertStmt = """INSERT INTO Generation(TimeStarted, TimeEnded) VALUES(?, ?);"""
  db.exec(insertStmt, a.startTime, a.endTime)
  result = db.lastInsertRowId()
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
let data = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))
let languages = data.map(x=>x[0]).deduplicate()
let accuracies = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))
proc setupAccuracies(countTables : seq[(string, string, string, int64)], accuracies : seq[(string, int64, int64, int64)]) : (Table[string, Accuracy], Table[string, HashSet[string]]) =
let languages = toHashSet accuracies.map(x=>x[0])
let words = db.iterate("select Language, StrKey from Words where MacroScoreId = ?", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval))
var dictonary = initTable[string, HashSet[string]]()
for (lang, word) in words:
if lang notin dictonary:
dictonary[lang] = initHashSet[string]()
dictonary[lang].incl(word)
var accuracy : Table[string, Accuracy]
for lang in languages:
var incorrectWords = initCountTable[string]()
var LanguagesConfusedFor = initCountTable[string]()
var correctWords = initCountTable[string]()
for (language, typeOfTable, key, val) in data:
if typeOfTable == "LanguagesConfusedFor":
LanguagesConfusedFor.inc(key, val)
elif typeOfTable == "incorrectWords":
incorrectWords.inc(key, val)
elif typeOfTable == "correctWords":
correctWords.inc(key, val)
var dictonary : HashSet[string]
for (language, typeOfTable, key, val) in countTables:
if language == lang:
dictonary.incl(key)
if typeOfTable == "LanguagesConfusedFor":
LanguagesConfusedFor.inc(key, val)
elif typeOfTable == "incorrectWords":
incorrectWords.inc(key, val)
elif typeOfTable == "correctWords":
correctWords.inc(key, val)
var score : (string, int64, int64, int64)
for scores in accuracies:
if lang == scores[0]:
@ -235,11 +247,61 @@ proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) :
newAcc.languagesConfusedFor = LanguagesConfusedFor
newAcc.correctWordCounts = correctWords
newAcc.incorrectWordCounts = incorrectWords
accuracy[lang] = newAcc
return makeMacroScore(score(accuracy), dictonary)
result[0][lang] = newAcc
result[1][lang] = dictonary
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
  ## Rebuilds the `MacroScore` stored under `MacroScoreId`: reads its
  ## `CountTable` and `Accuracy` rows, hands them to `setupAccuracies`, and
  ## feeds the result to `makeMacroScore`.
  ##
  ## `wordCount` is kept for interface compatibility; it is not read here.
  let countRows = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))
  # Tuple order handed to setupAccuracies: (Language, Correct, Incorrect, Faliures).
  let accuracyRows = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))
  let (accuracy, dictonary) = setupAccuracies(countRows, accuracyRows)
  result = makeMacroScore(score(accuracy), dictonary)
proc deserializeGeneration*(db : DbConn, generationNumber : int64) : seq[MacroScore] =
  ## Loads every `MacroScore` row of generation `generationNumber` whose
  ## `IsPopulated` column is 0 and deserializes each one in query order.
  # Materialise the row list first so the outer statement is finished before
  # deserializeMacroScore issues its own queries.
  let rows = db.iterate("select Rowid, WordCount from MacroScore where generation = ? and IsPopulated = 0;", generationNumber).toSeq().map(x=> (x[0].intval, x[1].intval))
  for row in rows:
    result.add(deserializeMacroScore(db, row[0], row[1]))
proc createBestModel*(db : DbConn) : MacroScore =
  ## Assembles a composite `MacroScore`: for each language in `score` it picks
  ## the `macroscoreid` with the lowest `faliurerate`, then loads that run's
  ## `accuracy` and `counttable` rows for the language and combines them via
  ## `setupAccuracies` / `makeMacroScore`.
  ##
  ## Side effect: drops and recreates the temporary table `tmptable` on `db`.
  db.execScript("""
    DROP TABLE IF EXISTS tmptable;
    CREATE temporary TABLE tmptable AS
    SELECT DISTINCT( s.language ),
           (SELECT macroscoreid
            FROM score AS s1
            WHERE s1.language = s.language
            ORDER BY faliurerate ASC
            LIMIT 1) AS macroscoreidbest
    FROM score AS s;
  """)
  when defined(debug):
    # Diagnostic dump of the per-language winners; costs an extra query.
    echo db.iterate("""select language, macroscoreidbest from tmptable""").toSeq().map(x=> (x[0].strval, x[1].intval))

  # Column order must be (language, correct, incorrect, faliures): that is the
  # tuple layout deserializeMacroScore passes to setupAccuracies. Selecting
  # Faliures before incorrect swapped the two counts.
  let languageToPerformance = db.iterate("""
    SELECT tmptable.language,
           macroscoreidbest,
           correct,
           incorrect,
           Faliures
    FROM tmptable
    JOIN accuracy
      ON accuracy.macroscoreid = macroscoreidbest
     AND accuracy.language = tmptable.language;
  """).toSeq().map(x=> (x[0].strval, x[2].intval, x[3].intval, x[4].intval))
  when defined(debug):
    echo "step2!"

  # (language, type, strkey, intval) rows of each language's winning run.
  let languageToCounttable = db.iterate("""
    SELECT tmptable.language,
           macroscoreidbest,
           type,
           strkey,
           intval
    FROM tmptable
    JOIN counttable
      ON counttable.macroscoreid = macroscoreidbest
     AND counttable.language = tmptable.language;
  """).toSeq().map(x=> (x[0].strval, x[2].strval, x[3].strval, x[4].intval))

  let (accuracy, dictionary) = setupAccuracies(languageToCounttable, languageToPerformance)
  return makeMacroScore(score(accuracy), dictionary)

BIN
nim/geneticTraining Executable file

Binary file not shown.

View file

@ -37,13 +37,14 @@ let wordsDatabase = openDatabase("../data/words/words.db")
optimizeDB wordsDatabase
var languages = wordsDatabase.iterate("select distinct(Language) from Words;").toSeq().map(x=> x[0].strval).filter(x=> x in main.statistics)
var dbChannel : Channel[(MacroScore, int64)]
dbChannel.open()
let trainingDb = createShared(DbConn, sizeof(DbConn))
trainingDb[] = openDatabase("../data/training/training.db")
optimizeDB trainingDb[]
var macroScoreChannel : Channel[MacroScore]
macroScoreChannel.open()
@ -57,11 +58,11 @@ proc genRandomNumber(max : int) : int =
let highMidRange = int(maxFloaty*0.10)
let wordRange = rand(0 .. 10)
if wordRange in 0..3:
if wordRange in 0..1:
return rand(0..lowestRange)
elif wordRange in 4..6:
elif wordRange in 2..4:
return rand(0..lowMidRange)
elif wordRange in 7..8:
elif wordRange in 5..8:
return rand(0..highMidRange)
elif wordRange in 9..10:
return rand(0..max)
@ -86,52 +87,68 @@ proc createRandomWords(amountWanted : int, languages : seq[string]) : Table[stri
var retry = 0
if word.high >= 4:
if rand(0 .. 5) != 5:
let endy = rand(2 .. word.high)
let start = rand(0 .. endy-1)
let endy = rand(3 .. word.high)
let start = rand(0 .. endy-2)
wordToAdd = $word[start .. endy]
baseResult.incl(wordToAdd)
result[lang] = baseResult
proc mutateWords(a : Table[string, Table[string, float]]) : Table[string, HashSet[string]]=
let sizeOfMutation = 7
let newWordsTable = createRandomWords((sizeOfMutation-1)*5, a.keys.toSeq())
var newWords : Table[string, seq[string]]
for (language, words) in newWordsTable.pairs():
newWords[language] = words.toSeq()
for (language, words) in a.pairs():
block mutationProcess:
let newWords = toSeq createRandomWords((sizeOfMutation-1)*5, @[language])[language]
var wordCounter = 0
#Words from most inaccurate to least
let sort = words.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])).reversed()
var wordsTotality = toHashSet sort.map(x=> x[0])
var resultingWords = sort.map(x=> x[0])
for x in 0 .. sizeOfMutation-1:
var wordCounter = 0
let overOneHundread = sort.filter(x=> x[1] >= 400)
var replacePositions : seq[int]
if overOneHundread.high != -1:
for x in 0 .. sort.high:
if sort[x][1] >= 500000.0:
replacePositions.add(x)
else:
for x in 0 .. sizeOfMutation:
replacePositions.add(x)
echo language
echo replacePositions
echo sort
for pos in replacePositions:
while true:
if wordCounter == newWords.high:
if wordCounter == newWords[language].len()-1:
result[language] = wordsTotality
break mutationProcess
let randomWord = newWords[wordCounter]
let randomWord = newWords[language][wordCounter]
wordCounter+=1
if randomWord notin wordsTotality:
resultingWords[x] = randomWord
resultingWords[pos] = randomWord
break
when defined(debug):
echo language
echo sort
echo ""
echo resultingWords
echo "==="
result[language] = toHashSet resultingWords
proc wordsAndScoreToGene(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
result.wordsGene = a
result.scoreGene = b
proc assembleNewGeneration(a : seq[MacroScore]) : seq[Table[string, HashSet[string]]] =
let scores = a.map(x=>(x, tallyScores x)).sorted((a,b) => cmp(a[1], b[1])).map(x=>x[0])
let half = floordiv(a.len, 2)
let survivors =
if a.len >= 10:
scores[0 .. half]
else:
scores
let survivors = scores
var genes = collect(for x in survivors: wordsAndScoreToGene(x.words, x.scores))
let percentages = genes.map(x=> createWordAccuracyTable(x))
#echo percentages
var compositeOrganismMutator = initTable[string, Table[string, float]]()
var compositeOrganismWords = initTable[string, HashSet[string]]()
@ -196,9 +213,12 @@ proc generationThread(generationStart : seq[MacroScore]) =
let gen = insertGeneration(trainingDb[], generation)
for vocab in vocabExists:
dbChannel.send((vocab, gen))
var newGeneration = assembleNewGeneration macroScores
let sumOfBest = createBestModel(trainingDb[])
var newGeneration = assembleNewGeneration (macroScores & sumOfBest)
var threads = newSeq[Thread[Table[string, HashSet[string]]]](newGeneration.len)
for i in 0 .. newGeneration.high:
doAssert(newGeneration[i].values.toSeq().all(x=> x.len == wordCount))
createThread(threads[i], scoreAndSend, newGeneration[i])
var generation = Generation()
@ -209,4 +229,4 @@ proc generationThread(generationStart : seq[MacroScore]) =
var thread : Thread[void]
createThread(thread, dbThread)
generationThread deserializeGeneration(trainingDb[], 1)
generationThread deserializeGeneration(trainingDb[], 2)

View file

@ -1,5 +1,5 @@
import ./main
import tiny_sqlite
import db_connector/db_sqlite
import sequtils
import sugar
import tables
@ -107,23 +107,27 @@ proc makeMacroScore*(a : Table[string, Score], words : Table[string, HashSet[str
result.words = words
result.wordCount = words.values.toSeq()[0].len()
for (key,val) in a.pairs:
if val.language in @["kn", "he", "ary", "yi", "ka", "gu", "ckb", "ta"]:
continue
result.faliureRates.push(val.faliureRate)
result.totalWordUtilizations.push(val.totalWordUtilization)
result.utlizationPerWords.push(val.utlizationPerWord)
result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
result.percentGoodUtilizations.push(val.percentGoodUtilization)
result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
result.badUtilizationPerWords.push(val.badUtilizationPerWord)
result.percentBadUtilizations.push(val.percentBadUtilization)
result.usedWordPercentsBad.push(val.usedWordPercentBad)
result.usedWordPercentsGood.push(val.usedWordPercentGood)
result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
result.badUtilizationPerWords.push(val.badUtilizationPerWord)
proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) : MacroScore {.gcsafe.} =
let db = openDatabase("../data/testing/testingData.db")
db.exec("PRAGMA read_uncommitted = ON;")
db.exec("PRAGMA synchronous = NORMAL")
let db = open("../data/testing/testingData.db", "", "", "")
db.exec(sql"PRAGMA read_uncommitted = ON;")
db.exec(sql"PRAGMA synchronous = NORMAL")
var langToAccuracy = initTable[string, Accuracy]()
let wordCount = words.values.toSeq()[0].len()
var languages : seq[string]
@ -136,14 +140,13 @@ proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) :
langToAccuracy[lang].wordCount = wordCount
languages.add lang
echo "2"
for row in db.iterate("select Lang, Sample, Rowid from TrainingData"):
let values = row.values
let correctLanguage = values[0].strVal
let sample = values[1].strval
for row in db.fastrows(sql"select Lang, Sample, Rowid from TrainingData"):
let correctLanguage = row[0]
let sample = row[1]
var wordCounts = newTable[string, CountTable[string]]()
if beFast:
if correctLanguage in fastCounter:
if fastCounter[correctLanguage] == 300:
if fastCounter[correctLanguage] == 500:
continue
fastCounter.inc(correctLanguage)
if correctLanguage notin languages: