Restructure needed for genetic training, but here are some bug fixes
This commit is contained in:
parent
57463e47bf
commit
a19a5d249c
5 changed files with 154 additions and 69 deletions
BIN
nim/geneticTools
Executable file
BIN
nim/geneticTools
Executable file
Binary file not shown.
|
@ -12,6 +12,8 @@ import sets
|
|||
import std/enumerate
|
||||
import nimSHA2
|
||||
import ./main
|
||||
import strformat
|
||||
import math
|
||||
|
||||
type
|
||||
GeneticBase* = object
|
||||
|
@ -21,13 +23,17 @@ type
|
|||
startTime* : int64
|
||||
endTime* : int64
|
||||
|
||||
proc wordsAndScoreToGene*(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
  ## Packs a vocabulary table (`a`) and a score table (`b`) into one
  ## `GeneticBase`. Every other field of the object keeps its
  ## zero-initialised default.
  GeneticBase(wordsGene: a, scoreGene: b)
|
||||
|
||||
proc tallyScores*(a : MacroScore) : float =
  ## Collapses the running statistics of a `MacroScore` into one number:
  ## 80% failure-rate term plus 20% word-utilization term.
  ##
  ## NOTE(review): if `percentGoodUtilizations.mean` or
  ## `usedWordPercentsGood.mean` is 0 the divisions below yield `Inf`/`NaN`
  ## (float division does not raise) -- confirm callers never pass such a
  ## score, or decide on a fallback.

  # Biggest portion, how successful it is: mean failure rate plus a third
  # of the worst failure rate.
  let successScore = a.faliureRates.mean + (a.faliureRates.max / 3)

  # Bad-utilization penalty: the two means plus the worst bad-utilization value.
  var wordScore = a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean + a.percentBadUtilizations.max

  # Scale by the inverse of each "good" mean divided by 100 (presumably a
  # percentage turned into a fraction -- confirm), so higher good usage
  # shrinks the penalty. Both factors use the same 1 / (mean / 100) form;
  # the second one used to be written `(1 / (mean) / 100)`, which is
  # 1 / (100 * mean).
  wordScore *= 1 / (a.percentGoodUtilizations.mean / 100)
  wordScore *= 1 / (a.usedWordPercentsGood.mean / 100)

  result = (successScore*0.80)+(wordScore*0.20)
|
||||
|
||||
|
||||
|
@ -135,10 +141,9 @@ proc vocabAlreadyExists*(trainingDb : DbConn, a : Table[string, HashSet[string]]
|
|||
proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId : int) =
|
||||
if vocabAlreadyExists(trainingDb, a.words):
|
||||
return
|
||||
echo (vocabAlreadyExists(trainingDb, a.words))
|
||||
let sha256 = toHex $computeSHA256(wordsToString(a.words))
|
||||
let vocabStmt = "INSERT INTO WordSums(Sha256) VALUES (?)"
|
||||
trainingdb.exec(vocabStmt, sha256)
|
||||
trainingdb.exec(vocabStmt, $sha256)
|
||||
|
||||
# trainingDb.exec("PRAGMA synchronous = EXTRA")
|
||||
# trainingDb.exec("PRAGMA journal_mode = WAL")
|
||||
|
@ -168,55 +173,62 @@ proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId :
|
|||
serializeScore(trainingdb, val, macroDbId, score)
|
||||
|
||||
proc createWordAccuracyTable*(a : GeneticBase) : Table[string, Table[string, float]] =
|
||||
proc cubic(x : float) : float =
|
||||
let h = -0.3
|
||||
let a = 2.5
|
||||
let k = 0.0
|
||||
return a*(x-h)^3+k
|
||||
for (language, score) in a.scoreGene.pairs:
|
||||
|
||||
let accuracy = score.accuracy
|
||||
var iTable = accuracy.incorrectWordCounts
|
||||
var cTable = accuracy.correctWordCounts
|
||||
cTable.merge(accuracy.incorrectWordCounts)
|
||||
let iTable = accuracy.incorrectWordCounts
|
||||
let cTable = accuracy.correctWordCounts
|
||||
|
||||
var totalTable = accuracy.correctWordCounts
|
||||
totalTable.merge(accuracy.incorrectWordCounts)
|
||||
|
||||
if totalTable.values.toSeq().high == -1: continue
|
||||
let totalFound = totalTable.values.toSeq().foldl(a+b)
|
||||
|
||||
result[language] = initTable[string, float]()
|
||||
|
||||
let weight = 1.0
|
||||
for word in a.wordsGene[language]:
|
||||
let percent =
|
||||
#if its never been found, its useless, so its 100% wrong
|
||||
if word notin cTable:
|
||||
100.0
|
||||
500000.0
|
||||
#if its not found in iTable that means its 100% right
|
||||
elif word notin iTable and word notin cTable:
|
||||
0.0
|
||||
elif word notin iTable:
|
||||
let step1 = weight
|
||||
let normalizationFactor = cubic(1-((cTable[word] / totalFound)+0.1))
|
||||
step1*normalizationFactor
|
||||
else:
|
||||
(accuracy.incorrectWordCounts[word] / cTable[word])*100
|
||||
result[language][word] = percent
|
||||
let step1 = (iTable[word] / totalTable[word])*100
|
||||
let normalizationFactor = cubic((1-(cTable[word] / totalFound)+0.1))
|
||||
step1*normalizationFactor
|
||||
|
||||
result[language][word] = percent
|
||||
proc insertGeneration*(db : DbConn, a : Generation) : int64 =
  ## Stores the start and end time of generation `a` in the `Generation`
  ## table and returns the rowid of the inserted row.
  const insertSql = """INSERT INTO Generation(TimeStarted, TimeEnded) VALUES(?, ?);"""
  db.exec(insertSql, a.startTime, a.endTime)
  result = db.lastInsertRowId()
|
||||
|
||||
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
|
||||
let data = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))
|
||||
let languages = data.map(x=>x[0]).deduplicate()
|
||||
let accuracies = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))
|
||||
proc setupAccuracies(countTables : seq[(string, string, string, int64)], accuracies : seq[(string, int64, int64, int64)]) : (Table[string, Accuracy], Table[string, HashSet[string]]) =
|
||||
let languages = toHashSet accuracies.map(x=>x[0])
|
||||
|
||||
let words = db.iterate("select Language, StrKey from Words where MacroScoreId = ?", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval))
|
||||
var dictonary = initTable[string, HashSet[string]]()
|
||||
for (lang, word) in words:
|
||||
if lang notin dictonary:
|
||||
dictonary[lang] = initHashSet[string]()
|
||||
dictonary[lang].incl(word)
|
||||
var accuracy : Table[string, Accuracy]
|
||||
for lang in languages:
|
||||
var incorrectWords = initCountTable[string]()
|
||||
var LanguagesConfusedFor = initCountTable[string]()
|
||||
var correctWords = initCountTable[string]()
|
||||
for (language, typeOfTable, key, val) in data:
|
||||
if typeOfTable == "LanguagesConfusedFor":
|
||||
LanguagesConfusedFor.inc(key, val)
|
||||
elif typeOfTable == "incorrectWords":
|
||||
incorrectWords.inc(key, val)
|
||||
elif typeOfTable == "correctWords":
|
||||
correctWords.inc(key, val)
|
||||
var dictonary : HashSet[string]
|
||||
for (language, typeOfTable, key, val) in countTables:
|
||||
if language == lang:
|
||||
dictonary.incl(key)
|
||||
if typeOfTable == "LanguagesConfusedFor":
|
||||
LanguagesConfusedFor.inc(key, val)
|
||||
elif typeOfTable == "incorrectWords":
|
||||
incorrectWords.inc(key, val)
|
||||
elif typeOfTable == "correctWords":
|
||||
correctWords.inc(key, val)
|
||||
var score : (string, int64, int64, int64)
|
||||
for scores in accuracies:
|
||||
if lang == scores[0]:
|
||||
|
@ -235,11 +247,61 @@ proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) :
|
|||
newAcc.languagesConfusedFor = LanguagesConfusedFor
|
||||
newAcc.correctWordCounts = correctWords
|
||||
newAcc.incorrectWordCounts = incorrectWords
|
||||
accuracy[lang] = newAcc
|
||||
return makeMacroScore(score(accuracy), dictonary)
|
||||
result[0][lang] = newAcc
|
||||
result[1][lang] = dictonary
|
||||
|
||||
proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
  ## Rebuilds the `MacroScore` stored under `MacroScoreId`.
  ##
  ## Reads the `CountTable` rows and the `Accuracy` rows for that id, hands
  ## both to `setupAccuracies`, and builds the result with `makeMacroScore`.
  ## `wordCount` is currently unused; it is kept so existing callers keep
  ## compiling.
  ##
  ## NOTE(review): the `Words` table is not consulted here (the previous
  ## query result was never used); the vocabulary comes from whatever
  ## `setupAccuracies` returns -- confirm that is the intended source.

  # (Language, Type, StrKey, IntVal) for every count-table entry of this score.
  let data = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))

  # (Language, Correct, Incorrect, Faliures) -- the tuple order setupAccuracies receives.
  let accuracies = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))

  let (accuracy, dictonary) = setupAccuracies(data, accuracies)

  result = makeMacroScore(score(accuracy), dictonary)
|
||||
|
||||
proc deserializeGeneration*(db : DbConn, generationNumber : int64) : seq[MacroScore] =
  ## Returns one deserialized `MacroScore` for every `MacroScore` row that
  ## belongs to generation `generationNumber` and has `IsPopulated = 0`.
  const query = "select Rowid, WordCount from MacroScore where generation = ? and IsPopulated = 0;"
  # Collect every (rowid, word count) pair first, then load the scores one by one.
  let idsAndCounts = db.iterate(query, generationNumber).toSeq().mapIt((it[0].intval, it[1].intval))
  for (rowId, count) in idsAndCounts:
    result.add(deserializeMacroScore(db, rowId, count))
|
||||
|
||||
proc createBestModel*(db : DbConn) : MacroScore =
  ## Builds a composite `MacroScore` out of the best stored result per language.
  ##
  ## Steps:
  ## 1. (Re)create the temporary table `tmptable`, mapping each language in
  ##    `score` to the `macroscoreid` with the lowest `faliurerate` for it.
  ## 2. Fetch the `accuracy` row of that macro score for each language.
  ## 3. Fetch the `counttable` rows of that macro score for each language.
  ## 4. Feed both to `setupAccuracies` and build the result with
  ##    `makeMacroScore`.
  db.execScript("""
    DROP TABLE IF EXISTS tmptable;
    CREATE temporary TABLE tmptable AS
    SELECT DISTINCT( s.language ),
           (SELECT macroscoreid
            FROM score AS s1
            WHERE s1.language = s.language
            ORDER BY faliurerate ASC
            LIMIT 1) AS macroscoreidbest
    FROM score AS s;
  """)
  # Diagnostic dump only; kept out of normal builds like the other
  # `when defined(debug)` output in this project.
  when defined(debug):
    echo db.iterate("""select language, macroscoreidbest from tmptable""").toSeq().map(x=> (x[0].strval, x[1].intval))

  # Column order matters: setupAccuracies receives the same
  # (language, correct, incorrect, failures) tuple that deserializeMacroScore
  # builds. The query previously selected Faliures before incorrect, which
  # swapped the two counters.
  let languageToPerformance = db.iterate("""
    SELECT tmptable.language,
           macroscoreidbest,
           correct,
           incorrect,
           Faliures
    FROM tmptable
    JOIN accuracy
      ON accuracy.macroscoreid = macroscoreidbest
     AND accuracy.language = tmptable.language;
  """).toSeq().map(x=> (x[0].strval, x[2].intval, x[3].intval, x[4].intval))
  when defined(debug):
    echo "step2!"

  # (language, type, strkey, intval); column 1 (macroscoreidbest) is skipped.
  let languageToCounttable = db.iterate("""
    SELECT tmptable.language,
           macroscoreidbest,
           type,
           strkey,
           intval
    FROM tmptable
    JOIN counttable
      ON counttable.macroscoreid = macroscoreidbest
     AND counttable.language = tmptable.language;
  """).toSeq().map(x=> (x[0].strval, x[2].strval, x[3].strval, x[4].intval))

  let (accuracy, dictionary) = setupAccuracies(languageToCounttable, languageToPerformance)
  return makeMacroScore(score(accuracy), dictionary)
|
||||
|
|
BIN
nim/geneticTraining
Executable file
BIN
nim/geneticTraining
Executable file
Binary file not shown.
|
@ -37,13 +37,14 @@ let wordsDatabase = openDatabase("../data/words/words.db")
|
|||
optimizeDB wordsDatabase
|
||||
var languages = wordsDatabase.iterate("select distinct(Language) from Words;").toSeq().map(x=> x[0].strval).filter(x=> x in main.statistics)
|
||||
|
||||
|
||||
var dbChannel : Channel[(MacroScore, int64)]
|
||||
dbChannel.open()
|
||||
let trainingDb = createShared(DbConn, sizeof(DbConn))
|
||||
trainingDb[] = openDatabase("../data/training/training.db")
|
||||
optimizeDB trainingDb[]
|
||||
|
||||
|
||||
|
||||
var macroScoreChannel : Channel[MacroScore]
|
||||
macroScoreChannel.open()
|
||||
|
||||
|
@ -57,11 +58,11 @@ proc genRandomNumber(max : int) : int =
|
|||
let highMidRange = int(maxFloaty*0.10)
|
||||
|
||||
let wordRange = rand(0 .. 10)
|
||||
if wordRange in 0..3:
|
||||
if wordRange in 0..1:
|
||||
return rand(0..lowestRange)
|
||||
elif wordRange in 4..6:
|
||||
elif wordRange in 2..4:
|
||||
return rand(0..lowMidRange)
|
||||
elif wordRange in 7..8:
|
||||
elif wordRange in 5..8:
|
||||
return rand(0..highMidRange)
|
||||
elif wordRange in 9..10:
|
||||
return rand(0..max)
|
||||
|
@ -86,52 +87,68 @@ proc createRandomWords(amountWanted : int, languages : seq[string]) : Table[stri
|
|||
var retry = 0
|
||||
if word.high >= 4:
|
||||
if rand(0 .. 5) != 5:
|
||||
let endy = rand(2 .. word.high)
|
||||
let start = rand(0 .. endy-1)
|
||||
let endy = rand(3 .. word.high)
|
||||
let start = rand(0 .. endy-2)
|
||||
wordToAdd = $word[start .. endy]
|
||||
baseResult.incl(wordToAdd)
|
||||
result[lang] = baseResult
|
||||
|
||||
proc mutateWords(a : Table[string, Table[string, float]]) : Table[string, HashSet[string]]=
|
||||
let sizeOfMutation = 7
|
||||
let newWordsTable = createRandomWords((sizeOfMutation-1)*5, a.keys.toSeq())
|
||||
var newWords : Table[string, seq[string]]
|
||||
for (language, words) in newWordsTable.pairs():
|
||||
newWords[language] = words.toSeq()
|
||||
|
||||
for (language, words) in a.pairs():
|
||||
block mutationProcess:
|
||||
let newWords = toSeq createRandomWords((sizeOfMutation-1)*5, @[language])[language]
|
||||
var wordCounter = 0
|
||||
|
||||
#Words most innacurate to lesat
|
||||
let sort = words.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])).reversed()
|
||||
var wordsTotality = toHashSet sort.map(x=> x[0])
|
||||
var resultingWords = sort.map(x=> x[0])
|
||||
for x in 0 .. sizeOfMutation-1:
|
||||
var wordCounter = 0
|
||||
|
||||
let overOneHundread = sort.filter(x=> x[1] >= 400)
|
||||
var replacePositions : seq[int]
|
||||
if overOneHundread.high != -1:
|
||||
for x in 0 .. sort.high:
|
||||
if sort[x][1] >= 500000.0:
|
||||
replacePositions.add(x)
|
||||
else:
|
||||
for x in 0 .. sizeOfMutation:
|
||||
replacePositions.add(x)
|
||||
echo language
|
||||
echo replacePositions
|
||||
echo sort
|
||||
for pos in replacePositions:
|
||||
while true:
|
||||
if wordCounter == newWords.high:
|
||||
if wordCounter == newWords[language].len()-1:
|
||||
result[language] = wordsTotality
|
||||
break mutationProcess
|
||||
let randomWord = newWords[wordCounter]
|
||||
let randomWord = newWords[language][wordCounter]
|
||||
wordCounter+=1
|
||||
if randomWord notin wordsTotality:
|
||||
resultingWords[x] = randomWord
|
||||
resultingWords[pos] = randomWord
|
||||
break
|
||||
when defined(debug):
|
||||
echo language
|
||||
echo sort
|
||||
echo ""
|
||||
echo resultingWords
|
||||
echo "==="
|
||||
result[language] = toHashSet resultingWords
|
||||
|
||||
|
||||
proc wordsAndScoreToGene(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
|
||||
result.wordsGene = a
|
||||
result.scoreGene = b
|
||||
|
||||
proc assembleNewGeneration(a : seq[MacroScore]) : seq[Table[string, HashSet[string]]] =
|
||||
let scores = a.map(x=>(x, tallyScores x)).sorted((a,b) => cmp(a[1], b[1])).map(x=>x[0])
|
||||
let half = floordiv(a.len, 2)
|
||||
let survivors =
|
||||
if a.len >= 10:
|
||||
scores[0 .. half]
|
||||
else:
|
||||
scores
|
||||
let survivors = scores
|
||||
|
||||
var genes = collect(for x in survivors: wordsAndScoreToGene(x.words, x.scores))
|
||||
let percentages = genes.map(x=> createWordAccuracyTable(x))
|
||||
|
||||
#echo percentages
|
||||
var compositeOrganismMutator = initTable[string, Table[string, float]]()
|
||||
var compositeOrganismWords = initTable[string, HashSet[string]]()
|
||||
|
||||
|
@ -196,9 +213,12 @@ proc generationThread(generationStart : seq[MacroScore]) =
|
|||
let gen = insertGeneration(trainingDb[], generation)
|
||||
for vocab in vocabExists:
|
||||
dbChannel.send((vocab, gen))
|
||||
var newGeneration = assembleNewGeneration macroScores
|
||||
let sumOfBest = createBestModel(trainingDb[])
|
||||
var newGeneration = assembleNewGeneration (macroScores & sumOfBest)
|
||||
var threads = newSeq[Thread[Table[string, HashSet[string]]]](newGeneration.len)
|
||||
for i in 0 .. newGeneration.high:
|
||||
doAssert(newGeneration[i].values.toSeq().all(x=> x.len == wordCount))
|
||||
|
||||
createThread(threads[i], scoreAndSend, newGeneration[i])
|
||||
var generation = Generation()
|
||||
|
||||
|
@ -209,4 +229,4 @@ proc generationThread(generationStart : seq[MacroScore]) =
|
|||
|
||||
var thread : Thread[void]
|
||||
createThread(thread, dbThread)
|
||||
generationThread deserializeGeneration(trainingDb[], 1)
|
||||
generationThread deserializeGeneration(trainingDb[], 2)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import ./main
|
||||
import tiny_sqlite
|
||||
import db_connector/db_sqlite
|
||||
import sequtils
|
||||
import sugar
|
||||
import tables
|
||||
|
@ -107,23 +107,27 @@ proc makeMacroScore*(a : Table[string, Score], words : Table[string, HashSet[str
|
|||
result.words = words
|
||||
result.wordCount = words.values.toSeq()[0].len()
|
||||
for (key,val) in a.pairs:
|
||||
if val.language in @["kn", "he", "ary", "yi", "ka", "gu", "ckb", "ta"]:
|
||||
continue
|
||||
result.faliureRates.push(val.faliureRate)
|
||||
result.totalWordUtilizations.push(val.totalWordUtilization)
|
||||
result.utlizationPerWords.push(val.utlizationPerWord)
|
||||
result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
|
||||
result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
|
||||
result.percentGoodUtilizations.push(val.percentGoodUtilization)
|
||||
result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
|
||||
result.badUtilizationPerWords.push(val.badUtilizationPerWord)
|
||||
result.percentBadUtilizations.push(val.percentBadUtilization)
|
||||
result.usedWordPercentsBad.push(val.usedWordPercentBad)
|
||||
result.usedWordPercentsGood.push(val.usedWordPercentGood)
|
||||
|
||||
result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
|
||||
result.badUtilizationPerWords.push(val.badUtilizationPerWord)
|
||||
|
||||
|
||||
|
||||
proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) : MacroScore {.gcsafe.} =
|
||||
let db = openDatabase("../data/testing/testingData.db")
|
||||
db.exec("PRAGMA read_uncommitted = ON;")
|
||||
db.exec("PRAGMA synchronous = NORMAL")
|
||||
let db = open("../data/testing/testingData.db", "", "", "")
|
||||
db.exec(sql"PRAGMA read_uncommitted = ON;")
|
||||
db.exec(sql"PRAGMA synchronous = NORMAL")
|
||||
var langToAccuracy = initTable[string, Accuracy]()
|
||||
let wordCount = words.values.toSeq()[0].len()
|
||||
var languages : seq[string]
|
||||
|
@ -136,14 +140,13 @@ proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) :
|
|||
langToAccuracy[lang].wordCount = wordCount
|
||||
languages.add lang
|
||||
echo "2"
|
||||
for row in db.iterate("select Lang, Sample, Rowid from TrainingData"):
|
||||
let values = row.values
|
||||
let correctLanguage = values[0].strVal
|
||||
let sample = values[1].strval
|
||||
for row in db.fastrows(sql"select Lang, Sample, Rowid from TrainingData"):
|
||||
let correctLanguage = row[0]
|
||||
let sample = row[1]
|
||||
var wordCounts = newTable[string, CountTable[string]]()
|
||||
if beFast:
|
||||
if correctLanguage in fastCounter:
|
||||
if fastCounter[correctLanguage] == 300:
|
||||
if fastCounter[correctLanguage] == 500:
|
||||
continue
|
||||
fastCounter.inc(correctLanguage)
|
||||
if correctLanguage notin languages:
|
||||
|
|
Loading…
Reference in a new issue