Zipfs-Law-Language-Detector/nim/geneticTraining.nim
2024-10-10 22:37:54 -04:00

408 lines
12 KiB
Nim

import os
import math
import algorithm
import tiny_sqlite
import random
import sequtils
import sugar
import sets
import tables
import strformat
import strutils
import unicode
import locks
import times
import common
import std/typedthreads
import ./main
import ./scoring
import ./geneticTools
import std/enumerate
import json
randomize()
type WordScore = object
  ## Per-word statistics accumulated while scoring candidate words
  ## against language samples (populated in `trainWords`).
  word : string                 # the (possibly truncated) word being scored
  language : string             # language the word was originally sampled from
  occurrence : int              # total occurrences summed across all languages
  occurrencePerSample : float   # occurrence / samples (0.0 when no occurrences)
  isolationPercentage : float   # % of occurrences that fall in `language`
  score : float                 # composite ranking score (see trainWords)
  stage : Stage                 # training stage this score was produced in
  data : CountTable[string]     # raw per-key occurrence counts from
                                # createWordLanguageOccurrences — presumably
                                # keyed by language; confirm in scoring module
  samples : int                 # number of samples the word was tested against
proc optimizeDB(a : DbConn) =
  ## Apply connection-level PRAGMAs that trade durability for write
  ## throughput (WAL journal, relaxed sync, in-memory temp storage).
  const pragmas = [
    "PRAGMA synchronous = NORMAL",
    "PRAGMA journal_mode = WAL",
    "PRAGMA foreign_keys = OFF",
    "pragma journal_size_limit = 6144000;",
    "PRAGMA temp_store = MEMORY;",
    "PRAGMA cache_size = 10000;"
  ]
  for statement in pragmas:
    a.exec(statement)
# --- Shared module state --------------------------------------------------
# Read-only word-frequency database used to sample candidate words.
let wordsDatabase = openDatabase("../data/words/words.db")
optimizeDB wordsDatabase
# Languages present in the word DB, restricted to those with statistics.
var languages = wordsDatabase.iterate("select distinct(Language) from Words;").toSeq().map(x=> x[0].strval).filter(x=> x in main.statistics)
# Channel carrying (score, generation id) pairs to a DB-writer consumer
# (the consumer, dbThread, is currently disabled in the commented block below).
var dbChannel : Channel[(MacroScore, int64)]
dbChannel.open()
# Training DB handle placed in shared memory so threads can reach it;
# access is serialized through `lock`.
let trainingDb = createShared(DbConn, sizeof(DbConn))
trainingDb[] = openDatabase("../data/training/training.db")
optimizeDB trainingDb[]
# Channel collecting MacroScore results from scoring threads (also only
# used by the disabled genetic-training code below).
var macroScoreChannel : Channel[MacroScore]
macroScoreChannel.open()
# Guards trainingDb across threads.
var lock : Lock
initLock(lock)
proc genRandomNumber(max : int) : int =
  ## Frequency-biased random index in 0..max.  Most draws land near the
  ## low (most frequent) end of the range: ~1/11 of draws come from the
  ## top 0.5%, 3/11 from the top 2%, 5/11 from the top 10%, and the
  ## remaining 2/11 from anywhere in 0..max.
  let upper = float(max)
  let tinyBand = int(upper*0.005)
  let smallBand = int(upper*0.02)
  let mediumBand = int(upper*0.10)
  case rand(0 .. 10)
  of 0:
    result = rand(0..tinyBand)
  of 1, 2, 3:
    result = rand(0..smallBand)
  of 4 .. 8:
    result = rand(0..mediumBand)
  else:
    result = rand(0..max)
proc createRandomWords(amountWanted : int, languages : seq[string], blacklist : seq[string] = @[], wordToRoot : TableRef[string, string] = nil) : Table[string, HashSet[string]] =
  ## Sample `amountWanted` distinct random words (or word fragments) per
  ## language from the 10000 most frequent multi-character words in
  ## `wordsDatabase`.  Words in `blacklist`, and words already picked for
  ## any language, are skipped.  When `wordToRoot` is given, each chosen
  ## (possibly truncated) word is mapped back to the full word it came from.
  ## Languages that yield no words are omitted from the result.
  let query = """
select Word from Words
where language = ? and LENGTH(word) != 1
order by occurrences desc limit 10000;
"""
  var totalWords : HashSet[string]    # every word chosen so far, across languages
  for lang in languages:
    var breaking = 0                  # blacklist-rejection counter; bail-out at 1000
    var baseResult : HashSet[string]  # words selected for this language
    let words = wordsDatabase.iterate(query, lang).toSeq()
    while true:
      if baseResult.len() == amountWanted:
        break
      # Frequency-biased index: genRandomNumber favours the top of the list.
      let randonNumber = genRandomNumber(words.high)
      var word = words[randonNumber][0].strval.toRunes().map(x=>x.toLower())
      var wordToAdd = $word
      var retry = 0  # NOTE(review): never read — dead variable
      # Words of 8+ runes are truncated to a random slice 5 times out of 6,
      # so fragments/roots get sampled as well as whole words.
      if word.high >= 7:
        if rand(0 .. 5) != 5:
          let endy = rand(3 .. word.high)
          let start = rand(0 .. endy-2)
          wordToAdd = $word[start .. endy]
      if wordToAdd in totalWords:
        # NOTE(review): this path bypasses the `breaking` counter below — if
        # every remaining candidate is already in totalWords the loop can
        # spin forever.
        continue
      if wordToAdd in blacklist or wordToAdd in totalWords:
        # (the second `totalWords` test is unreachable — handled just above)
        if breaking == 1000:
          echo lang
          break
        breaking+=1
        continue
      if wordToRoot != nil:
        wordToRoot[wordToAdd] = $word
      baseResult.incl(wordToAdd)
    # Skip languages that produced nothing rather than emitting empty sets.
    if baseresult.len == 0:
      continue
    totalWords.incl(baseResult)
    echo (lang, baseresult.len)
    result[lang] = baseResult
#[
proc mutateWords(a : Table[string, Table[string, float]]) : Table[string, HashSet[string]]=
let sizeOfMutation = 7
let newWordsTable = createRandomWords((sizeOfMutation-1)*5, a.keys.toSeq())
var newWords : Table[string, seq[string]]
for (language, words) in newWordsTable.pairs():
newWords[language] = words.toSeq()
for (language, words) in a.pairs():
block mutationProcess:
#Words most innacurate to lesat
let sort = words.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])).reversed()
var wordsTotality = toHashSet sort.map(x=> x[0])
var resultingWords = sort.map(x=> x[0])
var wordCounter = 0
let overOneHundread = sort.filter(x=> x[1] >= 400)
var replacePositions : seq[int]
if overOneHundread.high != -1:
for x in 0 .. sort.high:
if sort[x][1] >= 500000.0:
replacePositions.add(x)
else:
for x in 0 .. sizeOfMutation:
replacePositions.add(x)
echo language
echo replacePositions
echo sort
for pos in replacePositions:
while true:
if wordCounter == newWords[language].len()-1:
result[language] = wordsTotality
break mutationProcess
let randomWord = newWords[language][wordCounter]
wordCounter+=1
if randomWord notin wordsTotality:
resultingWords[pos] = randomWord
break
when defined(debug):
echo language
echo sort
echo ""
echo resultingWords
echo "==="
result[language] = toHashSet resultingWords
proc assembleNewGeneration(a : seq[MacroScore]) : seq[Table[string, HashSet[string]]] =
let scores = a.map(x=>(x, tallyScores x)).sorted((a,b) => cmp(a[1], b[1])).map(x=>x[0])
let half = floordiv(a.len, 2)
let survivors = scores
var genes = collect(for x in survivors: wordsAndScoreToGene(x.words, x.scores))
let percentages = genes.map(x=> createWordAccuracyTable(x))
#echo percentages
var compositeOrganismMutator = initTable[string, Table[string, float]]()
var compositeOrganismWords = initTable[string, HashSet[string]]()
for language in languages:
compositeOrganismMutator[language] = initTable[string, float]()
compositeOrganismWords[language] = initHashSet[string]()
var skipLanguage = false
if collect(for x in percentages: language notin x).any(x=>x):
let words = createRandomWords(wordCount, @[language])
for word in words[language]:
compositeOrganismMutator[language][word] = 0.0
compositeOrganismWords[language].incl(word)
skipLanguage = true
if skipLanguage:
continue
var composite = initTable[string, float]()
for gene in percentages:
for (key, val) in gene[language].pairs:
composite[key] = val
var count = 0
for (key, val) in composite.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])):
if count == wordCount: break
compositeOrganismMutator[language][key] = val
compositeOrganismWords[language].incl(key)
count+=1
let oneForth = floorDiv(generationSize, 4)
result.setLen(oneForth*3)
result[0] = compositeOrganismWords
for x in 1 .. result.high:
result[x] = mutateWords(compositeOrganismMutator)
for x in 0 .. oneForth:
result.add createRandomWords(wordCount, languages)
proc dbThread() {.gcsafe, thread.} =
while true:
let recved = dbChannel.recv()
withLock lock:
serializeMacroScore(trainingDb[], recved[0], recved[1])
proc scoreAndSend(words : Table[string, HashSet[string]]) {.gcsafe, thread} =
let result = createWordScore(words, true)
macroScoreChannel.send(result)
proc generationThread(generationStart : seq[MacroScore]) =
var macroScores = generationStart
while true:
var vocabExists : seq[MacroScore]
withLock lock:
vocabExists = macroScores.filter(x=> not vocabAlreadyExists(trainingDb[], x.words))
if vocabExists.high != -1:
let time = getTime().toUnix()
var generation = Generation()
generation.startTime = time
generation.endTime = time
let gen = insertGeneration(trainingDb[], generation)
for vocab in vocabExists:
dbChannel.send((vocab, gen))
let sumOfBest = createBestModel(trainingDb[])
var newGeneration = assembleNewGeneration (macroScores & sumOfBest)
var threads = newSeq[Thread[Table[string, HashSet[string]]]](newGeneration.len)
for i in 0 .. newGeneration.high:
doAssert(newGeneration[i].values.toSeq().all(x=> x.len == wordCount))
createThread(threads[i], scoreAndSend, newGeneration[i])
var generation = Generation()
joinThreads(threads)
macroScores = collect(for x in 0 .. macroScoreChannel.peek-1: macroScoreChannel.recv())
echo macroScores.map(x=> tallyScores x)
]#
proc serializeWordScores(words : seq[WordScore], rootWords : TableRef[string, string]) =
  ## Bulk-insert scored words into WordScore and their per-language counts
  ## into WordCountTable, linking the two via predicted WordScore row ids.
  # Current row count: the n-th inserted word is assumed to get row id
  # initial + n.
  # NOTE(review): this only holds while WordScore ids are the implicit
  # sequential rowid, rows are never deleted, and no other writer runs
  # concurrently — confirm against the schema and callers.
  let initial = trainingdb[].one("select count(*) from WordScore").get[0].intval
  var inserty : seq[seq[DbValue]]      # rows for the WordScore insert
  var offset = 1                       # 1-based position of the current word
  var counttables : seq[seq[DbValue]]  # rows for the WordCountTable insert
  for word in words:
    inserty.add toDbValues(
      word.word,
      rootWords[word.word],
      word.language,
      word.occurrence,
      word.occurrencePerSample,
      word.isolationPercentage,
      word.score,
      $word.stage,
      word.samples)
    # One WordCountTable row per entry in the word's count table.
    for (key,val) in word.data.pairs():
      counttables.add(toDbValues(key, val, offset+initial))
    offset+=1
  trainingdb[].execmany(
    """
INSERT INTO WordScore (
word,
rootword,
language,
occurrence,
occurrencePerSample,
isolationPercentage,
score,
stage,
samples
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
""", inserty)
  trainingdb[].execmany(
    """
INSERT INTO WordCountTable (
Language, Count, WordScoreId
) VALUES (?, ?, ?);
""", counttables)
func slope_adjustment(x : float64) : float64 =
  ## Exponential score modifier for an isolation percentage `x`:
  ## a·bˣ + c with a = 4.9, b = 1.03, c = -2.8.
  ## Yields 2.1 at x = 0 and grows exponentially as a word becomes more
  ## isolated to a single language.
  # Pure computation: promoted from `proc` to `func` (noSideEffect is
  # compiler-verified) and the fixed coefficients from `let` to `const`.
  const
    a = 4.9
    b = 1.03
    c = -2.8
  a * pow(b, x) + c
proc trainWords(words : Table[string, HashSet[string]], result : var seq[WordScore], manager : TableRef[string, WordScore] = nil, stage = First) =
  ## Score every word in `words` against language samples, retire the
  ## lower-scoring half into `result`, and recurse on the better half at
  ## the next Stage, carrying accumulated per-word statistics in `manager`.
  ## At the final stage (Forth) everything is merged into `result`, which
  ## ends up sorted by descending score.
  var wordTotality : HashSet[string]               # all words across languages
  var wordToLanguage = initTable[string, string]()
  for (language, words) in words.pairs():
    for w in words:
      wordToLanguage[w] = language
    wordTotality.incl(words)
  # `samples` is filled in by createWordLanguageOccurrences (passed as var) —
  # presumably the number of text samples scanned; confirm in scoring module.
  var samples = 0
  var wordsCatagorized = createWordLanguageOccurrences(wordTotality, toHashSet languages, stage, samples)
  var results : seq[WordScore]
  var i = 0  # NOTE(review): incremented but never read — dead counter
  for w in wordTotality:
    let language = wordToLanguage[w]
    if stage != First:
      # Later stages merge in counts accumulated during earlier stages.
      manager[w].samples+=samples
      wordsCatagorized[w].merge(manager[w].data)
    let pairs = wordsCatagorized[w].pairs().toSeq()
    # Total occurrences of `w` over every key in its count table.
    let sum =
      if pairs.high == -1:
        0
      else:
        pairs.map(x=> x[1]).foldl(a+b)
    # Share of occurrences that fall in the word's own language (0..100).
    let percentageIsolate =
      if sum == 0:
        0.0
      else:
        (wordsCatagorized[w][language] / sum)*100
    # Average occurrences per sample; guarded against the no-occurrence case.
    let perSample =
      if sum == 0:
        0.0
      else:
        sum / samples
    # Score = frequency term raised to an isolation-scaled exponent, then
    # multiplied by the exponential isolation bonus from slope_adjustment.
    let reduecedPower = float64(percentageIsolate/float64(10.0))*2
    let baseScore = pow((perSample/7)+1.0, reduecedPower)
    let modifier = slope_adjustment(percentageIsolate)
    let score = baseScore*modifier
    var wordScore = WordScore()
    wordScore.word = w
    wordScore.language = language
    wordScore.samples = samples
    wordScore.score = score
    wordScore.occurrence = sum
    wordScore.occurrencePerSample = perSample
    wordScore.isolationPercentage = percentageIsolate
    wordScore.data = wordsCatagorized[w]
    wordScore.stage = stage
    results.add(wordScore)
    i += 1
  # Sort descending by score (ascending sort, then reversed).
  results.sort((a,b) => cmp(a.score, b.score))
  results.reverse()
  if stage == Forth:
    # Final stage: keep everything, re-sorting the caller's accumulator.
    result = result & results
    result.sort((a,b) => cmp(a.score, b.score))
    result.reverse()
    return
  # First half (higher scores) advances; second half is final.
  let split = results.distribute(2)
  result = result & split[1]
  var nextResult = initTable[string, HashSet[string]]()
  var manager = newTable[string, WordScore]()  # NOTE(review): shadows the parameter
  for word in split[0]:
    if word.language notin nextResult:
      nextResult[word.language] = initHashSet[string]()
    nextResult[word.language].incl(word.word)
    manager[word.word] = word
  trainWords(nextResult, result, manager, Stage(ord(stage)+1))
proc generateWords(numberOfIterations : int) =
  ## Run `numberOfIterations` + 1 rounds of word sampling and scoring,
  ## persisting each round's WordScore rows to the training database.
  for round in 0 .. numberOfIterations:
    # Words already scored in earlier rounds are blacklisted so every
    # round samples fresh candidates.
    let alreadyScored = trainingdb[].iterate("select Word from WordScore ").toSeq().map(x=> x[0].strVal)
    var roots = newTable[string, string]()
    let candidates = createRandomWords(200, languages, alreadyScored, roots)
    var scored : seq[WordScore]
    trainWords(candidates, scored)
    echo scored.len
    echo scored.map(x=>x.word).deduplicate().len
    serializeWordScores(scored, roots)
# --- Entry point ----------------------------------------------------------
# Parses --iterations / --output_words, runs the word-generation rounds,
# then dumps the best per-language word lists to mostCommonWords.json.
var wordCount : int
var genSize : int
let cmdArgs = getParamSwitches()
if "--iterations" notin cmdArgs or "--output_words" notin cmdArgs:
  # Fix: report real usage instead of placeholder gibberish ("uhfkjlsudfh"),
  # and exit non-zero so callers/scripts can detect the misuse.
  echo "usage: geneticTraining --iterations=<n> --output_words=<n>"
  quit(1)
else:
  wordCount = parseInt(cmdArgs["--output_words"])
  genSize = parseInt(cmdArgs["--iterations"])
echo genSize
generateWords(genSize)
# Collapse the best-scoring words into per-language lists and serialize.
let result = createBestFromWordsBest(trainingDb[], wordCount)
var output = newJObject()
for (language, words) in result.pairs():
  output[language] = %* words.toSeq()
writeFile("../data/mostCommonWords.json", $output)