# (file-listing metadata from source viewer: 408 lines, 12 KiB, Nim)
import os
|
|
import math
|
|
import algorithm
|
|
import tiny_sqlite
|
|
import random
|
|
import sequtils
|
|
import sugar
|
|
import sets
|
|
import tables
|
|
import strformat
|
|
import strutils
|
|
import unicode
|
|
import locks
|
|
import times
|
|
import common
|
|
import std/typedthreads
|
|
import ./main
|
|
import ./scoring
|
|
import ./geneticTools
|
|
import std/enumerate
|
|
import json
|
|
|
|
# Seed the global RNG (std/random) from the current time so each run
# samples a different set of candidate words.
randomize()
|
|
|
|
type WordScore = object
  ## Scoring record for one candidate word in one language, produced and
  ## refined by the staged passes in `trainWords` and persisted by
  ## `serializeWordScores`.
  word : string                  # the scored word (possibly a substring of a longer root word)
  language : string              # language this word was sampled for
  occurrence : int               # total occurrences observed across all counted samples
  occurrencePerSample : float    # occurrence / samples (0.0 when no occurrences)
  isolationPercentage : float    # % of occurrences that fell in `language` (0.0 when none)
  score : float                  # combined score; see trainWords for the formula
  stage : Stage                  # training stage that produced this record (Stage comes from project code)
  data : CountTable[string]      # per-language occurrence counts (keys are languages; cf. WordCountTable insert)
  samples : int                  # number of samples examined when counting occurrences
|
proc optimizeDB(a : DbConn) =
  ## Applies per-connection SQLite PRAGMAs that trade durability for write
  ## throughput (WAL journal, relaxed sync, in-memory temp store, larger
  ## page cache, foreign keys off). Safe to call once per opened connection.
  const pragmas = [
    "PRAGMA synchronous = NORMAL",
    "PRAGMA journal_mode = WAL",
    "PRAGMA foreign_keys = OFF",
    "pragma journal_size_limit = 6144000;",
    "PRAGMA temp_store = MEMORY;",
    "PRAGMA cache_size = 10000;"
  ]
  # Execute in declaration order; journal_mode must follow synchronous as before.
  for statement in pragmas:
    a.exec(statement)
|
|
|
|
|
|
# Read-only source of candidate words, keyed by language.
let wordsDatabase = openDatabase("../data/words/words.db")
optimizeDB wordsDatabase
# Every language present in Words that also has statistics in ./main.
var languages = wordsDatabase.iterate("select distinct(Language) from Words;").toSeq().map(x=> x[0].strval).filter(x=> x in main.statistics)

# Channel feeding (MacroScore, generation id) pairs to the DB writer thread
# (the consumer lives in the commented-out section below).
var dbChannel : Channel[(MacroScore, int64)]
dbChannel.open()
# Connection allocated on the shared heap so multiple threads can reach it;
# all access is expected to be guarded by `lock`.
let trainingDb = createShared(DbConn, sizeof(DbConn))
trainingDb[] = openDatabase("../data/training/training.db")
optimizeDB trainingDb[]

# Channel collecting MacroScore results from scoring threads
# (used only by the commented-out genetic-algorithm code below).
var macroScoreChannel : Channel[MacroScore]
macroScoreChannel.open()

# Guards cross-thread use of trainingDb.
var lock : Lock
initLock(lock)
|
|
|
|
proc genRandomNumber(max : int) : int =
  ## Draws a random index in 0..max, strongly biased toward small values
  ## (i.e. toward the most frequent words when indexing a list sorted by
  ## descending occurrence count):
  ##   1/11 of draws fall in the lowest 0.5% of the range,
  ##   3/11 in the lowest 2%, 5/11 in the lowest 10%,
  ##   and 2/11 anywhere in 0..max.
  let span = float(max)
  # First roll picks the bucket, second roll picks the index inside it —
  # same two-rand() call sequence as before, so RNG state advances identically.
  let bucket = rand(0 .. 10)
  result =
    case bucket
    of 0:
      rand(0 .. int(span * 0.005))
    of 1, 2, 3:
      rand(0 .. int(span * 0.02))
    of 4 .. 8:
      rand(0 .. int(span * 0.10))
    else:
      rand(0 .. max)
|
|
|
|
|
|
proc createRandomWords(amountWanted : int, languages : seq[string], blacklist : seq[string] = @[], wordToRoot : TableRef[string, string] = nil) : Table[string, HashSet[string]] =
  ## For each language, samples `amountWanted` distinct words from the top
  ## 10000 most frequent multi-character words in the words database,
  ## biased toward the most frequent ones (see genRandomNumber). Long words
  ## (8+ runes) are usually replaced by a random substring of themselves.
  ## Words in `blacklist` or already chosen for a previous language are
  ## skipped. When `wordToRoot` is non-nil, it records the full root word
  ## each (possibly truncated) entry was derived from.
  ## A language whose pool is exhausted yields fewer words (after 1000
  ## consecutive-ish rejections) and is omitted entirely if it yields none.
  let query = """
  select Word from Words
  where language = ? and LENGTH(word) != 1
  order by occurrences desc limit 10000;
  """
  var totalWords : HashSet[string]
  for lang in languages:
    var rejected = 0
    var baseResult : HashSet[string]
    let words = wordsDatabase.iterate(query, lang).toSeq()
    while baseResult.len() != amountWanted:
      let randomNumber = genRandomNumber(words.high)
      # Lowercase at the rune level so the slicing below is Unicode-safe.
      var word = words[randomNumber][0].strval.toRunes().map(x => x.toLower())
      var wordToAdd = $word
      # For long words (8+ runes), take a random substring of at least
      # 2 runes 5 times out of 6 instead of the whole word.
      if word.high >= 7:
        if rand(0 .. 5) != 5:
          let endy = rand(3 .. word.high)
          let start = rand(0 .. endy - 2)
          wordToAdd = $word[start .. endy]
      # BUGFIX: the original code `continue`d on cross-language duplicates
      # without touching the give-up counter, and a same-language duplicate
      # was a silent no-op `incl` — either could spin this loop forever once
      # the candidate pool was exhausted. Count every rejection (blacklist,
      # duplicate across languages, duplicate within this language) toward
      # the 1000-rejection give-up limit instead.
      if wordToAdd in blacklist or wordToAdd in totalWords or wordToAdd in baseResult:
        if rejected == 1000:
          echo lang
          break
        rejected += 1
        continue
      if wordToRoot != nil:
        wordToRoot[wordToAdd] = $word
      baseResult.incl(wordToAdd)
    # Skip languages that produced nothing at all.
    if baseResult.len == 0:
      continue
    totalWords.incl(baseResult)
    echo (lang, baseResult.len)
    result[lang] = baseResult
|
|
#[
|
|
proc mutateWords(a : Table[string, Table[string, float]]) : Table[string, HashSet[string]]=
|
|
let sizeOfMutation = 7
|
|
let newWordsTable = createRandomWords((sizeOfMutation-1)*5, a.keys.toSeq())
|
|
var newWords : Table[string, seq[string]]
|
|
for (language, words) in newWordsTable.pairs():
|
|
newWords[language] = words.toSeq()
|
|
|
|
for (language, words) in a.pairs():
|
|
block mutationProcess:
|
|
|
|
# Words most inaccurate to least
|
|
let sort = words.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])).reversed()
|
|
var wordsTotality = toHashSet sort.map(x=> x[0])
|
|
var resultingWords = sort.map(x=> x[0])
|
|
var wordCounter = 0
|
|
|
|
let overOneHundread = sort.filter(x=> x[1] >= 400)
|
|
var replacePositions : seq[int]
|
|
if overOneHundread.high != -1:
|
|
for x in 0 .. sort.high:
|
|
if sort[x][1] >= 500000.0:
|
|
replacePositions.add(x)
|
|
else:
|
|
for x in 0 .. sizeOfMutation:
|
|
replacePositions.add(x)
|
|
echo language
|
|
echo replacePositions
|
|
echo sort
|
|
for pos in replacePositions:
|
|
while true:
|
|
if wordCounter == newWords[language].len()-1:
|
|
result[language] = wordsTotality
|
|
break mutationProcess
|
|
let randomWord = newWords[language][wordCounter]
|
|
wordCounter+=1
|
|
if randomWord notin wordsTotality:
|
|
resultingWords[pos] = randomWord
|
|
break
|
|
when defined(debug):
|
|
echo language
|
|
echo sort
|
|
echo ""
|
|
echo resultingWords
|
|
echo "==="
|
|
result[language] = toHashSet resultingWords
|
|
|
|
|
|
|
|
proc assembleNewGeneration(a : seq[MacroScore]) : seq[Table[string, HashSet[string]]] =
|
|
let scores = a.map(x=>(x, tallyScores x)).sorted((a,b) => cmp(a[1], b[1])).map(x=>x[0])
|
|
let half = floordiv(a.len, 2)
|
|
let survivors = scores
|
|
|
|
var genes = collect(for x in survivors: wordsAndScoreToGene(x.words, x.scores))
|
|
let percentages = genes.map(x=> createWordAccuracyTable(x))
|
|
#echo percentages
|
|
var compositeOrganismMutator = initTable[string, Table[string, float]]()
|
|
var compositeOrganismWords = initTable[string, HashSet[string]]()
|
|
|
|
for language in languages:
|
|
compositeOrganismMutator[language] = initTable[string, float]()
|
|
compositeOrganismWords[language] = initHashSet[string]()
|
|
|
|
var skipLanguage = false
|
|
if collect(for x in percentages: language notin x).any(x=>x):
|
|
let words = createRandomWords(wordCount, @[language])
|
|
for word in words[language]:
|
|
compositeOrganismMutator[language][word] = 0.0
|
|
compositeOrganismWords[language].incl(word)
|
|
skipLanguage = true
|
|
|
|
if skipLanguage:
|
|
continue
|
|
var composite = initTable[string, float]()
|
|
|
|
for gene in percentages:
|
|
for (key, val) in gene[language].pairs:
|
|
composite[key] = val
|
|
|
|
var count = 0
|
|
for (key, val) in composite.pairs.toSeq().sorted((a,b) => cmp(a[1], b[1])):
|
|
if count == wordCount: break
|
|
compositeOrganismMutator[language][key] = val
|
|
compositeOrganismWords[language].incl(key)
|
|
count+=1
|
|
|
|
let oneForth = floorDiv(generationSize, 4)
|
|
|
|
result.setLen(oneForth*3)
|
|
result[0] = compositeOrganismWords
|
|
for x in 1 .. result.high:
|
|
result[x] = mutateWords(compositeOrganismMutator)
|
|
for x in 0 .. oneForth:
|
|
result.add createRandomWords(wordCount, languages)
|
|
|
|
proc dbThread() {.gcsafe, thread.} =
|
|
while true:
|
|
let recved = dbChannel.recv()
|
|
withLock lock:
|
|
serializeMacroScore(trainingDb[], recved[0], recved[1])
|
|
|
|
proc scoreAndSend(words : Table[string, HashSet[string]]) {.gcsafe, thread} =
|
|
let result = createWordScore(words, true)
|
|
macroScoreChannel.send(result)
|
|
|
|
proc generationThread(generationStart : seq[MacroScore]) =
|
|
var macroScores = generationStart
|
|
|
|
while true:
|
|
var vocabExists : seq[MacroScore]
|
|
withLock lock:
|
|
vocabExists = macroScores.filter(x=> not vocabAlreadyExists(trainingDb[], x.words))
|
|
if vocabExists.high != -1:
|
|
let time = getTime().toUnix()
|
|
var generation = Generation()
|
|
generation.startTime = time
|
|
generation.endTime = time
|
|
let gen = insertGeneration(trainingDb[], generation)
|
|
for vocab in vocabExists:
|
|
dbChannel.send((vocab, gen))
|
|
let sumOfBest = createBestModel(trainingDb[])
|
|
var newGeneration = assembleNewGeneration (macroScores & sumOfBest)
|
|
var threads = newSeq[Thread[Table[string, HashSet[string]]]](newGeneration.len)
|
|
for i in 0 .. newGeneration.high:
|
|
doAssert(newGeneration[i].values.toSeq().all(x=> x.len == wordCount))
|
|
|
|
createThread(threads[i], scoreAndSend, newGeneration[i])
|
|
var generation = Generation()
|
|
|
|
joinThreads(threads)
|
|
macroScores = collect(for x in 0 .. macroScoreChannel.peek-1: macroScoreChannel.recv())
|
|
|
|
echo macroScores.map(x=> tallyScores x)
|
|
|
|
]#
|
|
|
|
proc serializeWordScores(words : seq[WordScore], rootWords : TableRef[string, string]) =
  ## Persists a batch of WordScore records into WordScore, and each record's
  ## per-language count table into WordCountTable, linked by WordScoreId.
  ## `rootWords` must map every word in `words` to its root (full) word.
  # NOTE(review): WordScoreId is predicted as count(*) + insertion offset,
  # which assumes the table's rowids are contiguous (no rows ever deleted)
  # and that no other writer inserts concurrently — confirm both hold.
  let initial = trainingdb[].one("select count(*) from WordScore").get[0].intval
  var inserty : seq[seq[DbValue]]
  var offset = 1
  var counttables : seq[seq[DbValue]]
  for word in words:
    # One WordScore row per record; parameter order matches the INSERT below.
    inserty.add toDbValues(
      word.word,
      rootWords[word.word],
      word.language,
      word.occurrence,
      word.occurrencePerSample,
      word.isolationPercentage,
      word.score,
      $word.stage,
      word.samples)
    # One WordCountTable row per (language, count) pair, pointing at the
    # WordScore row this record will occupy.
    for (key,val) in word.data.pairs():
      counttables.add(toDbValues(key, val, offset+initial))
    offset+=1

  trainingdb[].execmany(
    """
    INSERT INTO WordScore (
      word,
      rootword,
      language,
      occurrence,
      occurrencePerSample,
      isolationPercentage,
      score,
      stage,
      samples
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
    """, inserty)

  trainingdb[].execmany(
    """
    INSERT INTO WordCountTable (
      Language, Count, WordScoreId
    ) VALUES (?, ?, ?);
    """, counttables)
|
|
|
|
func slope_adjustment(x : float64) : float64 =
  ## Exponential score modifier: 4.9 * 1.03^x - 2.8.
  ## Over the expected input range (an isolation percentage, 0..100) this
  ## grows from 2.1 at x = 0 to roughly 91 at x = 100, rewarding words that
  ## occur mostly in their own language.
  const
    amplitude = 4.9
    growth = 1.03
    offset = -2.8
  result = amplitude * pow(growth, x) + offset
|
|
|
|
proc trainWords(words : Table[string, HashSet[string]], result : var seq[WordScore], manager : TableRef[string, WordScore] = nil, stage = First) =
  ## Scores every word in `words` against sampled text, appends the weaker
  ## half (by score) to `result`, and recurses on the stronger half at the
  ## next Stage, accumulating occurrence data via `manager`. At stage Forth
  ## the recursion stops and everything is merged into `result`, sorted by
  ## descending score.
  ## `manager` must be non-nil for every stage after First (the initial
  ## caller passes only `words` and `result`).
  var wordTotality : HashSet[string]
  var wordToLanguage = initTable[string, string]()
  # NOTE(review): loop variable `words` shadows the parameter `words` here.
  for (language, words) in words.pairs():
    for w in words:
      wordToLanguage[w] = language
    wordTotality.incl(words)
  var samples = 0
  # Presumably fills `samples` with the number of samples examined and
  # returns per-word, per-language occurrence counts — confirm against
  # createWordLanguageOccurrences in ./main or ./scoring.
  var wordsCatagorized = createWordLanguageOccurrences(wordTotality, toHashSet languages, stage, samples)

  var results : seq[WordScore]
  var i = 0  # NOTE(review): incremented below but never read

  for w in wordTotality:
    let language = wordToLanguage[w]
    # After the first stage, fold the counts accumulated in earlier stages
    # into this stage's counts so scores reflect all samples seen so far.
    if stage != First:
      manager[w].samples+=samples
      wordsCatagorized[w].merge(manager[w].data)
    let pairs = wordsCatagorized[w].pairs().toSeq()

    # Total occurrences of `w` across all languages (0 when never seen).
    let sum =
      if pairs.high == -1:
        0
      else:
        pairs.map(x=> x[1]).foldl(a+b)

    # Share of occurrences that fell in this word's own language, in percent.
    let percentageIsolate =
      if sum == 0:
        0.0
      else:
        (wordsCatagorized[w][language] / sum)*100
    # Mean occurrences per sample. NOTE(review): guarded on sum == 0, not
    # samples == 0 — assumes samples > 0 whenever sum > 0; confirm.
    let perSample =
      if sum == 0:
        0.0
      else:
        sum / samples

    # Score formula: (perSample/7 + 1) ^ (isolation%/5), scaled by the
    # exponential isolation modifier slope_adjustment.
    let reduecedPower = float64(percentageIsolate/float64(10.0))*2
    let baseScore = pow((perSample/7)+1.0, reduecedPower)
    let modifier = slope_adjustment(percentageIsolate)
    let score = baseScore*modifier
    var wordScore = WordScore()

    wordScore.word = w
    wordScore.language = language
    wordScore.samples = samples
    wordScore.score = score
    wordScore.occurrence = sum
    wordScore.occurrencePerSample = perSample
    wordScore.isolationPercentage = percentageIsolate
    wordScore.data = wordsCatagorized[w]
    wordScore.stage = stage
    results.add(wordScore)
    i += 1
  # Sort this round's scores descending.
  results.sort((a,b) => cmp(a.score, b.score))
  results.reverse()
  # Final stage: merge everything into `result` and stop recursing.
  if stage == Forth:
    result = result & results
    result.sort((a,b) => cmp(a.score, b.score))
    result.reverse()
    return
  # Weaker half (split[1]) is finished; stronger half (split[0]) advances.
  let split = results.distribute(2)
  result = result & split[1]
  var nextResult = initTable[string, HashSet[string]]()
  # NOTE(review): this local shadows the `manager` parameter — it becomes
  # the accumulator handed to the next stage.
  var manager = newTable[string, WordScore]()

  for word in split[0]:
    if word.language notin nextResult:
      nextResult[word.language] = initHashSet[string]()
    nextResult[word.language].incl(word.word)
    manager[word.word] = word
  trainWords(nextResult, result, manager, Stage(ord(stage)+1))
|
|
|
|
proc generateWords(numberOfIterations : int) =
  ## Repeatedly samples 200 fresh random words per language (excluding any
  ## word already stored in WordScore), scores them through the staged
  ## trainWords pipeline, and serializes the results to the training DB.
  # NOTE(review): `0 .. numberOfIterations` is inclusive, so this performs
  # numberOfIterations + 1 passes — confirm that is intended.
  for _ in 0 .. numberOfIterations:
    let alreadyStored = trainingdb[].iterate("select Word from WordScore ").toSeq().map(x => x[0].strVal)
    var rootLookup = newTable[string, string]()
    let candidates = createRandomWords(200, languages, alreadyStored, rootLookup)
    var scored : seq[WordScore]
    trainWords(candidates, scored)
    # Progress output: total scored entries, then distinct words among them.
    echo scored.len
    echo scored.mapIt(it.word).deduplicate().len
    serializeWordScores(scored, rootLookup)
|
|
|
|
# --- entry point: parse switches, run training, emit best-words JSON ---
var wordCount : int
var genSize : int

let cmdArgs = getParamSwitches()

if "--iterations" notin cmdArgs or "--output_words" notin cmdArgs:
  # BUGFIX: the original printed placeholder gibberish ("uhfkjlsudfh") and
  # exited with status 0 on a usage error; print real usage and fail.
  echo "usage: requires --iterations=<n> and --output_words=<n>"
  quit(1)
else:
  wordCount = parseInt(cmdArgs["--output_words"])
  genSize = parseInt(cmdArgs["--iterations"])

echo genSize
generateWords(genSize)

# Collect the best `wordCount` words per language from the training DB and
# write them out as {"language": [words...]} JSON.
let result = createBestFromWordsBest(trainingDb[], wordCount)
var output = newJObject()
for (language, words) in result.pairs():
  output[language] = %* words.toSeq()
writeFile("../data/mostCommonWords.json", $output)
|