## Language-detector evaluation: per-language accuracy tallies, derived
## scores, and word-utilization statistics computed over the testing DB.
import std/[algorithm, enumerate, osproc, sequtils, sets, stats, strutils,
            sugar, tables, times]
import db_connector/db_sqlite
import ./main
type
  Accuracy* = object
    ## Raw per-language detection tallies gathered from one evaluation
    ## run over the testing database.
    language* : string
    correct* : int  ## samples of `language` detected correctly
    incorrect* : int  ## samples of `language` detected as something else
    faliures* : int  # NOTE(review): "faliures" (sic) — the misspelling is exported API; renaming would break callers
    wordCount* : int  ## size of this language's candidate word list
    languagesConfusedFor* : CountTable[string]  ## wrong guesses, keyed by detected language
    correctWordCounts* : CountTable[string]  ## word -> hit count on correctly detected samples
    incorrectWordCounts* : CountTable[string]  ## word -> hit count on misdetected samples

  Score* = object
    ## Metrics for a single language, derived from `Accuracy` by `score`.
    accuracy* : Accuracy  ## the raw tallies this score was computed from
    faliureRate* : float  ## percent of entries that were incorrect or failed
    language* : string

    totalWordUtilization* : int  ## good + bad hit totals (each side floored at 1 when empty — see `score`)
    utlizationPerWord* : float  # NOTE(review): "utlization" (sic) in several exported names

    totalGoodWordUtilization* : int
    goodUtilizationPerWord* : float
    percentGoodUtilization* : float  ## good hits as a percent of all hits

    totalBadWordUtilization* : int
    badUtilizationPerWord* : float
    percentBadUtilization* : float  ## bad hits as a percent of all hits

    usedWordPercentGood* : float  ## percent of the word list that ever fired correctly
    usedWordPercentBad* : float  ## percent of the word list that ever fired incorrectly

  MacroScore* = object
    ## Cross-language aggregate: each per-language metric folded into a
    ## `RunningStat` (built by `makeMacroScore`).
    wordCount* : int  ## words per language (all lists are expected equal-sized)
    words* : Table[string, HashSet[string]]
    scores* : Table[string, Score]
    faliureRates* : RunningStat
    totalWordUtilizations* : RunningStat
    utlizationPerWords* : RunningStat

    totalGoodWordUtilizations* : RunningStat
    goodUtilizationPerWords* : RunningStat
    percentGoodUtilizations* : RunningStat

    totalBadWordUtilizations* : RunningStat
    badUtilizationPerWords* : RunningStat
    percentBadUtilizations* : RunningStat
    usedWordPercentsBad* : RunningStat
    usedWordPercentsGood* : RunningStat

  MiniScore = object
    ## Lightweight tally used by `createScoreNoWords` (module-private).
    language : string
    correct : int
    incorrect : int
    confusedFor : CountTable[string]

  Stage* = enum
    ## Selects which slice of each language's samples an evaluation uses.
    First, Second, Third, Forth  # NOTE(review): "Forth" (sic, = Fourth) — exported, keep spelling
proc score*(a : Table[string, Accuracy]) : Table[string, Score] =
  ## Derive a `Score` for every language from its raw `Accuracy` tallies.
  ##
  ## Utilization totals are floored at 1 (not 0) when a side has no
  ## recorded word hits, keeping the ratio/percentage math finite.
  for (key, val) in a.pairs:
    let ccount = val.correctWordCounts.values.toSeq()
    let icount = val.incorrectWordCounts.values.toSeq()
    # Total word hits per side; an empty tally contributes 1 so the
    # divisions below never hit zero.
    let goodTotal = if ccount.len == 0: 1 else: ccount.foldl(a + b)
    let badTotal = if icount.len == 0: 1 else: icount.foldl(a + b)
    let total = goodTotal + badTotal
    # A "failure" is either a wrong answer or an outright detector failure.
    let faliures = val.incorrect + val.faliures
    let totalEntries = faliures + val.correct
    result[key] = Score(
      language: key,
      accuracy: val,
      faliureRate: (faliures / totalEntries) * 100,
      totalWordUtilization: total,
      utlizationPerWord: total / val.wordCount,
      totalGoodWordUtilization: goodTotal,
      goodUtilizationPerWord: goodTotal / val.wordCount,
      percentGoodUtilization: (goodTotal / total) * 100,
      totalBadWordUtilization: badTotal,
      badUtilizationPerWord: badTotal / val.wordCount,
      percentBadUtilization: (badTotal / total) * 100,
      # Share of the word list that ever fired, per side.
      usedWordPercentGood: (ccount.len() / val.wordCount) * 100,
      usedWordPercentBad: (icount.len() / val.wordCount) * 100)
proc makeMacroScore*(a : Table[string, Score], words : Table[string, HashSet[string]]) : MacroScore =
  ## Aggregate per-language `Score`s into cross-language running
  ## statistics. A fixed set of languages is excluded from the aggregate
  ## (their individual entries remain in `result.scores`).
  # Languages excluded from the macro statistics. Hoisted to a const so
  # the skip list is named and compile-time.
  const skippedLanguages = ["kn", "he", "ary", "yi", "ka", "gu", "ckb", "ta"]
  result.scores = a
  result.words = words
  # All word sets are expected to be the same size; take the first one.
  # Guarded iteration instead of `toSeq()[0]`, which raised IndexDefect
  # on an empty table.
  for wordSet in words.values:
    result.wordCount = wordSet.len()
    break
  for (key, val) in a.pairs:
    if val.language in skippedLanguages:
      continue
    result.faliureRates.push(val.faliureRate)
    result.totalWordUtilizations.push(val.totalWordUtilization)
    result.utlizationPerWords.push(val.utlizationPerWord)
    result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
    result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
    result.percentGoodUtilizations.push(val.percentGoodUtilization)
    result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
    result.badUtilizationPerWords.push(val.badUtilizationPerWord)
    result.percentBadUtilizations.push(val.percentBadUtilization)
    result.usedWordPercentsBad.push(val.usedWordPercentBad)
    result.usedWordPercentsGood.push(val.usedWordPercentGood)
proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) : MacroScore {.gcsafe.} =
  ## Run the zipfs language detector over every sample in the testing DB
  ## and score how well each language's word list performs.
  ##
  ## `words` maps language -> candidate word set (all sets must be the
  ## same size). `beFast` caps evaluation at 500 samples per language.
  let db = open("../data/testing/testing.db", "", "", "")
  defer: db.close()  # fix: handle was never released before
  db.exec(sql"PRAGMA read_uncommitted = ON;")
  db.exec(sql"PRAGMA synchronous = NORMAL")
  var langToAccuracy = initTable[string, Accuracy]()
  let wordCount = words.values.toSeq()[0].len()
  var languages : seq[string]
  var fastCounter = initCountTable[string]()

  for (lang, wordSet) in words.pairs:
    doAssert(wordSet.len() == wordCount, $wordSet.len() & " does not equal " & $wordCount)
    langToAccuracy[lang] = Accuracy(language: lang, wordCount: wordCount)
    languages.add lang

  for row in db.fastrows(sql"select Lang, Sample, Rowid from TrainingData"):
    let correctLanguage = row[0]
    let sample = row[1]
    if correctLanguage notin languages:
      continue
    # Per-sample tally of which words fired for which candidate language;
    # filled in by the detector via `wordCounter`.
    var wordCounts = newTable[string, CountTable[string]]()
    if beFast:
      if correctLanguage in fastCounter:
        if fastCounter[correctLanguage] == 500:
          continue
      fastCounter.inc(correctLanguage)

    # Renamed from `result`, which shadowed the proc's implicit result.
    let detection = makeResult zipfsLanguageDetector(languages, sample, wordCounter = wordCounts, words)
    let correct = detection[0][0] == correctLanguage
    if correct:
      langToAccuracy[correctLanguage].correct += 1
    else:
      langToAccuracy[correctLanguage].incorrect += 1
      langToAccuracy[correctLanguage].languagesConfusedFor.inc(detection[0][0])

    # Word hits only count as "good" when they belong to the true
    # language AND the sample was detected correctly.
    for (key, val) in wordCounts.pairs:
      if key == correctLanguage and correct:
        langToAccuracy[correctLanguage].correctWordCounts.merge(val)
      else:
        langToAccuracy[key].incorrectWordCounts.merge(val)

  return makeMacroScore(score(langToAccuracy), words)
# Channel used to collect per-thread word/language occurrence tables from
# `doWordStuff` worker threads back into the spawning proc. Opened once at
# module load and never closed (lives for the process lifetime).
var resultChannel : Channel[Table[string, CountTable[string]]]
resultChannel.open()
proc doWordStuff(a : (HashSet[string], seq[(string, string)])) {.gcsafe.} =
  ## Worker-thread body: for every word, count how many of the given
  ## (sample, language) pairs contain it, then ship the per-word tallies
  ## back over `resultChannel`.
  let (wordSet, sampleList) = a
  var tally = initTable[string, CountTable[string]]()
  # Pre-seed every word so absent words still appear (with empty counts).
  for word in wordSet:
    tally[word] = initCountTable[string]()
  # Samples outer, words inner — same final counts as the original
  # word-outer nesting.
  for (text, lang) in sampleList:
    for word in wordSet:
      if word in text:
        tally[word].inc(lang, 1)
  resultChannel.send(tally)
proc createWordLanguageOccurrences*(words : HashSet[string], languages : HashSet[string], stage : Stage, samplesOutput : var int) : Table[string, CountTable[string]] =
  ## For every word, count how many training samples of each language
  ## contain it. The work is split across all CPUs; partial tallies come
  ## back over `resultChannel` and are merged here.
  ##
  ## `stage` selects which slice of each language's samples to use
  ## (First: the first 100, Second: 100..300, Third: 301..500,
  ## Forth: 501 and up). `samplesOutput` receives the number of samples
  ## actually evaluated.
  let db = open("../data/testing/testing.db", "", "", "")
  defer: db.close()  # fix: handle was never released before
  var fastCounter = initCountTable[string]()

  var samples : seq[(string, string)]
  for row in db.fastrows(sql"select Lang, Lower(substr(Sample, 1, 500)) AS smpl, Rowid from TrainingData"):
    let language = row[0]
    let sample = row[1]

    # BUG FIX: was `language notin language` — a string always contains
    # itself, so the filter never fired and every language was included.
    # The `languages` parameter was otherwise unused.
    if language notin languages:
      continue
    let count = fastCounter[language]
    fastCounter.inc(language)

    # Exhaustive case (no `else`): adding a Stage variant becomes a
    # compile error instead of silently falling into the catch-all.
    case stage:
    of First:
      if count >= 100:
        continue
    of Second:
      if count notin 100..300:
        continue
    of Third:
      if count notin 301..500:
        continue
    of Forth:
      if count < 501:
        continue

    samples.add((sample, language))

  samplesOutput = samples.len
  let cpus = countProcessors()
  let forEachThread = samples.distribute(cpus)
  var threads = newSeq[Thread[(HashSet[string], seq[(string, string)])]](cpus)
  for x in 0 ..< cpus:
    createThread(threads[x], doWordStuff, (words, forEachThread[x]))
  # Pre-seed the merged result while the workers run.
  for w in words:
    result[w] = initCountTable[string]()

  joinThreads(threads)
  # One message per worker; merge each worker's per-word tallies.
  # (Debug echoes removed — this is library code.)
  let resultingObj = collect(for x in 0 ..< cpus: resultChannel.recv())
  for w in words:
    for robj in resultingObj:
      result[w].merge(robj[w])
proc createScoreNoWords(languages : seq[string], useWords : bool) : Table[string, MiniScore] =
  ## Score the detector over the testing DB without per-word accounting.
  ##
  ## When `useWords` is true the detector runs with its defaults; when
  ## false an empty word table is passed, disabling word matching.
  ## NOTE(review): semantics of the detector's word arguments are defined
  ## in ./main — confirm the true/false branches match the flag's intent.
  let db = open("../data/testing/testing.db", "", "", "")
  defer: db.close()  # fix: handle was never released before
  db.exec(sql"PRAGMA read_uncommitted = ON;")
  db.exec(sql"PRAGMA synchronous = NORMAL")
  for lang in languages:
    # fix: also record the language name (field was left empty before).
    result[lang] = MiniScore(language: lang)

  for row in db.fastrows(sql"select Lang, Lower(substr(Sample, 1, 500)) AS smpl, Rowid from TrainingData"):
    if row[0] notin languages:
      continue
    let language = row[0]
    let sample = row[1]
    let emptyWords = initTable[string, HashSet[string]]()
    let zipfsResult =
      if useWords:
        makeResult zipfsLanguageDetector(languages, sample)
      else:
        makeResult zipfsLanguageDetector(languages, sample, nil, emptyWords)

    let detectedLanguage = zipfsResult[0][0]
    if detectedLanguage != language:
      # A miss is charged to BOTH sides: the true language and the one it
      # was mistaken for. NOTE(review): this double-counts each error —
      # presumably intentional as a symmetric confusion measure; confirm.
      result[language].incorrect += 1
      result[language].confusedFor.inc(detectedLanguage)
      result[detectedLanguage].incorrect += 1
      result[detectedLanguage].confusedFor.inc(language)
    else:
      result[language].correct += 1
when isMainModule:
  # Smoke run: score the bundled most-common-word lists from ./main and
  # print the aggregated failure-rate statistics.
  echo createWordScore(main.mostCommonWords).faliureRates