# Zipfs-Law-Language-Detector/nim/scoring.nim
# 2024-10-13 15:18:16 -04:00
#
# 270 lines
# 9 KiB
# Nim

import ./main
import db_connector/db_sqlite
import sequtils
import sugar
import tables
import times
import algorithm
import strutils
import stats
import sets
import osproc
import std/enumerate
type
  Accuracy* = object
    ## Raw per-language tallies collected while replaying labelled samples
    ## through the detector (filled in by `createWordScore`).
    language*: string
      ## Language code these tallies belong to.
    correct*: int
      ## Samples of this language that were detected correctly.
    incorrect*: int
      ## Samples of this language that were detected as something else.
    faliures*: int
      ## Extra failure tally; it is read by `score` but nothing in the
      ## visible code increments it.
    wordCount*: int
      ## Size of the word list supplied for this language.
    languagesConfusedFor*: CountTable[string]
      ## Wrongly detected language -> number of times it was chosen.
    correctWordCounts*: CountTable[string]
      ## Word counters merged in when this language was the right answer and
      ## the detection was correct.
    incorrectWordCounts*: CountTable[string]
      ## Word counters merged in for every other case.

  Score* = object
    ## Figures derived from one `Accuracy` by `score`.
    accuracy*: Accuracy
      ## The tallies this score was computed from.
    faliureRate*: float
      ## (incorrect + faliures) / all samples, as a percentage.
    language*: string
      ## Language code (the key of the source table).
    totalWordUtilization*: int
      ## Good total + bad total.
    utlizationPerWord*: float
      ## `totalWordUtilization` divided by the word-list size.
    totalGoodWordUtilization*: int
      ## Sum of `correctWordCounts` (1 when that table is empty).
    goodUtilizationPerWord*: float
      ## `totalGoodWordUtilization` divided by the word-list size.
    percentGoodUtilization*: float
      ## Good total as a percentage of the combined total.
    totalBadWordUtilization*: int
      ## Sum of `incorrectWordCounts` (1 when that table is empty).
    badUtilizationPerWord*: float
      ## `totalBadWordUtilization` divided by the word-list size.
    percentBadUtilization*: float
      ## Bad total as a percentage of the combined total.
    usedWordPercentGood*: float
      ## Distinct keys in `correctWordCounts` / word-list size, in percent.
    usedWordPercentBad*: float
      ## Distinct keys in `incorrectWordCounts` / word-list size, in percent.

  MacroScore* = object
    ## Cross-language aggregate built by `makeMacroScore`: the inputs plus one
    ## `RunningStat` per `Score` figure.
    wordCount*: int
      ## Length of the first word set found in `words`.
    words*: Table[string, HashSet[string]]
      ## Language -> word set the scores were produced with.
    scores*: Table[string, Score]
      ## Language -> individual score.
    # Each statistic below accumulates the same-named `Score` field.
    faliureRates*, totalWordUtilizations*, utlizationPerWords*,
      totalGoodWordUtilizations*, goodUtilizationPerWords*,
      percentGoodUtilizations*, totalBadWordUtilizations*,
      badUtilizationPerWords*, percentBadUtilizations*,
      usedWordPercentsBad*, usedWordPercentsGood*: RunningStat

  MiniScore = object
    ## Lightweight per-language tally used by `createScoreNoWords`.
    language: string
    correct, incorrect: int
    confusedFor: CountTable[string]
      ## Language it was mixed up with -> number of occurrences.

  Stage* = enum
    ## Selects which slice of each language's samples
    ## `createWordLanguageOccurrences` reads.
    First, Second, Third, Forth
proc score*(a : Table[string, Accuracy]) : Table[string, Score] =
  ## Turns the raw per-language tallies in `a` into one `Score` per language.
  ##
  ## For every entry the word counters are summed into a "good" total (from
  ## `correctWordCounts`) and a "bad" total (from `incorrectWordCounts`); the
  ## remaining figures are ratios of those totals, of the number of distinct
  ## counted words, and of the sample tallies. All ratios use float division,
  ## so a zero denominator (e.g. `wordCount == 0` or no samples at all)
  ## produces `inf`/`NaN` instead of raising.
  for (key, val) in a.pairs:
    let
      goodCounts = toSeq(val.correctWordCounts.values)
      badCounts = toSeq(val.incorrectWordCounts.values)
      # An empty counter is scored as 1, not 0: `foldl` rejects empty input
      # and a non-zero total keeps the percentage denominators above zero.
      # NOTE(review): this inflates both totals by one for languages without
      # any counted words - confirm that is intended.
      goodTotal = if goodCounts.len == 0: 1 else: goodCounts.foldl(a + b)
      badTotal = if badCounts.len == 0: 1 else: badCounts.foldl(a + b)
      total = goodTotal + badTotal
      # Wrong detections and failures both count against the language.
      misses = val.incorrect + val.faliures
      samplesSeen = misses + val.correct
    result[key] = Score(
      accuracy: val,
      faliureRate: (misses / samplesSeen) * 100,
      language: key,
      totalWordUtilization: total,
      utlizationPerWord: total / val.wordCount,
      totalGoodWordUtilization: goodTotal,
      goodUtilizationPerWord: goodTotal / val.wordCount,
      percentGoodUtilization: (goodTotal / total) * 100,
      totalBadWordUtilization: badTotal,
      badUtilizationPerWord: badTotal / val.wordCount,
      percentBadUtilization: (badTotal / total) * 100,
      usedWordPercentGood: (goodCounts.len / val.wordCount) * 100,
      usedWordPercentBad: (badCounts.len / val.wordCount) * 100
    )
proc makeMacroScore*(a : Table[string, Score], words : Table[string, HashSet[string]]) : MacroScore =
  ## Bundles the per-language scores `a` and the word lists `words` into a
  ## `MacroScore` and accumulates every numeric `Score` field into the
  ## matching running statistic.
  ##
  ## `wordCount` is the size of the first word set yielded by `words`; it
  ## stays 0 when `words` is empty.
  # Languages left out of the aggregate statistics (their individual scores
  # are still stored in `scores`).
  # NOTE(review): the reason for excluding exactly these codes is not visible
  # here - confirm before editing the list.
  const excludedLanguages = ["kn", "he", "ary", "yi", "ka", "gu", "ckb", "ta"]

  result.scores = a
  result.words = words
  # Only the first set's length is needed, so read it straight from the
  # iterator instead of copying every set into a temporary seq.
  for wordSet in words.values:
    result.wordCount = wordSet.len
    break

  for val in a.values:
    if val.language in excludedLanguages:
      continue
    result.faliureRates.push(val.faliureRate)
    result.totalWordUtilizations.push(val.totalWordUtilization)
    result.utlizationPerWords.push(val.utlizationPerWord)
    result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
    result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
    result.percentGoodUtilizations.push(val.percentGoodUtilization)
    result.percentBadUtilizations.push(val.percentBadUtilization)
    result.usedWordPercentsBad.push(val.usedWordPercentBad)
    result.usedWordPercentsGood.push(val.usedWordPercentGood)
    result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
    result.badUtilizationPerWords.push(val.badUtilizationPerWord)
proc createWordScore*(words : Table[string, HashSet[string]], beFast = false) : MacroScore {.gcsafe.} =
  ## Replays the labelled rows of `TrainingData` through the detector using
  ## the given per-language word lists and returns the aggregated scores.
  ##
  ## * `words`  - language -> word set; every set must have the same size
  ##   (enforced with `doAssert`).
  ## * `beFast` - when true, at most 500 rows per language are evaluated.
  ##
  ## Rows whose language is not a key of `words` are skipped. The detected
  ## language is taken from `[0][0]` of the value returned by `makeResult`.
  let db = open("../data/testing/testing.db", "", "", "")
  # Release the connection on every exit path, including a failed assertion.
  defer: db.close()
  db.exec(sql"PRAGMA read_uncommitted = ON;")
  db.exec(sql"PRAGMA synchronous = NORMAL")

  # Size of the first word set; all other sets are checked against it below.
  var wordCount = 0
  for wordSet in words.values:
    wordCount = wordSet.len
    break

  var langToAccuracy = initTable[string, Accuracy]()
  var languages : seq[string]
  for (lang, langWords) in words.pairs:
    doAssert(langWords.len() == wordCount, $langWords.len() & " does not equal " & $wordCount)
    langToAccuracy[lang] = Accuracy(language: lang, wordCount: wordCount)
    languages.add lang

  var fastCounter = initCountTable[string]()
  for row in db.fastRows(sql"select Lang, Sample, Rowid from TrainingData"):
    let correctLanguage = row[0]
    let sample = row[1]
    if correctLanguage notin languages:
      continue
    if beFast:
      # A CountTable yields 0 for unseen keys, so no membership test is needed.
      if fastCounter[correctLanguage] == 500:
        continue
      fastCounter.inc(correctLanguage)

    # Filled by the detector with the words it counted, keyed by language.
    var wordCounts = newTable[string, CountTable[string]]()
    let detection = makeResult zipfsLanguageDetector(languages, sample, wordCounter = wordCounts, words)
    let detectedLanguage = detection[0][0]
    let correct = detectedLanguage == correctLanguage
    if correct:
      langToAccuracy[correctLanguage].correct += 1
    else:
      langToAccuracy[correctLanguage].incorrect += 1
      langToAccuracy[correctLanguage].languagesConfusedFor.inc(detectedLanguage)

    # Words of the right language count as "good" only when the detection was
    # right as well; everything else is booked as "bad" for the language whose
    # words were counted.
    for (key, val) in wordCounts.pairs:
      if key == correctLanguage and correct:
        langToAccuracy[correctLanguage].correctWordCounts.merge(val)
      else:
        langToAccuracy[key].incorrectWordCounts.merge(val)

  return makeMacroScore(score(langToAccuracy), words)
# Module-level channel carrying word -> (language -> occurrence count) tables:
# `doWordStuff` sends one table per call and
# `createWordLanguageOccurrences` receives them.
var resultChannel : Channel[Table[string, CountTable[string]]]
# Opened once, when the module is loaded.
resultChannel.open()
proc doWordStuff(a : (HashSet[string], seq[(string, string)])) {.gcsafe.} =
  ## Thread entry point: `a` is (words, samples) with each sample being a
  ## (text, language) pair. For every word, counts per language how many
  ## texts contain it as a substring, then sends the complete table on
  ## `resultChannel` exactly once.
  let (vocabulary, labelledSamples) = a
  var occurrences = initTable[string, CountTable[string]]()
  for word in vocabulary:
    # Every word gets an entry, even when no text contains it.
    var perLanguage = initCountTable[string]()
    for (text, lang) in labelledSamples:
      if word in text:
        perLanguage.inc(lang)
    occurrences[word] = perLanguage
  resultChannel.send(occurrences)
proc createWordLanguageOccurrences*(words : HashSet[string], languages : HashSet[string], stage : Stage, samplesOutput : var int) : Table[string, CountTable[string]] =
  ## Counts, for every word in `words`, how many of the selected samples of
  ## each language contain that word.
  ##
  ## * `languages`     - only rows whose language is in this set are used.
  ## * `stage`         - picks a window of each language's rows by their
  ##   zero-based position: `First` < 100, `Second` 100..300,
  ##   `Third` 301..500, `Forth` >= 501.
  ## * `samplesOutput` - receives the number of samples that were selected.
  ##
  ## Samples are the lower-cased first 500 characters of each row (done in
  ## SQL). The counting is spread over one thread per detected processor;
  ## each thread reports through `resultChannel`.
  var samples : seq[(string, string)]
  block loadSamples:
    let db = open("../data/testing/testing.db", "", "", "")
    # The connection is only needed while reading; close it before threading.
    defer: db.close()
    var fastCounter = initCountTable[string]()
    for row in db.fastRows(sql"select Lang, Lower(substr(Sample, 1, 500)) AS smpl, Rowid from TrainingData"):
      let language = row[0]
      let sample = row[1]
      # Compare against the `languages` set; testing the string against
      # itself is always true and would disable the filter.
      if language notin languages:
        continue
      # Position of this row among the rows of its language seen so far.
      let count = fastCounter[language]
      fastCounter.inc(language)
      let inWindow =
        case stage
        of First: count < 100
        of Second: count in 100..300
        of Third: count in 301..500
        of Forth: count >= 501
      if not inWindow:
        continue
      samples.add((sample, language))
  samplesOutput = samples.len

  # `countProcessors` reports 0 when detection fails; always use one worker
  # at least, since `distribute` requires a positive bucket count.
  let cpus = max(1, countProcessors())
  let forEachThread = samples.distribute(cpus)
  var threads = newSeq[Thread[(HashSet[string], seq[(string, string)])]](cpus)
  for x in 0 .. cpus-1:
    createThread(threads[x], doWordStuff, (words, forEachThread[x]))
  for w in words:
    result[w] = initCountTable[string]()
  joinThreads(threads)
  echo "done!"

  # Each worker sent exactly one table, so collect one message per worker.
  let resultingObj = collect(for x in 0 .. cpus-1: resultChannel.recv())
  echo resultingObj.len
  for w in words:
    for robj in resultingObj:
      result[w].merge(robj[w])
  echo "done2!"
proc createScoreNoWords(languages : seq[string], useWords : bool) : Table[string, MiniScore] =
  ## Runs the detector over the lower-cased first 500 characters of every
  ## `TrainingData` row whose language is in `languages` and tallies the
  ## outcome per language.
  ##
  ## * `useWords` - when true the detector is called with its default
  ##   arguments; otherwise it is given `nil` and an empty word table.
  ##
  ## A wrong detection is booked as `incorrect` for both the real language
  ## and the detected one, each recording the other in `confusedFor`.
  let db = open("../data/testing/testing.db", "", "", "")
  # Release the connection on every exit path.
  defer: db.close()
  db.exec(sql"PRAGMA read_uncommitted = ON;")
  db.exec(sql"PRAGMA synchronous = NORMAL")

  for lang in languages:
    result[lang] = MiniScore()

  # Loop-invariant: a single empty table serves every detector call.
  let emptyWords = initTable[string, HashSet[string]]()
  for row in db.fastRows(sql"select Lang, Lower(substr(Sample, 1, 500)) AS smpl, Rowid from TrainingData"):
    if row[0] notin languages:
      continue
    let language = row[0]
    let sample = row[1]
    let zipfsResult =
      if useWords:
        makeResult zipfsLanguageDetector(languages, sample)
      else:
        makeResult zipfsLanguageDetector(languages, sample, nil, emptyWords)
    let detectedLanguage = zipfsResult[0][0]
    if detectedLanguage != language:
      result[language].incorrect += 1
      result[language].confusedFor.inc(detectedLanguage)
      result[detectedLanguage].incorrect += 1
      result[detectedLanguage].confusedFor.inc(language)
    else:
      result[language].correct += 1
when isMainModule:
  # Run as a program: score the default word lists exported by `main` and
  # print the failure-rate statistics.
  let macroScore = createWordScore(main.mostCommonWords)
  echo macroScore.faliureRates