Zipfs-Law-Language-Detector/nim/geneticTools.nim

import tiny_sqlite
import sequtils
import sugar
import tables
import times
import algorithm
import strutils
import stats
import sets
import std/enumerate
import nimSHA2
import ./main
import ./scoring
import strformat
import math

##  Much of this file is now deprecated because a different algorithm has been introduced.
##  However, should the original algorithm be reintroduced, this code will be useful again,
##  so it remains in the codebase.

type
  GeneticBase* = object
    ## Pairs a table of scores with a table of word sets.
    ## `createWordAccuracyTable` indexes both tables by the same language key.
    scoreGene* : Table[string, Score]  ## language -> `Score`
    wordsGene* : Table[string, HashSet[string]]  ## language -> set of words
  Generation* = object
    ## Start/end markers of one generation, as stored by `insertGeneration`.
    startTime* : int64  ## written to `Generation.TimeStarted`; unit not visible here -- TODO confirm
    endTime* : int64  ## written to `Generation.TimeEnded`; unit not visible here -- TODO confirm

proc wordsAndScoreToGene*(a : Table[string, HashSet[string]], b : Table[string, Score]) : GeneticBase =
  ## Bundles the word table `a` and the score table `b` into one `GeneticBase`.
  GeneticBase(wordsGene: a, scoreGene: b)

proc tallyScores*(a : MacroScore) : float =
  ## Collapses a `MacroScore` into one number: 80% of a failure component
  ## plus 20% of a word-utilization component.

  # Failure component: mean failure rate plus a third of the worst failure rate.
  let failurePart = a.faliureRates.mean + (a.faliureRates.max / 3)

  # Word component: starts from the "bad utilization" figures ...
  var wordPart = a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean + a.percentBadUtilizations.max
  # ... scaled by 1 / (mean / 100) of the good-utilization percentage ...
  wordPart = wordPart * (1 / (a.percentGoodUtilizations.mean / 100))
  # ... and by (1 / mean) / 100 of the good used-word percentage.
  # NOTE(review): this factor is parenthesised differently from the one above
  # ((1/x)/100 rather than 1/(x/100)); kept as-is -- confirm which is intended.
  wordPart = wordPart * (1 / (a.usedWordPercentsGood.mean) / 100)

  (failurePart * 0.80) + (wordPart * 0.20)


proc serializeAccuracy(db : DbConn, a : Accuracy, macroScoreId : int64, language : string) : int64  =
  ## Inserts one `Accuracy` row for `a`, then one `CountTable` row for every
  ## entry of its three count tables. Returns the rowid of the `Accuracy` row.
  ##
  ## The `Accuracy` row stores `a.language`; the `CountTable` rows store the
  ## `language` argument.
  const countInsert = """INSERT INTO CountTable (MacroScoreId, AccuracyId, Language, StrKey, IntVal, Type)
  VALUES (?, ?, ?, ?, ?, ?);"""

  db.exec("insert into Accuracy(MacroScoreId, Language, Correct, Incorrect, Faliures) VALUES (?, ?, ?, ?, ?)",
          toDbValues(macroScoreId, a.language, a.correct, a.incorrect, a.faliures))
  result = db.lastInsertRowId()
  let accuracyId = result

  # Each count table is written with a tag in the `Type` column so the rows
  # can be told apart when they are read back.
  for (counts, kind) in [(a.languagesConfusedFor, "LanguagesConfusedFor"),
                         (a.correctWordCounts, "correctWords"),
                         (a.incorrectWordCounts, "incorrectWords")]:
    let rows = collect:
      for (key, count) in counts.pairs:
        toDbValues(macroScoreId, accuracyId, language, key, count, kind)
    db.execMany(countInsert, rows)

proc serializeScore(db : DbConn, a : Score, macroScoreId : int64, accuracyId : int64) =
  ## Writes the scalar fields of `a` as one row of the `Score` table, linked to
  ## the given `MacroScore` row and `Accuracy` row.
  const insertSql = """
  INSERT INTO Score (
      MacroScoreId,
      AccuracyRowId,
      Language,
      FaliureRate,
      TotalWordUtilization,
      UtlizationPerWord,
      TotalGoodWordUtilization,
      GoodUtilizationPerWord,
      PercentGoodUtilization,
      TotalBadWordUtilization,
      BadUtilizationPerWord,
      PercentBadUtilization,
      UsedWordPercentGood,
      UsedWordPercentBad
  ) VALUES (
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?,
      ?
  );"""

  # Values are listed in the same order as the column list above.
  db.exec(insertSql, toDbValues(
    macroScoreId, accuracyId, a.language, a.faliureRate,
    a.totalWordUtilization, a.utlizationPerWord,
    a.totalGoodWordUtilization, a.goodUtilizationPerWord, a.percentGoodUtilization,
    a.totalBadWordUtilization, a.badUtilizationPerWord, a.percentBadUtilization,
    a.usedWordPercentGood, a.usedWordPercentBad))

proc serializeRunningStat(db : DbConn, a : RunningStat, macroScoreId : int64) : int64 =
  ## Stores the max, min, sum, mean and standard deviation of `a` as one
  ## `RunningStat` row tied to `macroScoreId`; returns the new rowid.
  const insertSql = """INSERT INTO RunningStat (MacroScoreId, Max, Min, Sum, Mean, StdDeviation) VALUES (?, ?, ?, ?, ?, ?)"""
  db.exec(insertSql, toDbValues(macroScoreId, a.max, a.min, a.sum, a.mean, a.standardDeviation))
  result = db.lastInsertRowId()

proc makeMacroScoreDb(db : DbConn, generationRowId : int64) : int64 =
  ## Creates a `MacroScore` row with `IsPopulated = 1` for the given
  ## generation and returns its rowid. `seralizeMacroScores` later fills the
  ## row in and sets `IsPopulated` to 0.
  db.exec("insert into MacroScore(IsPopulated, Generation) VALUES (1, ?)", generationRowId)
  result = db.lastInsertRowId()

proc seralizeMacroScores(db : DbConn, macroScoreId : int, a : MacroScore, foreginKeys : array[0 .. 10, int64]) =
  ## Fills in an existing `MacroScore` row: the tallied score, the word count
  ## and eleven `RunningStat` rowids, then sets `IsPopulated` to 0.
  ##
  ## `foreginKeys` must be ordered like the column list in the UPDATE below
  ## (FaliureRates first, UsedWordPercentsGood last).
  const updateSql = """UPDATE MacroScore SET
      Score = ?,
      WordCount = ?,

      FaliureRates = ?,
      TotalWordUtilizations = ?,
      UtlizationPerWords = ?,

      TotalGoodWordUtilizations = ?,
      GoodUtilizationPerWords = ?,
      PercentGoodUtilizations = ?,

      TotalBadWordUtilizations = ?,
      BadUtilizationPerWords = ?,
      PercentBadUtilizations = ?,

      UsedWordPercentsBad = ?,
      UsedWordPercentsGood = ?,

      IsPopulated = 0 WHERE rowid = ?"""

  # Parameter order: score, word count, the eleven keys, then the row to update.
  var params = toDbValues(tallyScores(a), a.wordCount)
  for key in foreginKeys:
    params.add toDbValues(key)
  params.add toDbValues(macroScoreId)

  db.exec(updateSql, params)

proc wordsToString(a : Table[string, HashSet[string]]) : string  =
  ## Produces an iteration-order-independent string for `a`: the table is
  ## stringified with `$` and the characters of that text are sorted.
  ##
  ## Because individual characters are sorted, any two tables whose string
  ## forms contain the same characters (e.g. a word and its anagram) yield
  ## the same output.
  var chars = toSeq($a)
  chars.sort()
  result = newStringOfCap(chars.len)
  for c in chars:
    result.add c


proc vocabAlreadyExists*(trainingDb : DbConn, a : Table[string, HashSet[string]] ) : bool =
  ## True when the SHA-256 digest of `wordsToString(a)` is already present in
  ## the `WordSums` table.
  let fingerprint = wordsToString(a)
  let digest = toHex($computeSHA256(fingerprint))
  let existing = trainingDb.one("select * from WordSums where Sha256 = ?", $digest)
  result = existing.isSome()

proc serializeMacroScore*(trainingDb : DbConn, a : MacroScore, generationRowId : int) =
  ## Persists `a` and everything hanging off it, unless a vocabulary with the
  ## same digest has already been stored (in which case nothing is written).
  ##
  ## Steps, in order:
  ## 1. record the vocabulary digest in `WordSums`;
  ## 2. switch off foreign-key enforcement on this connection;
  ## 3. create the `MacroScore` row and write its words to `Words`;
  ## 4. write the eleven running statistics and fill in the `MacroScore` row;
  ## 5. write one `Accuracy` row (plus its count tables) and one `Score` row
  ##    per language in `a.scores`.
  if vocabAlreadyExists(trainingDb, a.words):
    return

  let sha256 = toHex($computeSHA256(wordsToString(a.words)))
  let vocabStmt = "INSERT INTO WordSums(Sha256) VALUES (?)"
  trainingDb.exec(vocabStmt, $sha256)

  # trainingDb.exec("PRAGMA synchronous = EXTRA")
  # trainingDb.exec("PRAGMA journal_mode = WAL")
  trainingDb.exec("PRAGMA foreign_keys = OFF")

  let macroDbId = makeMacroScoreDb(trainingDb, generationRowId)
  let stmt = "INSERT INTO Words(StrKey, Language, MacroScoreId) VALUES (?, ?, ?)"

  # One batch of word rows per language.
  for (language, wordSet) in a.words.pairs:
    let rows = collect(for word in wordSet: toDbValues(word, language, macroDbId))
    trainingDb.execMany(stmt, rows)

  # Order matters: seralizeMacroScores maps these positions onto columns.
  let statsArray = @[a.faliureRates, a.totalWordUtilizations, a.utlizationPerWords,
                    a.totalGoodWordUtilizations, a.goodUtilizationPerWords, a.percentGoodUtilizations,
                    a.totalBadWordUtilizations, a.badUtilizationPerWords, a.percentBadUtilizations,
                    a.usedWordPercentsBad, a.usedWordPercentsGood]
  var stats : array[0..10, int64]

  for (i, stat) in enumerate statsArray:
    stats[i] = serializeRunningStat(trainingDb, stat, macroDbId)

  # The tallied score is computed inside seralizeMacroScores; no local copy is needed here.
  seralizeMacroScores(trainingDb, macroDbId, a, stats)

  for (language, langScore) in a.scores.pairs:
    # serializeAccuracy returns the Accuracy rowid that the Score row links to.
    let accuracyRowId = serializeAccuracy(trainingDb, langScore.accuracy, macroDbId, language)
    serializeScore(trainingDb, langScore, macroDbId, accuracyRowId)

proc createWordAccuracyTable*(a : GeneticBase) : Table[string, Table[string, float]] =
  ## For every language in `a.scoreGene` that has at least one counted word,
  ## maps each word of `a.wordsGene[language]` to a float computed from how
  ## often the word appears in the correct and incorrect word counts.
  ## Languages with no counted words at all are left out of the result.
  proc cubic(x : float) : float =
    # Shifted, scaled cubic: scale * (x - shift)^3 + offset.
    const
      shift = -0.3
      scale = 2.5
      offset = 0.0
    scale*(x-shift)^3+offset

  const weight = 1.0

  for (language, score) in a.scoreGene.pairs:
    let wrong = score.accuracy.incorrectWordCounts
    let right = score.accuracy.correctWordCounts

    # Per-word total = correct count + incorrect count.
    var combined = right
    combined.merge(wrong)

    let counts = toSeq(combined.values)
    if counts.len == 0: continue
    let totalFound = counts.foldl(a+b)

    var perWord = initTable[string, float]()
    for word in a.wordsGene[language]:
      let value =
        if word notin right:
          # Never counted as correct: fixed large value.
          500000.0
        elif word notin wrong:
          # Never counted as incorrect: only the cubic factor applies.
          weight * cubic(1-((right[word] / totalFound)+0.1))
        else:
          # Percentage of this word's occurrences that were incorrect.
          let wrongPercent = (wrong[word] / combined[word])*100
          # NOTE(review): the argument here is (1 - share) + 0.1, whereas the
          # branch above uses 1 - (share + 0.1); kept as-is -- confirm intent.
          wrongPercent * cubic((1-(right[word] / totalFound)+0.1))
      perWord[word] = value

    result[language] = perWord
proc insertGeneration*(db : DbConn, a : Generation) : int64 =
  ## Stores the start and end time of `a` as a new `Generation` row and
  ## returns that row's id.
  const insertSql = """INSERT INTO Generation(TimeStarted, TimeEnded) VALUES(?, ?);"""
  db.exec(insertSql, a.startTime, a.endTime)
  result = db.lastInsertRowId()

proc setupAccuracies(countTables : seq[(string, string, string, int64)], accuracies : seq[(string, int64, int64, int64)]) : (Table[string, Accuracy], Table[string, HashSet[string]]) =
  ## Rebuilds one `Accuracy` per language from flat database rows.
  ##
  ## `countTables` rows are read as (language, type, key, count);
  ## `accuracies` rows are read as (language, correct, incorrect, failures).
  ## Returns (language -> Accuracy, language -> set of every key seen for
  ## that language in `countTables`).
  let languages = toHashSet accuracies.map(x=>x[0])

  for lang in languages:
    var
      wrongWords = initCountTable[string]()
      confusedWith = initCountTable[string]()
      rightWords = initCountTable[string]()
      seenKeys : HashSet[string]

    for (rowLang, kind, key, count) in countTables:
      if rowLang != lang: continue
      # NOTE(review): keys of every row type are added, including
      # "LanguagesConfusedFor" rows -- confirm that is intended.
      seenKeys.incl(key)
      case kind
      of "LanguagesConfusedFor": confusedWith.inc(key, count)
      of "incorrectWords": wrongWords.inc(key, count)
      of "correctWords": rightWords.inc(key, count)
      else: discard

    # First accuracy row belonging to this language.
    var score : (string, int64, int64, int64)
    for row in accuracies:
      if row[0] == lang:
        score = row
        break
    doAssert(score[0] != "")

    var entry = Accuracy()
    entry.language = lang
    entry.correct = score[1]
    entry.incorrect = score[2]
    entry.faliures = score[3]
    # NOTE(review): `wordCount` is not declared in this proc; it must resolve
    # to a symbol from another module -- verify.
    entry.wordCount = wordCount
    entry.languagesConfusedFor = confusedWith
    entry.correctWordCounts = rightWords
    entry.incorrectWordCounts = wrongWords

    result[0][lang] = entry
    result[1][lang] = seenKeys

proc deserializeMacroScore*(db : DbConn, MacroScoreId : int, wordCount : int) : MacroScore =
  ## Rebuilds the `MacroScore` stored under `MacroScoreId`.
  ##
  ## Reads that id's `CountTable` and `Accuracy` rows, turns them into
  ## per-language accuracies and key sets with `setupAccuracies`, and passes
  ## the scored accuracies and key sets to `makeMacroScore`.
  ##
  ## The `Words` table is not read: the word sets come from the `CountTable`
  ## keys. `wordCount` is accepted for interface compatibility but is not used
  ## by this proc.
  let countRows = db.iterate("select Language, Type, StrKey, IntVal from CountTable where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].strval, x[2].strval, x[3].intval))

  # Column order (Language, Correct, Incorrect, Faliures) matches the tuple
  # positions setupAccuracies reads.
  let accuracyRows = db.iterate("select Language, Correct, Incorrect, Faliures from Accuracy where MacroScoreId = ?;", MacroScoreId).toSeq().map(x=> (x[0].strval, x[1].intval, x[2].intval, x[3].intval))

  let (accuracy, dictonary) = setupAccuracies(countRows, accuracyRows)

  result = makeMacroScore(score(accuracy), dictonary)

proc deserializeGeneration*(db : DbConn, generationNumber : int64) : seq[MacroScore] =
  ## Loads every `MacroScore` of the given generation whose `IsPopulated`
  ## column is 0 and deserializes each one.
  let rows = db.iterate("select Rowid, WordCount from MacroScore where generation = ? and IsPopulated = 0;", generationNumber).toSeq()
  # Extract (rowid, word count) for all rows before running further queries.
  let idsAndCounts = collect(for row in rows: (row[0].intval, row[1].intval))
  for (macroScoreId, storedWordCount) in idsAndCounts:
    result.add(deserializeMacroScore(db, macroScoreId, storedWordCount))

proc createBestModel*(db : DbConn) : MacroScore =
  ## Assembles a `MacroScore` from the best stored result of each language.
  ##
  ## For every language in `Score`, the MacroScore id with the lowest
  ## `faliurerate` is selected into the temporary table `tmptable`. That
  ## language's `Accuracy` and `CountTable` rows are then loaded from the
  ## selected MacroScore and combined via `setupAccuracies` and
  ## `makeMacroScore`.
  ##
  ## Prints the selected (language, MacroScore id) pairs and a progress
  ## marker to stdout.
  db.execScript("""
  DROP TABLE IF EXISTS tmptable;
  CREATE temporary TABLE tmptable AS
    SELECT DISTINCT( s.language ),
                  (SELECT macroscoreid
                    FROM   score AS s1
                    WHERE  s1.language = s.language
                    ORDER  BY faliurerate ASC
                    LIMIT  1) AS macroscoreidbest
    FROM score AS s;
  """)
  echo db.iterate("""select language, macroscoreidbest from tmptable""").toSeq().map(x=> (x[0].strval, x[1].intval))

  # Result columns: 0 language, 1 macroscoreidbest, 2 correct, 3 Faliures,
  # 4 incorrect. setupAccuracies reads its tuples as
  # (language, correct, incorrect, failures), so column 4 must come before
  # column 3; the previous mapping passed them swapped.
  let languageToPerformance = db.iterate("""
  SELECT tmptable.language,
          macroscoreidbest,
          correct,
          Faliures,
          incorrect FROM tmptable JOIN accuracy
            ON accuracy.macroscoreid = macroscoreidbest
              AND accuracy.language = tmptable.language;
  """).toSeq().map(x=> (x[0].strval, x[2].intval, x[4].intval, x[3].intval))
  echo "step2!"

  # Result columns: 0 language, 1 macroscoreidbest, 2 type, 3 strkey, 4 intval;
  # mapped to (language, type, key, count) as setupAccuracies expects.
  let languageToCounttable = db.iterate(
  """   SELECT tmptable.language,
          macroscoreidbest,
          type,
          strkey,
          intval
  FROM   tmptable
          JOIN counttable
            ON counttable.macroscoreid = macroscoreidbest
              AND counttable.language = tmptable.language;
   """).toSeq().map(x=> (x[0].strval, x[2].strval, x[3].strval, x[4].intval))
  let (accuracy, dictionary) = setupAccuracies(languageToCounttable, languageToPerformance)
  return makeMacroScore(score(accuracy), dictionary)

proc createBestFromWordsBest*(db : DbConn, wordCount : int) : Table[string, HashSet[string]] =
  ## Returns, per language, the `wordCount` highest-scored words from the
  ## `WordScore` table. Languages that are not in `main.mostCommonWords` are
  ## skipped.
  let rankedRows = db.iterate("""
  WITH RankedWords AS (
      SELECT
          language,
          word,
          ROW_NUMBER() OVER (PARTITION BY language ORDER BY score DESC) AS rn
      FROM
          WordScore
  )
  SELECT
      language,
      word
  FROM
      RankedWords
  WHERE
      rn <= ?;
  """, wordCount).toSeq()

  for row in rankedRows.map(x=>(x[0].strval, x[1].strval)):
    let (language, word) = row
    if language in main.mostCommonWords:
      # Creates the language's set on first use, then adds the word to it.
      result.mgetOrPut(language, initHashSet[string]()).incl(word)