Zipfs-Law-Language-Detector/nim/main.nim

import std/jsonutils
import std/enumerate
import db_connector/db_sqlite
import tables
import unicode
import sugar
import sequtils
import json
import unicode
import strutils
import streams
import os
import sets
import algorithm
import math

# NOTE(review): not referenced anywhere in this file; presumably the number of
# most-common words tracked per language -- confirm against the code that
# generates ../data/mostCommonWords.json.
const wordCount* = 30

proc generateMostCommonWords(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
  ## Turns each language's word list into a hash set, keeping the same keys.
  ## Duplicate words inside a list collapse into a single set entry.
  for language, wordList in a:
    result[language] = wordList.toHashSet()
# The data files below are read with staticRead, i.e. while compiling, so
# their contents are embedded in the program.

# Raw JSON text; parsed further down to build `statistics`.
const resultText = staticRead("../data/mostCommonCharacters.json")
# Raw JSON text; parsed further down to build `allValidChars`.
const charactersJson = staticRead("../data/alphabets.json")
# NOTE(review): not referenced in the visible code; presumably maps wiki
# language codes to English names -- confirm before relying on it.
const wikiToEnglish = staticRead("../data/wikiToEng.json")
# Language code -> set of common words, built at compile time from the JSON
# object of string lists in mostCommonWords.json.
const mostCommonWords* = generateMostCommonWords((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
# Runes that isUsedChar always rejects; currently only the comma.
const forbiddenChars = @[","].join("").toRunes().toSeq()

proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
  ## Decides whether rune `a` takes part in the frequency analysis.
  ##
  ## Returns `(true, r)` when the rune is kept, where `r` is the rune to count
  ## (uppercase runes are folded to lowercase), and `(false, Rune(0))` when it
  ## is rejected.
  ##
  ## Rules, in order:
  ## * runes listed in `forbiddenChars` are rejected;
  ## * runes in 0xAC00..0xD7AF (Hangul syllables) or 0x4E00..0x9FFF (CJK
  ##   unified ideographs) are kept as-is without consulting `allValidChars`;
  ## * runes missing from `allValidChars` are rejected;
  ## * uppercase runes are kept in their lowercase form;
  ## * lowercase runes are rejected when their uppercase form is forbidden.
  let rejected = (false, Rune(0))
  if a in forbiddenChars:
    return rejected

  let codePoint = int(a)
  if codePoint in 0xAC00..0xD7AF or codePoint in 0x4E00..0x9FFF:
    return (true, a)

  if a notin allValidChars:
    return rejected

  # `a` is already known not to be forbidden here, so an uppercase rune only
  # needs case folding (the former second forbiddenChars test was unreachable).
  if a.isUpper():
    return (true, a.toLower())
  if a.isLower() and a.toUpper() in forbiddenChars:
    return rejected

  return (true, a)

proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, float]]) : HashSet[Rune] =
  ## Collects every rune that counts as a valid character.
  ##
  ## `a` is a JSON object whose values are arrays of strings; every rune of
  ## every string is included. In addition, all rune keys of `b["zh"]` and
  ## `b["zh-yue"]` are included. Raises KeyError if `b` lacks either entry.
  for lang in ["zh", "zh-yue"]:
    for rune in b[lang].keys:
      result.incl rune

  for _, letters in a.pairs:
    var parsed : seq[string]
    fromJson(parsed, letters)
    let alphabet = parsed.join("")
    for rune in alphabet.runes:
      result.incl rune

proc jsonToRune(a : JsonNode) : Table[Rune, int] =
  ## Converts a JSON object of `string -> int` into a table keyed by the first
  ## rune of each key. Keys sharing a first rune overwrite one another, and an
  ## empty key fails on the `[0]` access.
  var counts : Table[string, int]
  counts.fromJson(a)
  for text, count in counts:
    result[text.toRunes()[0]] = count

proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, float]] =
  ## Builds, for every language key of the JSON object `a`, a table mapping
  ## runes to their share of that language's total count, in percent.
  ##
  ## Each language value is a JSON object of `string -> int` counts. Only the
  ## first rune of each key is used; uppercase runes are folded to lowercase
  ## and their shares summed. Entries below 0.1 percent are dropped before
  ## folding.
  for language, rawCounts in a.pairs:
    # Gather the counts first: the total must be known before any share can
    # be computed.
    var counts : seq[(string, int)]
    var total = 0
    for text, node in rawCounts.pairs:
      let occurrences = node.getInt()
      counts.add((text, occurrences))
      total += occurrences

    var shares = initTable[Rune, float]()
    for (text, occurrences) in counts:
      let share = (occurrences / total) * 100
      if share < 0.1:
        continue
      let first = text.toRunes()[0]
      let folded =
        if first.isUpper(): first.toLower()
        else: first
      shares[folded] = shares.getOrDefault(folded, 0.0) + share

    result[language] = shares


# Language code -> (rune -> percentage share), computed at compile time from
# the embedded mostCommonCharacters.json text.
const statistics* = allJsonToRuneAbsolute parseJson(resultText)
# Runes accepted by isUsedChar's membership test: everything listed in
# alphabets.json plus the rune keys of the "zh" and "zh-yue" statistics.
const allValidChars = createValidChars(parseJson(charactersJson), statistics)
# Language codes that have character statistics and also an entry in
# mostCommonWords.
const languages* = statistics.keys.toSeq().filter(x=> x in mostCommonWords)

proc createStringSlope(a : string, runeHolder : var seq[Rune]) : (Table[Rune, float], int) {.gcsafe.} =
  ## Computes the character-frequency profile of `a`.
  ##
  ## Every rune of `a` is passed through `isUsedChar`; accepted runes are
  ## counted and written, in order, to `runeHolder[0 ..< n]`.
  ##
  ## Returns `(profile, n)` where `n` is the number of accepted runes and
  ## `profile` maps each accepted rune to its share of `n` in percent; shares
  ## below 0.1 percent are omitted.
  ##
  ## `runeHolder` is never shrunk, so slots from index `n` onwards keep
  ## whatever the caller put there. If it is shorter than `n` it is grown
  ## instead of being indexed out of bounds.
  var counts = initCountTable[Rune]()
  var kept = 0

  for rune in a.runes:
    let (used, normalized) = isUsedChar(rune, allValidChars)
    if not used:
      continue

    counts.inc(normalized)

    if kept < runeHolder.len:
      runeHolder[kept] = normalized
    else:
      runeHolder.add normalized
    kept += 1

  result[1] = kept

  var total = 0
  for count in counts.values:
    total += count

  # When nothing was accepted `counts` is empty, so no division by zero occurs.
  for rune, count in counts:
    let percentage = (count / total) * 100
    if percentage < 0.1:
      continue
    result[0][rune] = percentage

proc cubic(x : float) : float =
  ## Evaluates the fixed cubic `0.3 * (x + 0.33)^3 + 0.56`.
  const
    shift = -0.33
    scale = 0.3
    offset = 0.56
  result = scale * (x - shift)^3 + offset

proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
  ## Sums `cubic(|a[r] - b[r]|)` over every rune `r` that is a key of `a`.
  ##
  ## A rune missing from `b` is treated as having the value -1 there. Keys
  ## that exist only in `b` are ignored, so the measure is not symmetric.
  ## Returns 0.0 when `a` is empty (folding an empty buffer used to fail).
  for rune, expected in a:
    let observed = b.getOrDefault(rune, -1.0)
    result += cubic(abs(expected - observed))

proc zipfsLanguageDetector*(comparisonLangs : seq[string], sample : string,
              wordCounter : TableRef[string, CountTable[string]] = nil,
              words : Table[string, HashSet[string]] = mostCommonWords) : Table[string, float] {.gcsafe.} =
  ## Scores `sample` against every language in `comparisonLangs`.
  ##
  ## The score starts as the `neighborDistance` between the language's
  ## character statistics and the sample's character profile. It is then
  ## multiplied by 1.05 for each distinct sample rune the language does not
  ## use and by 0.7 for each of the language's common words found in the
  ## subsample (at most the first 101 accepted runes), so matches lower the
  ## score and mismatches raise it.
  ##
  ## * `wordCounter` - when not nil, receives a per-language count of the
  ##   common words that were found.
  ## * `words` - language -> common words; languages without an entry skip
  ##   the word step.
  ##
  ## Returns language -> score, or `{"unknown": -1}` when the sample has no
  ## usable characters or nothing was scored. Raises KeyError when a language
  ## in `comparisonLangs` has no entry in `statistics`.
  let deNoised = comparisonLangs.map(x => statistics[x])
  var runeStr = newSeq[Rune](sample.len())
  let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
  if runeLength == 0:
    result["unknown"] = -1
    return

  # runeStr was sized by the byte length of `sample`, but only the first
  # runeLength slots were filled. Drop the zero-rune padding so it is not
  # mistaken for sample characters in the penalties and the Hangul loop below.
  runeStr.setLen(runeLength)

  let subsampleRunes =
    if runeLength > 100:
      runeStr[0 .. 100]
    else:
      runeStr

  let subsample = subsampleRunes.join("")

  let characters = runeStr.deduplicate()
  # A script-based pre-filter (narrowing the candidates to Chinese or Japanese
  # from the share of Han and kana runes) used to be sketched here; it is
  # disabled, so every requested language remains a candidate.
  let potentialLanguages = toHashSet comparisonLangs

  for (language, slope) in zip(comparisonLangs, deNoised):
    if language notin potentialLanguages:
      continue
    # neighborDistance only walks the keys of its first argument. For most
    # languages we walk the language's characters and look each one up in the
    # sample. For logographic languages we walk the sample instead, because
    # their statistics hold many characters an average sample will not have.
    if language notin ["zh", "zh-yue", "ja"]:
      result[language] = neighborDistance(slope, stringSlope)
    else:
      result[language] = neighborDistance(stringSlope, slope)

    # Penalise every distinct sample rune the language does not use.
    for c in characters:
      if c notin slope:
        result[language] *= 1.05

    if language notin words:
      continue

    if wordCounter != nil and language notin wordCounter:
      wordCounter[language] = initCountTable[string]()

    # TODO: substring search per common word is crude; replace with proper
    # tokenisation.
    for word in words[language]:
      if subsample.contains(word):
        result[language] *= 0.7
        if wordCounter != nil:
          wordCounter[language].inc(word)

  if "ko" in potentialLanguages:
    if "ko" notin result:
      result["ko"] = 100
    # Korean has far too many syllable combinations for the frequency table,
    # so reward runes inside the Hangul syllables block and penalise the rest.
    for rune in subsampleRunes:
      if int(rune) notin 0xAC00..0xD7AF:
        result["ko"] *= 1.05
      else:
        result["ko"] *= 0.85

  if result.len == 0:
    result["unknown"] = -1

proc createCStringArray(a : openArray[string]  | seq[string]) : (ptr UncheckedArray[cstring], uint16) =
    ## Builds a C-style array of `cstring` pointers for `a`.
    ##
    ## Returns `(array, highIndex)` where `highIndex` is `a.len - 1`, i.e. the
    ## last valid index, not the element count. For an empty input a single
    ## zeroed slot is allocated and 0 is returned.
    ##
    ## The array comes from `create` and is never freed here. The entries
    ## point into the strings of `a` rather than owning copies, so they are
    ## only valid while those strings are alive and unmodified.
    let length = a.len()
    if length == 0:
        let emptyArray = cast[ptr UncheckedArray[cstring]](create(cstring, 1))
        return (emptyArray, 0'u16)

    # One slot per string. The slot count used to be the summed string
    # lengths, which under-allocates as soon as empty strings are involved.
    let newArray = cast[ptr UncheckedArray[cstring]](create(cstring, length))
    for i in 0 .. a.high:
        newArray[i] = cstring a[i]
    return (newArray, uint16(length - 1))


proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
  ## Flattens the score table into `(language, score)` pairs ordered by
  ## ascending score.
  for language, score in oldResult:
    result.add((language, score))
  result.sort(proc (lhs, rhs : (string, float)) : int = cmp(lhs[1], rhs[1]))

proc zipfs_language_detector*(languages : ptr UncheckedArray[cstring],
                              languages_count : uint64,
                              sample : cstring,
                              successful : ptr bool,
                              length_output : ptr uint64,
                              result_buffer_float : ptr ptr UncheckedArray[float32],
                              result : ptr ptr UncheckedArray[cstring]
                              ) {.exportc, dynlib.} =
  ## C entry point around `zipfsLanguageDetector`.
  ##
  ## * `languages` / `languages_count` - indices `0 .. languages_count` are
  ##   read inclusively, so `languages_count` acts as the highest index
  ##   rather than an element count (the commented-out example below passes
  ##   the high index returned by `createCStringArray`).
  ## * `sample` - NUL-terminated text to classify.
  ## * `length_output` - receives the highest valid index of both result
  ##   arrays.
  ## * `successful` - set to whether that index equals `languages_count`.
  ## * `result` - receives an array of NUL-terminated language codes ordered
  ##   by ascending score.
  ## * `result_buffer_float` - receives the matching scores as float32.
  ##
  ## Both arrays and every language string are allocated with `create` and
  ## are not freed here. Exceptions raised by `zipfsLanguageDetector` (for
  ## example KeyError for an unknown language) are not caught.
  let languages = languages.toOpenArray(0, int(languages_count)).toSeq().map(x=> $x)
  let sample = $sample
  let output = zipfsLanguageDetector(languages, sample)
  let resultPairs = makeResult output
  length_output[] = cuint(resultPairs.high)
  successful[] = length_output[] == languages_count

  # `create` takes an element count, not a byte size: one slot per result.
  let slots = max(resultPairs.len, 1)
  let names = cast[ptr UncheckedArray[cstring]](create(cstring, slots))
  let scores = cast[ptr UncheckedArray[float32]](create(float32, slots))

  for i, pair in resultPairs:
    let language = pair[0]
    # `create` zero-fills, so the extra byte serves as the NUL terminator.
    let buffer = cast[cstring](create(char, language.len + 1))
    if language.len > 0:
      copyMem(buffer, addr language[0], language.len)
    names[i] = buffer
    scores[i] = float32(pair[1])

  result[] = names
  result_buffer_float[] = scores

#[
let testLanguages  = createCStringArray(@["sv", "en"])
var successful = false
var lengthResult : uint64 = 0
var result : ptr UncheckedArray[cstring]
var floatResult : ptr UncheckedArray[float32]

zipfs_language_detector(
  testLanguages[0],
  cuint testLanguages[1],
  cstring "the",
  addr successful,
  addr lengthResult,
  addr floatResult,
  addr result
)

echo successful
echo floatResult.toOpenArray(0, int(lengthResult)).toSeq()
echo result.toOpenArray(0, int(lengthResult)).toSeq()
]#