Zipfs-Law-Language-Detector/nim/main.nim
2024-10-10 22:37:54 -04:00

315 lines
9.3 KiB
Nim

import std/jsonutils
import std/enumerate
import db_connector/db_sqlite
import tables
import unicode
import sugar
import sequtils
import json
import unicode
import strutils
import streams
import os
import sets
import algorithm
import math
# NOTE(review): not referenced in the visible part of this file; presumably the
# number of common words considered per language - confirm against importers.
const wordCount* = 30
proc generateMostCommonWords(a: Table[string, seq[string]]): Table[string, HashSet[string]] =
  ## Turns each language's word list into a `HashSet`, keeping the table keys
  ## unchanged. Duplicate words inside one list collapse into a single entry.
  for language, wordList in a:
    result[language] = wordList.toHashSet()
const
  # Raw JSON assets, embedded into the binary at compile time by `staticRead`.
  resultText = staticRead("../data/mostCommonCharacters.json")
  charactersJson = staticRead("../data/alphabets.json")
  wikiToEnglish = staticRead("../data/wikiToEng.json")
  # Common words per language, decoded from JSON and stored as hash sets.
  mostCommonWords* = generateMostCommonWords(
    parseJson(staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
  # Runes that are never counted; built from a list of one-character strings.
  forbiddenChars = toSeq(toRunes(join(@[","], "")))
proc isUsedChar(a: Rune, allValidChars: HashSet[Rune]): (bool, Rune) =
  ## Decides whether rune `a` takes part in the character statistics.
  ##
  ## Returns `(true, r)` when the rune is kept, where `r` is the rune to count
  ## (upper-case runes are folded to lower case), and `(false, Rune(0))` when
  ## the rune is rejected.
  ##
  ## * `a` - the rune to classify.
  ## * `allValidChars` - runes accepted outside the two hard-coded ranges.
  let rejected = (false, Rune(0))
  # Forbidden runes are dropped before any other rule applies.
  if a in forbiddenChars:
    return rejected
  let codePoint = int(a)
  # U+AC00..U+D7AF (Hangul syllables) and U+4E00..U+9FFF (Hanzi) are accepted
  # without consulting `allValidChars`.
  if codePoint in 0xAC00..0xD7AF or codePoint in 0x4E00..0x9FFF:
    return (true, a)
  if a notin allValidChars:
    return rejected
  if a.isUpper():
    # No second `forbiddenChars` test is needed: `a` already passed it above.
    return (true, a.toLower())
  # A lower-case rune is also rejected when its upper-case form is forbidden.
  if a.isLower() and a.toUpper() in forbiddenChars:
    return rejected
  return (true, a)
proc createValidChars(a: JsonNode, b: Table[string, Table[Rune, float]]): HashSet[Rune] =
  ## Collects every rune that counts as a valid character.
  ##
  ## * `a` - JSON object whose values are arrays of strings; every rune of
  ##   every string is included.
  ## * `b` - per-language rune tables; all keys of the "zh" and "zh-yue"
  ##   tables are included. Both languages must be present in `b`.
  for lang in ["zh", "zh-yue"]:
    for rune in keys(b[lang]):
      result.incl rune
  for _, alphabetNode in a.pairs:
    var letters: seq[string]
    fromJson(letters, alphabetNode)
    for letter in letters:
      for rune in letter.runes:
        result.incl rune
proc jsonToRune(a: JsonNode): Table[Rune, int] =
  ## Decodes a JSON object that maps text keys to integer counts into a table
  ## keyed by the first rune of each key.
  ##
  ## Empty-string keys carry no rune and are skipped; indexing their first
  ## rune would otherwise raise `IndexDefect`. A key longer than one rune
  ## contributes only its first rune.
  var deSerialized: Table[string, int]
  fromJson(deSerialized, a)
  for key, val in deSerialized:
    if key.len == 0:
      continue
    result[key.runeAt(0)] = val
proc allJsonToRuneAbsolute(a: JsonNode): Table[string, Table[Rune, float]] =
  ## Converts raw per-language character counts into percentage shares.
  ##
  ## `a` is a JSON object mapping a language code to an object of
  ## `character -> count`. For each language, every count is expressed as a
  ## percentage of that language's total. Entries below 0.1 percent are
  ## dropped, and the remaining entries are keyed by the lower-cased first
  ## rune of the character, so upper- and lower-case shares are summed.
  for language in a.keys:
    # First pass: read all counts and their grand total.
    var counts: seq[(string, int)]
    var grandTotal = 0
    for symbol, node in a[language].pairs:
      let occurrences = node.getInt()
      counts.add((symbol, occurrences))
      grandTotal += occurrences
    # Second pass: turn counts into shares and merge letter cases.
    var shares = initTable[Rune, float]()
    for (symbol, occurrences) in counts:
      let share = (occurrences / grandTotal) * 100
      if share < 0.1:
        continue
      var rune = symbol.toRunes()[0]
      if rune.isUpper():
        rune = rune.toLower()
      shares[rune] = shares.getOrDefault(rune) + share
    result[language] = shares
const
  # Per-language character shares (in percent), computed at compile time.
  statistics* = allJsonToRuneAbsolute(parseJson(resultText))
  # Every rune accepted by `isUsedChar` outside its hard-coded ranges.
  allValidChars = createValidChars(parseJson(charactersJson), statistics)
  # Languages that have both character statistics and a common-word list.
  languages* = toSeq(statistics.keys).filterIt(it in mostCommonWords)
proc createStringSlope(a: string, runeHolder: var seq[Rune]): (Table[Rune, float], int) {.gcsafe.} =
  ## Builds the character-share profile of the text `a`.
  ##
  ## Every rune accepted by `isUsedChar` is written, in its normalised form,
  ## to consecutive slots of `runeHolder` starting at index 0. `runeHolder`
  ## must already have at least as many slots as `a` has accepted runes.
  ##
  ## Returns `(shares, kept)`: `shares` maps each accepted rune to its
  ## percentage of all accepted runes (entries below 0.1 percent are left
  ## out), and `kept` is the number of slots of `runeHolder` that were filled.
  var frequencies = initCountTable[Rune]()
  var kept = 0
  for rune in a.runes:
    let (accepted, normalized) = isUsedChar(rune, allValidChars)
    if accepted:
      frequencies.inc(normalized)
      runeHolder[kept] = normalized
      inc kept
  result[1] = kept
  # Each accepted rune added exactly one to both `frequencies` and `kept`, so
  # `kept` equals the sum of all counts.
  for rune, count in frequencies.pairs:
    let share = (count / kept) * 100
    if share >= 0.1:
      result[0][rune] = share
proc cubic(x: float): float =
  ## Evaluates the fixed cubic `0.3 * (x - (-0.33))^3 + 0.56`.
  const
    scale = 0.3
    shiftX = -0.33
    offset = 0.56
  result = scale * (x - shiftX)^3 + offset
proc neighborDistance(a: Table[Rune, float], b: Table[Rune, float]): float =
  ## Sums `cubic(|a[r] - b[r]|)` over every rune `r` that is a key of `a`.
  ##
  ## A rune missing from `b` is compared against the sentinel share -1.
  ## Runes that appear only in `b` do not contribute, so the measure is not
  ## symmetric. An empty `a` yields 0.0 (the empty sum).
  for rune, share in a:
    let other = b.getOrDefault(rune, -1.0)
    result += cubic(abs(share - other))
proc zipfsLanguageDetector*(comparisonLangs : seq[string], sample : string,
    wordCounter : TableRef[string, CountTable[string]] = nil,
    words : Table[string, HashSet[string]] = mostCommonWords) : Table[string, float] {.gcsafe.} =
  ## Scores `sample` against every language in `comparisonLangs`.
  ##
  ## Returns a table from language code to score. A score starts as a
  ## `neighborDistance` between the sample's character shares and the
  ## language's shares, and is then multiplied by 1.05 for each distinct
  ## sample rune the language lacks and by 0.7 for each common word found,
  ## so a lower score means a closer match. When the sample has no usable
  ## rune, or nothing was scored, the table holds only `"unknown": -1`.
  ##
  ## * `comparisonLangs` - language codes to test. Each must be a key of
  ##   `statistics`; otherwise the table lookup raises `KeyError`.
  ## * `sample` - the text to classify.
  ## * `wordCounter` - optional; when not nil, each common word found in the
  ##   sample prefix is counted under its language.
  ## * `words` - per-language common words used for the word bonus.
  let deNoised = comparisonLangs.map(x => statistics[x])
  # `sample.len` is a byte count, hence an upper bound on the rune count.
  var runeStr = newSeq[Rune](sample.len())
  let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
  if runeLength == 0:
    result["unknown"] = -1
    return
  # Only the first `runeLength` slots were filled. Drop the zero-valued
  # padding so it is not scored as if it were part of the sample.
  runeStr.setLen(runeLength)
  # Word search and script checks look only at the first 101 runes.
  let subsampleRunes =
    if runeLength > 100:
      runeStr[0 .. 100]
    else:
      runeStr
  let subsample = subsampleRunes.join("")
  let characters = runeStr.deduplicate()
  var potentialLanguages = toHashSet comparisonLangs
  # This is needed for identifying Chinese and not mixing it up with Japanese.
  # The counters are currently consumed only by the disabled heuristic below.
  var chineseScore = 0
  var japaneseScore = 0
  var charCount = 0
  for c in subsampleRunes:
    let val = int(c)
    if val in 0x4E00..0x9FFF:
      chineseScore += 1
    elif val in 0x3040..0x309F or val in 0x30A0..0x30FF:
      japaneseScore += 1
    charCount += 1
  #[
  if chineseScore >= floordiv(charCount, 5) and floordiv(charCount, 5) >= japaneseScore:
    potentialLanguages = toHashSet @["zh", "zh-yue"]
  elif japaneseScore >= floordiv(charCount, 4):
    potentialLanguages = toHashSet @["ja"]
  ]#
  for (language, slope) in zip(comparisonLangs, deNoised):
    # `neighborDistance` iterates over the keys of its first argument.
    # With the language profile first, we check whether each character of the
    # language is found in the sample, and vice versa.
    # If a language is logographic, we compare the sample to the language,
    # because the sample is less specialised: Wikipedia's Chinese has a lot of
    # characters that an average sample will not have.
    if language notin potentialLanguages:
      continue
    if language notin ["zh", "zh-yue", "ja"]:
      result[language] = neighborDistance(slope, stringSlope)
    else:
      result[language] = neighborDistance(stringSlope, slope)
    # Penalise every distinct sample rune the language does not use.
    for c in characters:
      if c notin slope:
        result[language] *= 1.05
    if language notin words:
      continue
    if wordCounter != nil and language notin wordCounter:
      wordCounter[language] = initCountTable[string]()
    let mostCommon = words[language]
    # TODO: Make this not be insane
    # Substring search: a common word also matches inside a longer word.
    for word in mostCommon:
      if subsample.contains(word):
        result[language] *= 0.7
        if wordCounter != nil:
          wordCounter[language].inc(word)
  if "ko" in potentialLanguages:
    if "ko" notin result:
      result["ko"] = 100
    for char in subsampleRunes:
      # The Hangul unicode block.
      # Korean has A LOT of character combinations so this is the best way.
      if int(char) notin 0xAC00..0xD7AF:
        result["ko"] *= 1.05
      else:
        result["ko"] *= 0.85
  if result.len == 0:
    result["unknown"] = -1
proc createCStringArray(a : openArray[string] | seq[string]) : (ptr UncheckedArray[cstring], uint16) =
  ## Builds a C-style array of NUL-terminated strings from `a`.
  ##
  ## Returns the array together with the index of its last element
  ## (`a.len - 1`). For an empty input, a one-slot array holding nil and the
  ## index 0 are returned. Every string is copied into its own allocation, so
  ## the pointers stay valid after `a` is gone.
  ##
  ## All memory comes from `create` and is never released here.
  ## NOTE(review): presumably the caller frees it with `dealloc` - confirm.
  let length = a.len()
  # One pointer slot per string; always allocate at least one slot.
  let slots = cast[ptr UncheckedArray[cstring]](create(cstring, max(length, 1)))
  if length == 0:
    return (slots, 0'u16)
  for idx in 0 .. a.high:
    let text = a[idx]
    # `create` zero-fills, so the extra byte is the terminating NUL.
    let buffer = cast[ptr UncheckedArray[char]](create(char, text.len + 1))
    for pos in 0 ..< text.len:
      buffer[pos] = text[pos]
    slots[idx] = cast[cstring](buffer)
  return (slots, uint16(length - 1))
proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
  ## Flattens a score table into `(language, score)` pairs ordered by
  ## ascending score.
  result = newSeqOfCap[(string, float)](oldResult.len)
  for language, score in oldResult:
    result.add((language, score))
  result.sort(proc (x, y: (string, float)): int = cmp(x[1], y[1]))
proc zipfs_language_detector*(languages : ptr UncheckedArray[cstring],
    languages_count : uint64,
    sample : cstring,
    successful : ptr bool,
    length_output : ptr uint64,
    result_buffer_float : ptr ptr UncheckedArray[float32],
    result : ptr ptr UncheckedArray[cstring]
    ) {.exportc, dynlib.} =
  ## C entry point around `zipfsLanguageDetector`.
  ##
  ## * `languages` / `languages_count` - array of language codes.
  ##   `languages_count` is used as the index of the LAST element, so
  ##   `languages_count + 1` entries are read.
  ## * `sample` - NUL-terminated text to classify.
  ## * `successful` - set to true when the last result index equals
  ##   `languages_count`.
  ## * `length_output` - receives the index of the last result.
  ## * `result_buffer_float` - receives the scores as `float32`, ascending.
  ## * `result` - receives the language codes in the same order, each a
  ##   separately allocated NUL-terminated string.
  ##
  ## Both output arrays and every string are allocated with `create` and are
  ## not released here.
  ## NOTE(review): presumably the caller frees them - confirm.
  let languages = languages.toOpenArray(0, int(languages_count)).toSeq().map(x => $x)
  let sample = $sample
  let resultPairs = makeResult zipfsLanguageDetector(languages, sample)
  length_output[] = cuint(resultPairs.high)
  successful[] = length_output[] == languages_count
  # `create` takes an element count, not a byte size: one slot per result.
  # The detector always returns at least one entry, so the count is positive.
  let names = cast[ptr UncheckedArray[cstring]](create(cstring, resultPairs.len))
  let scores = cast[ptr UncheckedArray[float32]](create(float32, resultPairs.len))
  for idx in 0 .. resultPairs.high:
    let language = resultPairs[idx][0]
    # One extra zero-filled byte serves as the terminating NUL.
    let buffer = cast[ptr UncheckedArray[char]](create(char, language.len + 1))
    for pos in 0 ..< language.len:
      buffer[pos] = language[pos]
    names[idx] = cast[cstring](buffer)
    scores[idx] = float32(resultPairs[idx][1])
  result[] = names
  result_buffer_float[] = scores
#[
let testLanguages = createCStringArray(@["sv", "en"])
var successful = false
var lengthResult : uint64 = 0
var result : ptr UncheckedArray[cstring]
var floatResult : ptr UncheckedArray[float32]
zipfs_language_detector(
testLanguages[0],
cuint testLanguages[1],
cstring "the",
addr successful,
addr lengthResult,
addr floatResult,
addr result
)
echo successful
echo floatResult.toOpenArray(0, int(lengthResult)).toSeq()
echo result.toOpenArray(0, int(lengthResult)).toSeq()
]#