315 lines
9.3 KiB
Nim
315 lines
9.3 KiB
Nim
import std/jsonutils
|
|
import std/enumerate
|
|
import db_connector/db_sqlite
|
|
import tables
|
|
import unicode
|
|
import sugar
|
|
import sequtils
|
|
import json
|
|
import unicode
|
|
import strutils
|
|
import streams
|
|
import os
|
|
import sets
|
|
import algorithm
|
|
import math
|
|
|
|
# NOTE(review): exported but not referenced anywhere else in this file;
# presumably the number of most-common words kept per language - confirm
# against whatever generates ../data/mostCommonWords.json.
const wordCount* = 30
|
|
|
|
proc generateMostCommonWords(a : Table[string, seq[string]]) : Table[string, HashSet[string]] =
  ## Turns every word list of `a` into a hash set, keeping the keys of `a`.
  ## Duplicate words inside one list collapse into a single set entry.
  for language, wordList in a:
    result[language] = wordList.toHashSet
|
|
const
  # Raw JSON payloads, read from disk while compiling.
  resultText = staticRead("../data/mostCommonCharacters.json")
  charactersJson = staticRead("../data/alphabets.json")
  wikiToEnglish = staticRead("../data/wikiToEng.json")
  # Per-language word lists, converted to hash sets at compile time.
  mostCommonWords* = generateMostCommonWords(
    parseJson(staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
  # Runes that isUsedChar always rejects.
  forbiddenChars = toRunes(@[","].join("")).toSeq()
|
|
|
|
proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
  ## Decides whether rune `a` should be counted.
  ##
  ## Returns `(true, r)` where `r` is the rune to count (upper-case runes are
  ## lower-cased), or `(false, Rune(0))` when `a` must be skipped.
  const rejected = (false, Rune(0))

  if a in forbiddenChars:
    return rejected

  # Hangul syllables (0xAC00..0xD7AF) and Hanzi (0x4E00..0x9FFF) are accepted
  # as they are, without consulting `allValidChars`.
  let code = int(a)
  if code in 0xAC00..0xD7AF or code in 0x4E00..0x9FFF:
    return (true, a)

  if a notin allValidChars:
    return rejected

  # `a` itself was already checked against forbiddenChars above, so an
  # upper-case rune only needs case folding here.
  if a.isUpper():
    return (true, a.toLower())

  # A lower-case rune is dropped when its upper-case form is forbidden.
  if a.isLower() and a.toUpper() in forbiddenChars:
    return rejected

  return (true, a)
|
|
|
|
proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, float]]) : HashSet[Rune] =
  ## Collects the set of runes that count as valid characters.
  ##
  ## The set holds every rune key of `b["zh"]` and `b["zh-yue"]`, plus every
  ## rune found in the string lists stored under each key of the JSON object
  ## `a`. Raises `KeyError` when `b` lacks "zh" or "zh-yue".
  for extraLang in ["zh", "zh-yue"]:
    for rune in b[extraLang].keys:
      result.incl rune

  for _, alphabetNode in a.pairs:
    var letters : seq[string]
    letters.fromJson(alphabetNode)
    for rune in runes(letters.join("")):
      result.incl rune
|
|
|
|
proc jsonToRune(a : JsonNode) : Table[Rune, int] =
  ## Deserialises a JSON object of `string -> int` and re-keys it by the
  ## first rune of every key. Keys sharing a first rune overwrite each other.
  var counts : Table[string, int]
  counts.fromJson(a)
  for text, count in counts:
    result[text.toRunes()[0]] = count
|
|
|
|
proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, float]] =
  ## Converts `{language: {character: count}}` JSON into per-language tables
  ## of percentages keyed by rune.
  ##
  ## For every language the counts are turned into percentages of that
  ## language's total. Entries below 0.1 percent are dropped. Upper-case
  ## runes are folded into their lower-case form and their shares are added.
  for language, languageTable in a.pairs:
    # Read the counts once so the total and the shares use the same values.
    var counts : seq[(string, int)]
    var total = 0
    for text, node in languageTable.pairs:
      let count = node.getInt()
      counts.add((text, count))
      total += count

    var shares = initTable[Rune, float]()
    for (text, count) in counts:
      let percentage = (count / total) * 100
      if 0.1 > percentage:
        continue
      # Only the first rune of the key is used.
      let first = text.toRunes()[0]
      let folded = if first.isUpper(): first.toLower() else: first
      shares[folded] = shares.getOrDefault(folded) + percentage

    result[language] = shares
|
|
|
|
|
|
const
  # Per-language character percentages, computed at compile time.
  statistics* = allJsonToRuneAbsolute(parseJson(resultText))
  # Runes accepted by isUsedChar (alphabets plus the zh / zh-yue characters).
  allValidChars = createValidChars(parseJson(charactersJson), statistics)
  # Languages that have both character statistics and a common-word list.
  languages* = toSeq(statistics.keys).filterIt(it in mostCommonWords)
|
|
|
|
proc createStringSlope(a : string, runeHolder : var seq[Rune]) : (Table[Rune, float], int) {.gcsafe.} =
  ## Builds the character percentage profile of `a`.
  ##
  ## Every rune accepted by `isUsedChar` is counted and written, in order,
  ## into `runeHolder` starting at index 0. `runeHolder` is indexed, not
  ## grown, so it must already have room for all accepted runes.
  ##
  ## Returns the profile (rune -> percentage of accepted runes, entries below
  ## 0.1 percent dropped) and the number of runes written to `runeHolder`.
  var counts = initCountTable[Rune]()
  var kept = 0

  for rune in a.runes:
    let (accepted, normalized) = isUsedChar(rune, allValidChars)
    if accepted:
      counts.inc(normalized)
      runeHolder[kept] = normalized
      inc kept

  result[1] = kept

  var total = 0
  for count in counts.values:
    total += count

  for rune, count in counts:
    let percentage = (count / total) * 100
    if not (0.1 > percentage):
      result[0][rune] = percentage
|
|
|
|
proc cubic(x : float) : float =
  ## Evaluates the shifted cubic `0.3 * (x + 0.33)^3 + 0.56`.
  const
    scale = 0.3
    shiftX = -0.33
    shiftY = 0.56
  let shifted = x - shiftX
  result = scale * shifted^3 + shiftY
|
|
|
|
proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
  ## Sums `cubic(|a[r] - b[r]|)` over every rune `r` that is a key of `a`.
  ##
  ## A rune missing from `b` is treated as having the value -1 there. Keys
  ## that only exist in `b` are ignored, so the measure is not symmetric.
  ## Returns 0.0 when `a` is empty (the previous fold over an empty buffer
  ## failed in that case).
  for rune, share in a:
    let other = b.getOrDefault(rune, -1.0)
    result += cubic(abs(share - other))
|
|
|
|
proc zipfsLanguageDetector*(comparisonLangs : seq[string], sample : string,
                            wordCounter : TableRef[string, CountTable[string]] = nil,
                            words : Table[string, HashSet[string]] = mostCommonWords) : Table[string, float] {.gcsafe.} =
  ## Scores `sample` against every language in `comparisonLangs`.
  ##
  ## The score of a language starts as the `neighborDistance` between its
  ## character profile in `statistics` and the profile of `sample`. It is
  ## multiplied by 1.05 for each distinct sample rune missing from the
  ## language profile and by 0.7 for each entry of `words[language]` found in
  ## the first runes of the sample. "ko" is additionally scaled per rune
  ## depending on whether the rune lies in the Hangul range.
  ##
  ## - `comparisonLangs`: language codes to score. A code without an entry in
  ##   `statistics` raises `KeyError`.
  ## - `wordCounter`: optional; when not nil, every matched word is counted
  ##   under its language.
  ## - `words`: common words per language; languages without an entry skip
  ##   the word step.
  ##
  ## Returns language -> score, or `{"unknown": -1}` when no rune of `sample`
  ## is usable or nothing was scored.
  let deNoised = comparisonLangs.map(x => statistics[x])
  var runeStr = newSeq[Rune](sample.len())
  let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
  if runeLength == 0:
    result["unknown"] = -1
    return

  # The buffer was sized by the byte length of `sample`, but only the first
  # `runeLength` slots were filled. Drop the Rune(0) padding so it is not
  # scored as if it were part of the text.
  runeStr.setLen(runeLength)

  #echo sample
  #echo stringSlope
  #echo "==="

  # Word and script checks only look at the start of the sample.
  # The slice is inclusive, so it holds 101 runes.
  let subsampleRunes =
    if runeLength > 100:
      runeStr[0 .. 100]
    else:
      runeStr

  let subsample = subsampleRunes.join("")

  let characters = runeStr.deduplicate()
  var potentialLanguages = toHashSet comparisonLangs

  # This is needed for identifying Chinese and not mixing it up with Japanese.
  # The counters are only consumed by the disabled block below.
  var chineseScore = 0
  var japaneseScore = 0
  var charCount = 0
  for c in subsampleRunes:
    let val = int(c)
    if val in 0x4E00..0x9FFF:
      chineseScore += 1
    elif val in 0x3040..0x309F or val in 0x30A0..0x30FF:
      japaneseScore += 1
    charCount += 1

  #[
  if chineseScore >= floordiv(charCount, 5) and floordiv(charCount, 5) >= japaneseScore:
    potentialLanguages = toHashSet @["zh", "zh-yue"]

  elif japaneseScore >= floordiv(charCount, 4):
    potentialLanguages = toHashSet @["ja"]
  ]#

  for (language, slope) in zip(comparisonLangs, deNoised):
    if language notin potentialLanguages:
      continue

    # neighborDistance walks the keys of its first argument. Normally that is
    # the language profile, so each character of the language is looked up in
    # the sample. For logographic languages the order is flipped: the
    # reference profile has far more characters than an average sample, so
    # the sample is compared against the language instead.
    var score =
      if language in ["zh", "zh-yue", "ja"]:
        neighborDistance(stringSlope, slope)
      else:
        neighborDistance(slope, stringSlope)

    # Penalise every distinct sample rune the language profile does not know.
    for c in characters:
      if c notin slope:
        score *= 1.05

    if language in words:
      if wordCounter != nil and language notin wordCounter:
        wordCounter[language] = initCountTable[string]()

      #TODO: Make this not be insane
      for word in words[language]:
        if subsample.contains(word):
          score *= 0.7
          if wordCounter != nil:
            wordCounter[language].inc(word)

    result[language] = score

  if "ko" in potentialLanguages:
    if "ko" notin result:
      result["ko"] = 100
    # Korean has A LOT of character combinations, so membership in the Hangul
    # unicode block is checked per rune instead.
    for rune in subsampleRunes:
      if int(rune) notin 0xAC00..0xD7AF:
        result["ko"] *= 1.05
      else:
        result["ko"] *= 0.85

  if result.len == 0:
    result["unknown"] = -1
|
|
|
|
proc createCStringArray(a : openArray[string] | seq[string]) : (ptr UncheckedArray[cstring], uint16) =
  ## Builds a manually allocated array of `cstring` with one entry per
  ## element of `a`.
  ##
  ## Returns the array and the index of its last element (`a.len - 1`). For
  ## empty input a one-slot zeroed array and 0 are returned.
  ##
  ## The entries are `cstring` conversions of the elements of `a`, not
  ## copies, so they are only usable while the strings in `a` are alive.
  ## Nothing in this proc frees the array.
  let length = a.len
  if length == 0:
    return (cast[ptr UncheckedArray[cstring]](create(cstring, 1)), uint16(0))

  # One slot per string. The slot count must not depend on how long the
  # strings are, otherwise short or empty strings overflow the array.
  let newArray = cast[ptr UncheckedArray[cstring]](create(cstring, length))
  for i in 0 .. a.high:
    newArray[i] = cstring a[i]
  return (newArray, uint16(length - 1))
|
|
|
|
|
|
proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
  ## Flattens `oldResult` into `(language, score)` pairs sorted by ascending
  ## score.
  for language, score in oldResult:
    result.add((language, score))
  result.sort(proc (x, y : (string, float)) : int = cmp(x[1], y[1]))
|
|
|
|
proc zipfs_language_detector*(languages : ptr UncheckedArray[cstring],
                              languages_count : uint64,
                              sample : cstring,
                              successful : ptr bool,
                              length_output : ptr uint64,
                              result_buffer_float : ptr ptr UncheckedArray[float32],
                              result : ptr ptr UncheckedArray[cstring]
                              ) {.exportc, dynlib.} =
  ## C entry point around `zipfsLanguageDetector`.
  ##
  ## - `languages`, `languages_count`: input language codes. Entries
  ##   `0 .. languages_count` are read inclusively, so `languages_count` is
  ##   the index of the last entry, not the number of entries.
  ## - `sample`: text to classify.
  ## - `length_output`: receives the index of the last result entry.
  ## - `successful`: receives whether that index equals `languages_count`.
  ## - `result`: receives an array of NUL-terminated language codes, ordered
  ##   by ascending score.
  ## - `result_buffer_float`: receives the scores, in the same order.
  ##
  ## All output buffers are allocated with `create`; nothing in this proc
  ## frees them.
  let languages = languages.toOpenArray(0, int(languages_count)).toSeq().map(x => $x)
  let sample = $sample
  let resultPairs = makeResult(zipfsLanguageDetector(languages, sample))

  length_output[] = uint64(resultPairs.high)
  successful[] = length_output[] == languages_count

  # `create(T, n)` already allocates n elements of T, so both arrays are
  # sized by the number of results. The max() keeps the size positive.
  let slots = max(resultPairs.len, 1)
  let names = cast[ptr UncheckedArray[cstring]](create(cstring, slots))
  let scores = cast[ptr UncheckedArray[float32]](create(float32, slots))

  for i, pair in resultPairs:
    let language = pair[0]
    # len + 1 zeroed bytes: room for the whole text plus the terminating NUL.
    let buffer = create(byte, language.len + 1)
    if language.len > 0:
      copyMem(buffer, addr language[0], language.len)
    names[i] = cast[cstring](buffer)
    scores[i] = float32(pair[1])

  result[] = names
  result_buffer_float[] = scores
|
|
|
|
#[
|
|
let testLanguages = createCStringArray(@["sv", "en"])
|
|
var successful = false
|
|
var lengthResult : uint64 = 0
|
|
var result : ptr UncheckedArray[cstring]
|
|
var floatResult : ptr UncheckedArray[float32]
|
|
|
|
zipfs_language_detector(
|
|
testLanguages[0],
|
|
cuint testLanguages[1],
|
|
cstring "the",
|
|
addr successful,
|
|
addr lengthResult,
|
|
addr floatResult,
|
|
addr result
|
|
)
|
|
|
|
echo successful
|
|
echo floatResult.toOpenArray(0, int(lengthResult)).toSeq()
|
|
echo result.toOpenArray(0, int(lengthResult)).toSeq()
|
|
]#
|