Optimizations and added Khmer
parent db3c8d5ccd · commit 3836396ba3
9 changed files with 545 additions and 450 deletions
@@ -211,6 +211,32 @@
     "ﻭ",
     "ﻳ"
 ],
+"": [
+    "у", "г", "р", "о", "ф", "и", "н", "с", "к", "ј", "е", "з",
+    "ц", "д", "м", "т", "л", "а", "в", "х", "џ", "ш", "ч", "п"
+],
 "extended": [
     "À",
     "Á",
@@ -690,5 +716,337 @@
     "Э",
     "Ю",
     "Я"
 ],
+"Japanese": [
+    "あ", "い", "う", "え", "お", "か", "き", "く", "け", "こ",
+    "さ", "し", "す", "せ", "そ", "た", "ち", "つ", "て", "と",
+    "な", "に", "ぬ", "ね", "の", "は", "ひ", "ふ", "へ", "ほ",
+    "ま", "み", "む", "め", "も", "や", "ゆ", "よ",
+    "ら", "り", "る", "れ", "ろ", "わ", "を", "ん",
+    "が", "ぎ", "ぐ", "げ", "ご", "ざ", "じ", "ず", "ぜ", "ぞ",
+    "だ", "ぢ", "づ", "で", "ど", "ば", "び", "ぶ", "べ", "ぼ",
+    "ぱ", "ぴ", "ぷ", "ぺ", "ぽ",
+    "ア", "イ", "ウ", "エ", "オ", "カ", "キ", "ク", "ケ", "コ",
+    "サ", "シ", "ス", "セ", "ソ", "タ", "チ", "ツ", "テ", "ト",
+    "ナ", "ニ", "ヌ", "ネ", "ノ", "ハ", "ヒ", "フ", "ヘ", "ホ",
+    "マ", "ミ", "ム", "メ", "モ", "ヤ", "ユ", "ヨ",
+    "ラ", "リ", "ル", "レ", "ロ", "ワ", "ヲ", "ン",
+    "ャ", "ュ", "ョ", "ゃ", "ゅ", "ょ"
+],
+"Indian": [
+    "ऄ", "अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ऌ", "ऍ", "ऎ", "ए",
+    "ऐ", "ऑ", "ऒ", "ओ", "औ", "क", "ख", "ग", "घ", "ङ", "च", "छ",
+    "ज", "झ", "ञ", "ट", "ठ", "ड", "ढ", "ण", "त", "थ", "द", "ध",
+    "न", "ऩ", "प", "फ", "ब", "भ", "म", "य", "र", "ऱ", "ल", "ळ",
+    "ऴ", "व", "श", "ष", "स", "ह", "ऽ", "क़", "ख़", "ग़", "ज़", "ड़",
+    "ढ़", "फ़", "य़", "ॠ", "ॡ", "३", "४", "५", "६", "७", "८", "९"
+],
+"Lao": [
+    "ກ", "ຂ", "ຄ", "ງ", "ຈ", "ຉ", "ຊ", "ຍ", "ຎ", "ຏ", "ຐ", "ຑ",
+    "ຒ", "ຓ", "ດ", "ຝ", "ຟ", "ຠ", "ມ", "ຢ", "ຣ", "", "ລ", "",
+    "ວ", "ຨ", "ຩ", "ສ", "ອ", "ຬ"
+],
+"Odia": [
+    "ଅ", "ଆ", "ଇ", "ଉ", "ଋ", "ୠ", "ଌ", "୪", "ଏ", "ଐ", "ଓ", "ଔ",
+    "କ", "ଖ", "ଗ", "ଘ", "ଙ", "ଚ", "ଛ", "ଜ", "ଝ", "ଞ", "ଟ", "ଠ",
+    "ଡ", "ଢ", "ଣ", "ତ", "ଥ", "ଦ", "ଧ", "ନ", "ପ", "ଫ", "ବ", "ଭ",
+    "ମ", "ଯ", "ର", "ଲ", "ୱ", "ଶ", "ଷ", "ସ", "ହ"
+],
+"Khmer": [
+    "ក", "ខ", "គ", "ឃ", "ង", "ច", "ឆ", "ជ", "ឈ", "ញ", "ដ", "ឋ",
+    "ឌ", "ឍ", "ណ", "ត", "ថ", "ទ", "ធ", "ន", "ប", "ផ", "ព", "ភ",
+    "ម", "យ", "រ", "ល", "វ", "ឝ", "ឞ", "ស", "ហ", "ឡ", "អ", "ឣ",
+    "ឤ", "ឥ", "ឦ", "ឧ", "ឨ", "ឩ", "ឪ", "ឫ", "ឬ", "ឭ", "ឮ", "ឯ",
+    "ឰ", "ឱ", "ឲ", "ឳ"
+]
 }
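The five script lists above (Japanese kana, Devanagari, Lao, Odia, and the Khmer set that gives this commit its name) are consumed at compile time: nim/main.nim embeds the file with staticRead and parses it with parseJson, so adding a key to the JSON is all it takes to teach the detector a new script. A minimal sketch of that pattern, assuming the same relative path main.nim uses:

    import std/json

    # The JSON is baked into the binary at compile time; no runtime file I/O.
    const alphabetsText = staticRead("../data/alphabets.json")
    let alphabets = parseJson(alphabetsText)
    echo alphabets["Khmer"].len  # how many Khmer characters were just added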
File diff suppressed because one or more lines are too long
graph.py (2 changes)
@@ -93,7 +93,7 @@ for i, char in enumerate(alphabet):

 array = np.empty(0, dtype=float)
 characters = []
-langs = ["be", "ru", "uk", "kk"]
+langs = ["en", "id"]

 charts = []
 for lang in langs:
nim/main (binary file not shown)
nim/main.nim (208 changes)
@@ -13,27 +13,39 @@ import os
 import sets
 import algorithm

+proc wtvr(a : Table[string, seq[string]]) : Table[string, HashSet[string]] =
+  for (key, val) in a.pairs:
+    result[key] = toHashSet val
+
 const resultText = staticRead("../data/mostCommonCharacters.json")
 const charactersJson = staticRead("../data/alphabets.json")
 const wikiToEnglish = staticRead("../data/wikiToEng.json")
-const mostCommonWords = (parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]])
-const forbiddenChars = @["A", "O", "I", "E", "U", "Ё", "Y"].join("").toRunes().toSeq()
+const mostCommonWords = wtvr((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
+const forbiddenChars = @["9"].join("").toRunes().toSeq()

-proc isUsedChar(a : Rune) : bool =
+proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
+  if a notin allValidChars:
+    return (false, Rune(0))
   #Hangul, unused
   if int(a) in 0xAC00..0xD7AF:
-    return true
+    return (true, a)
   #Hanzi
   if int(a) in 0x4E00..0x9FFF:
-    return true
-  if a.toUpper() in forbiddenChars:
-    return false;
-  if a.size == 1:
-    #if latin
-    return a.isUpper()
-  return not a.isLower()
+    return (true, a)
+  if a.isUpper():
+    if a in forbiddenChars:
+      return (false, Rune(0))
+    else:
+      return (true, a.toLower())
+  if a.isLower():
+    if a.toUpper() in forbiddenChars:
+      return (false, Rune(0))
+
+  return (true, a)

-proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, int]]) : HashSet[Rune] =
+proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, float]]) : HashSet[Rune] =
   let extraCharacterLangs = @["zh", "zh-yue"]
   for lang in extraCharacterLangs:
     for key in b[lang].keys:
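Two details worth flagging in the hunk above. The Unicode ranges in isUsedChar are the Hangul Syllables block (U+AC00..U+D7AF) and the CJK Unified Ideographs block (U+4E00..U+9FFF), which is why Korean and Chinese characters can short-circuit the case checks. And the new wtvr proc is where part of the promised optimization lives: mostCommonWords becomes a Table[string, HashSet[string]], turning later word-membership tests from linear scans of a seq into average O(1) hash lookups. A minimal sketch of that conversion with toy data (the real lists come from mostCommonWords.json):

    import std/[sets, tables]

    # Same shape as wtvr: seq[string] word lists become HashSets so that
    # `word in words` is a hash lookup instead of a linear scan.
    let raw = {"en": @["the", "and", "of"], "id": @["yang", "dan", "di"]}.toTable
    var fast: Table[string, HashSet[string]]
    for key, val in raw.pairs:
      fast[key] = toHashSet val

    assert "dan" in fast["id"]    # O(1) on average
    assert "dan" notin fast["en"]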
@@ -51,111 +63,110 @@ proc jsonToRune(a : JsonNode) : Table[Rune, int] =
     let rune = key.toRunes()[0]
     result[rune] = val

-proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, int]] =
-  for key, val in a.pairs:
-    result[key] = jsonToRune val
-  for key, val in result.pairs:
-    let total = val.values.toSeq().foldl(a+b)
-    for key1, val1 in val.pairs:
-      if 0.1 > (val1 / total) * 100:
-        result[key].del(key1)
+proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, float]] =
+  for key in a.keys:
+    var tableBuilder = initTable[Rune, float]()
+
+    let languageTable = a[key]
+    var pairs = languageTable.pairs.toSeq().map(x => (x[0], x[1].getInt()))
+    var total = 0
+    for (key, val) in pairs:
+      total += val
+
+    for (key, val) in pairs:
+      let percentage = (val / total) * 100
+      if 0.1 > percentage:
+        continue
+      let rune = key.toRunes()[0]
+      let charAdd =
+        if rune.isUpper():
+          rune.toLower()
+        else:
+          rune
+      if charAdd in tableBuilder:
+        tableBuilder[charAdd] += percentage
+      else:
+        tableBuilder[charAdd] = percentage
+
+    result[key] = tableBuilder

 when not defined(release) or not defined(danger):
-  let absoluteCounts = allJsonToRuneAbsolute parseJson(resultText)
-  let allValidChars = createValidChars(parseJson(charactersJson), absoluteCounts)
-  let usedCharacters = toHashSet allValidChars.toSeq().filter(x => isUsedChar x)
-  let languages* = absoluteCounts.keys.toSeq().filter(x => x in mostCommonWords)
+  let statistics = allJsonToRuneAbsolute parseJson(resultText)
+  let allValidChars = createValidChars(parseJson(charactersJson), statistics)
+  let languages* = statistics.keys.toSeq().filter(x => x in mostCommonWords)
 else:
-  const absoluteCounts = allJsonToRuneAbsolute parseJson(resultText)
-  const allValidChars = createValidChars(parseJson(charactersJson), absoluteCounts)
-  const usedCharacters = toHashSet allValidChars.toSeq().filter(x => isUsedChar x)
-  const languages* = absoluteCounts.keys.toSeq().filter(x => x in mostCommonWords)
+  const statistics = allJsonToRuneAbsolute parseJson(resultText)
+  const allValidChars = createValidChars(parseJson(charactersJson), statistics)
+  const languages* = statistics.keys.toSeq().filter(x => x in mostCommonWords)

+proc createStringSlope(a : string, runeHolder : var seq[Rune]) : (Table[Rune, float], int) =
+  let stringRunes = a.toRunes()
+  var stepOne = initCountTable[Rune]()
+  var runeLength = 0
+
+  for (i, char) in enumerate stringRunes:
+    let isUsed = isUsedChar(char, allValidChars)
+    if not isUsed[0]: continue
+    stepOne.inc(isUsed[1])
+    runeHolder[i] = isUsed[1]
+    runeLength += 1
+
+  result[1] = runeLength
+
+  var total = 0
+  var pairs = stepOne.pairs.toSeq()
+  for (key, val) in pairs:
+    total += val
+
+  for (key, val) in pairs:
+    let percentage = (val / total) * 100
+    if 0.1 > percentage:
+      continue
+    result[0][key] = percentage

-proc createStatisticalTable(a : Table[Rune, int]) : Table[Rune, float] =
-  let total = a.values.toSeq().foldl(a+b)
-  let extraCharacters = toHashSet a.keys.toSeq().filter(x => isUsedChar x)
-
-  for character in allValidChars + extraCharacters:
-    let percentage =
-      if character in a:
-        (a[character] / total)*100
-      else:
-        0.0
-    let charAdd =
-      if character.isLower():
-        character.toUpper()
-      else:
-        character
-    if charAdd in result:
-      result[charAdd] += percentage
-    else:
-      result[charAdd] = percentage
-
-proc allJsonToRuneStatistic(a : JsonNode) : Table[string, Table[Rune, float]] =
-  for key, val in a.pairs:
-    result[key] = createStatisticalTable jsonToRune val
 #proc getLangWords(lang : string) : HashSet[string] =
 #  for word in db.instantRows(sql"select * from words where language = ?", lang):
 #    result.incl(word[1])

-proc createStringSlope(a : string) : Table[Rune, float] =
-  var stepOne = initTable[Rune, int]()
-  let extraCharacters = toHashSet a.toRunes().filter(x => isUsedChar x)
-  let validExtra = allValidChars + extraCharacters
-  for char in a.toRunes():
-    if char in validExtra:
-      if char notin stepOne:
-        stepOne[char] = 1
-      else:
-        stepOne[char] += 1
-  result = createStatisticalTable stepOne
-
-proc reduceNoise(a : seq[Table[Rune, float]]) : seq[Table[Rune, float]] =
-  result.setLen(a.len())
-  for char in usedCharacters:
-    let collected = collect(for i in a: i[char])
-    if collected.any(x => x > 0.1):
-      for i in 0 .. a.high:
-        result[i][char] = a[i][char]

 proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
   var resultBuffer : seq[float]
   for char in a.keys:
-    resultBuffer.add(abs(a[char]-b[char]))
+    let distance =
+      if char in b:
+        b[char]
+      else:
+        -10
+    resultBuffer.add(abs(a[char]-distance))
   return resultBuffer.foldl(a+b)

-when not defined(release) or not defined(danger):
-  let statistics = allJsonToRuneStatistic parseJson(resultText)
-else:
-  const statistics = allJsonToRuneStatistic parseJson(resultText)

 proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
-  var deNoised = reduceNoise(comparisonLangs.map(x => statistics[x]) & createStringSlope sample)
-  let stringSlope = deNoised[^1]
-  deNoised = deNoised[0 .. ^2]
-  var i = 0
-  var runeStr : seq[Rune]
-  for i, rune in enumerate toRunes sample:
-    let char =
-      if rune.isUpper():
-        rune.toLower()
-      else:
-        rune
-    runeStr.add(char)
-
-  let sample = runeStr.join("")
+  let deNoised = comparisonLangs.map(x => statistics[x])
+  var runeStr = newSeq[Rune](sample.len())
+  let (stringSlope, runeLength) = createStringSlope(sample, runeStr)

   let subsample =
     if sample.high > 100:
       sample[0 .. 100]
     else:
       sample

-  let distinctCharacters = sample.toRunes().deduplicate().map(x => x.toUpper()).filter(x => isUsedChar(x))
+  let characters = sample.toRunes().deduplicate().map(x => isUsedChar(x, allValidChars)).map(x => x[1])
+  let distinctCharacters = characters.filter(x => int(x) != 0)

   for (language, slope) in zip(comparisonLangs, deNoised):
-    result[language] = neighborDistance(slope, stringSlope)
+    #We check based on the keys in each language:
+    #if we put the slope first, we check whether each char in the language is found in the slope,
+    #and vice versa.
+    #If a language is logographic, we should compare the sample to the language,
+    #because it is less specialized: Wikipedia's Chinese has a lot of characters
+    #that your average sample will not have.
+    if language notin ["zh", "zh-yue"]:
+      result[language] = neighborDistance(slope, stringSlope)
+    else:
+      result[language] = neighborDistance(stringSlope, slope)

     if language notin mostCommonWords:
       continue
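The neighborDistance change above is the behavioural core of this hunk: the distance is an L1 sum of absolute differences over per-character percentages, and a character present in a but absent from b now contributes abs(a[char] - (-10)) instead of raising a KeyError, so missing characters are penalized rather than fatal. A toy re-derivation with hypothetical percentages (l1 here is an illustrative stand-in, not the repo's proc):

    import std/[tables, unicode]

    # Mimics neighborDistance's shape: iterate a's keys and substitute a
    # flat -10 whenever the other table lacks the character.
    proc l1(a, b: Table[Rune, float]): float =
      for ch in a.keys:
        let other = if ch in b: b[ch] else: -10.0
        result += abs(a[ch] - other)

    let langSlope = {"a".runeAt(0): 40.0, "b".runeAt(0): 60.0}.toTable
    let sampleSlope = {"a".runeAt(0): 50.0}.toTable
    # |40 - 50| + |60 - (-10)| = 80: the absent "b" dominates the score.
    assert l1(langSlope, sampleSlope) == 80.0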
@@ -171,9 +182,9 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
       continue
     #if it's this low, it's probably irredeemable
     if result[language] >= 1000: break
-    let notWithin = c notin absoluteCounts[language]
+    let notWithin = c notin statistics[language]
     if notWithin:
-      result[language] *= 1.1
+      result[language] *= 1.2
     scriptBuffer.add(notWithin)
   if scriptBuffer.high != -1 and scriptBuffer.all(x => x):
     result[language] *= 1000
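The penalty retune above compounds multiplicatively: every distinct sample character missing from a language's statistics multiplies that language's distance by 1.2 (previously 1.1), and if every character is missing, the flat *= 1000 effectively eliminates the language. Rough arithmetic on how fast the new factor grows:

    import std/math

    # Ten unseen characters now cost a factor of 1.2^10 ~= 6.19 on the
    # distance, where the old 1.1 factor gave only 1.1^10 ~= 2.59.
    echo pow(1.2, 10.0)  # 6.1917364224
    echo pow(1.1, 10.0)  # 2.5937424601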
@@ -187,10 +198,7 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
     result["ko"] *= 1.05
   else:
     result["ko"] *= 0.95

 proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
   result = oldResult.pairs().toSeq()
   result.sort((x,y) => cmp(x[1], y[1]))

-let sample = "蝴蝶係一種完全變態嘅昆蟲,即係話一隻蝴蝶一世蟲會經過膥、幼蟲、蛹同埋成蟲四個階段:一隻大咗肚嘅蝴蝶乸會喺啲植物嘅葉上面產卵;跟手啲幼蟲(毛蟲)孵咗出嚟之後就會靠食嗰啲葉嚟維生,啲幼蟲生到咁上下就會結蛹;當變態嘅過程完成咗之後,個蛹會爆開,隻成蟲(蝴蝶)就會由個蛹嗰度捐出嚟;等兩對翼乾咗之後,佢就會飛去搵嘢食同伴侶;交配完咗之後,啲蝴蝶乸就會產卵;而無論公定乸,蝴蝶通常喺交配嘅過程完咗之後冇幾耐就會死。佢哋嘅下一代跟住就會由頭噉經歷過呢個由生到死嘅過程。呢個過程做一次要幾耐係睇物種嘅:熱帶嗰頭啲蝴蝶物種好多時一年閒閒地生成兩三代咁多,而响凍啲地區嘅蝴蝶物種就好多時就要成幾年先至生到一代"
-echo statistics["zh"]
-echo makeResult doThing(languages & @["zh-yue"], sample)
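makeResult sorts the (language, distance) pairs ascending, so the lowest distance, i.e. the best match, lands at index 0; the test harness's result[0][0] == row[0] check relies on exactly that ordering. A small sketch with made-up scores:

    import std/[algorithm, sequtils, sugar, tables]

    # Same sort makeResult uses: lower distance ranks first.
    let distances = {"ru": 42.0, "zh-yue": 3.5, "en": 17.0}.toTable
    var ranked = distances.pairs().toSeq()
    ranked.sort((x, y) => cmp(x[1], y[1]))
    assert ranked[0][0] == "zh-yue"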
nim/tests (binary file not shown)
@@ -4,6 +4,8 @@ import sequtils
 import sugar
 import tables
+import times
+import pretty

 let db = open("../data/testing/testingData.db", "", "", "")
 type Accuracy = object
   correct : int

@@ -16,8 +18,7 @@ var results = initTable[string, Accuracy]()
 for lang in main.languages:
   results[lang] = Accuracy()

-for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'en'"):
+let t1 = cpuTime()
+for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'is'"):
   try:
     let result = makeResult doThing(main.languages, row[1])
     if result[0][0] == row[0]:

@@ -27,6 +28,5 @@ for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'en'"):
     results[row[0]].languagesConfusedFor.inc(result[0][0])
   except:
     results[row[0]].faliures+=1
+echo cpuTime()-t1

-echo results
+print results
@@ -7,10 +7,11 @@ use std::collections::HashSet;
 use std::thread::available_parallelism;
 use std::fs::DirEntry;
 use std::io::prelude::*;
-use rusqlite::{Connection, Result};
+use rusqlite::{Connection};
 use std::sync::atomic::{AtomicU32,Ordering};
 use rand::{thread_rng, Rng};
 use std::fs;
+use std::env;

 fn gen_chars() -> HashSet<char>{
     let json = std::fs::read_to_string("../data/alphabets.json").unwrap();

@@ -22,7 +23,7 @@ fn gen_chars() -> HashSet<char>{
     return chars
 }

-fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashMap<String, u64>, HashMap<char, u64>){
+fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool, language : &str) -> (HashMap<String, u64>, HashMap<char, u64>){
     let mut map : HashMap<char, u64> = HashMap::new();
     let mut word_map : HashMap<String, u64> = HashMap::new();

@@ -31,6 +32,7 @@ fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashMap<String, u64>, HashMap<char, u64>){
     let mut iter = reader.get_row_iter(None).unwrap();
     const MAX_WORD_LENGTH : usize = 22;
     let skippable_chars : Vec<char> = vec![',', '.', '!', '?', '\n', '\\', '\'', '"', ';', '<', '>'];
+    let chinese_langs : Vec<&str> = vec!["zh", "zh-yue"];
     while let Some(record) = iter.next() {
         if record.is_err(){ continue; }
         let mut array: [char; MAX_WORD_LENGTH] = ['0'; MAX_WORD_LENGTH];

@@ -74,9 +76,19 @@ fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashMap<String, u64>, HashMap<char, u64>){
                 };
             }
         }
-        if chars.contains(&chary){
-            *map.entry(chary).or_insert(0) += 1;
-        };
+        if !chinese_langs.iter().any(|x| x == &language){
+            if chars.contains(&chary){
+                *map.entry(chary).or_insert(0) += 1;
+            };
+        }
+        else{
+            let c_code = chary as u32;
+            let range = 0x4E00..0x9FFF;
+            if range.contains(&c_code){
+                *map.entry(chary).or_insert(0) += 1;
+            }
+        }
     }
     }
     return (word_map, map);

@@ -114,10 +126,43 @@ fn get_wikipedia_paths() -> (HashSet<String>,HashMap<String, Vec<String>>) {
     return (languages, language_paths)
 }

+struct Actions{
+    gen_words : bool,
+    gen_test_data : bool,
+}
+
+fn gen_actions() -> Actions {
+    let mut result = Actions{
+        gen_words : false,
+        gen_test_data : false
+    };
+    let args: Vec<String> = env::args().collect();
+    let length = args.len();
+    if length == 1{
+        return result;
+    }
+
+    for arg in args[1 .. length].iter(){
+        println!("{}", arg);
+        if arg == "--chars" {
+            result.gen_words = true
+        }
+        else if arg == "--test_data" {
+            result.gen_test_data = true
+        }
+        else{
+            panic!("UNKNOWN ARG")
+        }
+    }
+    return result;
+}
+
 fn main(){
+    let actions = gen_actions();
     //These languages don't have spaces and thus flood memory, and they don't make any sense to analyze in this way
     generate_data();
     panic!();
+    if actions.gen_test_data{
+        generate_data();
+    }
     let cpu_count = available_parallelism().unwrap().get()*2;
     let blacklisted_languages: Vec<String> =
         vec![

@@ -156,14 +201,21 @@ fn main(){
         let mut char_occurrences : HashMap<char, u64> = HashMap::new();
         let mut word_occurrences : HashMap<String, u64> = HashMap::new();
         let paquet_paths = paths.get(&lang).unwrap();
-        let do_words = !b_lang.iter().any(|x| x == &lang);
+        let do_words : bool;
+        if actions.gen_words {
+            do_words = !b_lang.iter().any(|x| x == &lang);
+        }
+        else{
+            do_words = false
+        }

         for path in paquet_paths.iter(){
             if path.find(".parquet").is_none(){
                 println!("{}", path);
                 continue;
             };

-            let result = do_work(path, &chars, do_words);
+            let result = do_work(path, &chars, do_words, &lang);

             for (key,val) in result.0.into_iter(){
                 *word_occurrences.entry(key).or_insert(0) += val;

@@ -190,19 +242,21 @@ fn main(){
         let db_insert = String::from_utf8(query_builder).unwrap();

-        //gets the lock but does nothing with it. You cannot easily share sql connections on threads.
-        let _dblocked = dblock.lock().unwrap();
-        let connection = Connection::open("../data/words/words.db").unwrap();
-        connection.execute_batch("PRAGMA journal_mode = wal; PRAGMA synchronous = extra;").unwrap();
-
-        let potential_pain = connection.execute(&db_insert, ());
-
-        if potential_pain.is_err(){
-            println!("LANG {} FAILED, DATA: {}", lang, db_insert);
-        }
-        std::mem::drop(db_insert);
-
-        connection.execute_batch("PRAGMA analysis_limit=400; PRAGMA optimize").unwrap();
-        connection.close().unwrap();
+        if do_words{
+            //gets the lock but does nothing with it. You cannot easily share sql connections on threads.
+            let _dblocked = dblock.lock().unwrap();
+            let connection = Connection::open("../data/words/words.db").unwrap();
+            connection.execute_batch("PRAGMA journal_mode = wal; PRAGMA synchronous = extra;").unwrap();
+
+            let potential_pain = connection.execute(&db_insert, ());
+
+            if potential_pain.is_err(){
+                println!("LANG {} FAILED, DATA: {}", lang, db_insert);
+            }
+            std::mem::drop(db_insert);
+
+            connection.execute_batch("PRAGMA analysis_limit=400; PRAGMA optimize").unwrap();
+            connection.close().unwrap();
+        }
         //frees the thread from blocking another lang from starting.
         count.fetch_sub(1, Ordering::Relaxed);

@@ -273,8 +327,6 @@ fn generate_data() -> bool{
             lang,path,i,sample);
         query_builder.write_all(formatted.as_bytes()).unwrap();
         i+=1;
-
-
         }
     };
rust/testing (323 changes)
File diff suppressed because one or more lines are too long