Optimizations and added Khmer

This commit is contained in:
user 2024-07-30 04:18:46 -04:00
parent db3c8d5ccd
commit 3836396ba3
9 changed files with 545 additions and 450 deletions

View file

@@ -211,6 +211,32 @@
"ﻭ",
"ﻳ"
],
"": [
"у",
"г",
"р",
"о",
"ф",
"и",
"н",
"с",
"к",
"ј",
"е",
"з",
"ц",
"д",
"м",
"т",
"л",
"а",
"в",
"х",
"џ",
"ш",
"ч",
"п"
],
"extended": [
"À",
"Á",
@@ -690,5 +716,337 @@
"Э",
"Ю",
"Я"
],
"Japanese": [
"あ",
"い",
"う",
"え",
"お",
"か",
"き",
"く",
"け",
"こ",
"さ",
"し",
"す",
"せ",
"そ",
"た",
"ち",
"つ",
"て",
"と",
"な",
"に",
"ぬ",
"ね",
"の",
"は",
"ひ",
"ふ",
"へ",
"ほ",
"ま",
"み",
"む",
"め",
"も",
"や",
"ゆ",
"よ",
"ら",
"り",
"る",
"れ",
"ろ",
"わ",
"を",
"ん",
"が",
"ぎ",
"ぐ",
"げ",
"ご",
"ざ",
"じ",
"ず",
"ぜ",
"ぞ",
"だ",
"ぢ",
"づ",
"で",
"ど",
"ば",
"び",
"ぶ",
"べ",
"ぼ",
"ぱ",
"ぴ",
"ぷ",
"ぺ",
"ぽ",
"ア",
"イ",
"ウ",
"エ",
"オ",
"カ",
"キ",
"ク",
"ケ",
"コ",
"サ",
"シ",
"ス",
"セ",
"ソ",
"タ",
"チ",
"ツ",
"テ",
"ト",
"ナ",
"ニ",
"ヌ",
"ネ",
"",
"ハ",
"ヒ",
"フ",
"ヘ",
"ホ",
"マ",
"ミ",
"ム",
"メ",
"モ",
"ヤ",
"ユ",
"ヨ",
"ラ",
"リ",
"ル",
"レ",
"ロ",
"ワ",
"ヲ",
"ン",
"ャ",
"ュ",
"ョ",
"ゃ",
"ゅ",
"ょ"
],
"Indian": [
"ऄ",
"अ",
"आ",
"इ",
"ई",
"उ",
"ऊ",
"ऋ",
"ऌ",
"ऍ",
"ऎ",
"ए",
"ऐ",
"ऑ",
"ऒ",
"ओ",
"औ",
"क",
"ख",
"ग",
"घ",
"ङ",
"च",
"छ",
"ज",
"झ",
"ञ",
"ट",
"ठ",
"ड",
"ढ",
"ण",
"त",
"थ",
"द",
"ध",
"न",
"ऩ",
"प",
"फ",
"ब",
"भ",
"म",
"य",
"र",
"ऱ",
"ल",
"ळ",
"ऴ",
"व",
"श",
"ष",
"स",
"ह",
"ऽ",
"क़",
"ख़",
"ग़",
"ज़",
"ड़",
"ढ़",
"फ़",
"य़",
"ॠ",
"ॡ",
"३",
"४",
"५",
"६",
"७",
"८",
"९"
],
"Lao": [
"ກ",
"ຂ",
"ຄ",
"ງ",
"ຈ",
"ຉ",
"ຊ",
"ຍ",
"ຎ",
"ຏ",
"ຐ",
"ຑ",
"ຒ",
"ຓ",
"ດ",
"ຝ",
"ຟ",
"ຠ",
"ມ",
"ຢ",
"ຣ",
"຤",
"ລ",
"຦",
"ວ",
"ຨ",
"ຩ",
"ສ",
"ອ",
"ຬ"
],
"Odia": [
"ଅ",
"ଆ",
"ଇ",
"ଉ",
"ଋ",
"ୠ",
"ଌ",
"୪",
"ଏ",
"ଐ",
"ଓ",
"ଔ",
"କ",
"ଖ",
"ଗ",
"ଘ",
"ଙ",
"ଚ",
"ଛ",
"ଜ",
"ଝ",
"ଞ",
"ଟ",
"",
"ଡ",
"ଢ",
"ଣ",
"ତ",
"ଥ",
"ଦ",
"ଧ",
"ନ",
"ପ",
"ଫ",
"ବ",
"ଭ",
"ମ",
"ଯ",
"ର",
"ଲ",
"ୱ",
"ଶ",
"ଷ",
"ସ",
"ହ"
],
"Khmer": [
"ក",
"ខ",
"គ",
"ឃ",
"ង",
"ច",
"ឆ",
"ជ",
"ឈ",
"ញ",
"ដ",
"ឋ",
"ឌ",
"ឍ",
"ណ",
"ត",
"ថ",
"ទ",
"ធ",
"ន",
"ប",
"ផ",
"ព",
"ភ",
"ម",
"យ",
"រ",
"ល",
"វ",
"ឝ",
"ឞ",
"ស",
"ហ",
"ឡ",
"អ",
"ឣ",
"ឤ",
"ឥ",
"ឦ",
"ឧ",
"ឨ",
"ឩ",
"ឪ",
"ឫ",
"ឬ",
"ឭ",
"ឮ",
"ឯ",
"ឰ",
"ឱ",
"ឲ",
"ឳ"
]
}
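The new "Khmer" list (52 entries) has the same shape as every other script block, so downstream consumers pick it up without code changes. A minimal Nim sketch of reading it back, assuming a hypothetical run-time loader (nim/main instead bakes the JSON in at compile time via staticRead):

import json, tables, unicode, sequtils

let alphabets = parseJson(readFile("../data/alphabets.json")).to(Table[string, seq[string]])
#every entry is a single character, so the first rune is the whole entry
let khmerRunes = alphabets["Khmer"].mapIt(it.toRunes()[0])
echo khmerRunes.len #52, matching the list added above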

File diff suppressed because one or more lines are too long

View file

@@ -93,7 +93,7 @@ for i, char in enumerate(alphabet):
array = np.empty(0, dtype=float)
characters = []
langs = ["be", "ru", "uk", "kk"]
langs = ["en", "id"]
charts = []
for lang in langs:

BIN
nim/main

Binary file not shown.

View file

@@ -13,27 +13,39 @@ import os
import sets
import algorithm
proc wtvr(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
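#converts each language's word list to a HashSet, presumably so later
#word-membership checks are O(1) instead of linear scans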
for (key,val) in a.pairs:
result[key] = toHashSet val
const resultText = staticRead("../data/mostCommonCharacters.json")
const charactersJson = staticRead("../data/alphabets.json")
const wikiToEnglish = staticRead("../data/wikiToEng.json")
const mostCommonWords = (parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]])
const forbiddenChars = @["A", "O", "I", "E", "U","Ё", "Y"].join("").toRunes().toSeq()
const mostCommonWords = wtvr((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
const forbiddenChars = @["9"].join("").toRunes().toSeq()
proc isUsedChar(a : Rune) : bool =
proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
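#returns (keep, normalizedRune): CJK runes pass through unchanged,
#upper-case letters fold to lower-case, and forbidden or unknown
#characters come back as (false, Rune(0))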
if a notin allValidChars:
return (false, Rune(0))
#Hangul, unused
if int(a) in 0xAC00..0xD7AF:
return true
return (true, a)
#Hanzi
if int(a) in 0x4E00..0x9FFF:
return true
if a.toUpper() in forbiddenChars:
return false;
if a.size == 1:
#if latin
return a.isUpper()
return not a.isLower()
return (true, a)
proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, int]]) : HashSet[Rune] =
if a.isUpper():
if a in forbiddenChars:
return (false, Rune(0))
else:
return (true, a.toLower())
if a.isLower():
if a.toUpper() in forbiddenChars:
return (false, Rune(0))
return (true, a)
proc createValidChars(a : JsonNode, b : Table[string, Table[Rune, float]]) : HashSet[Rune] =
let extraCharacterLangs = @["zh", "zh-yue"]
for lang in extraCharacterLangs:
for key in b[lang].keys:
@@ -51,111 +63,110 @@ proc jsonToRune(a : JsonNode) : Table[Rune, int] =
let rune = key.toRunes()[0]
result[rune] = val
proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, int]] =
for key, val in a.pairs:
result[key] = jsonToRune val
for key, val in result.pairs:
let total = val.values.toSeq().foldl(a+b)
for key1, val1 in val.pairs:
if 0.1 > (val1 / total) * 100:
result[key].del(key1)
proc allJsonToRuneAbsolute(a : JsonNode) : Table[string, Table[Rune, float]] =
for key in a.keys:
var tableBuilder = initTable[Rune, float]()
let languageTable = a[key]
var pairs = languageTable.pairs.toSeq().map(x => (x[0], x[1].getInt()) )
var total = 0
for (key,val) in pairs:
total+=val
for (key,val) in pairs:
let percentage = (val / total) * 100
if 0.1 > percentage:
continue
let rune = key.toRunes()[0]
let charAdd =
if rune.isUpper():
rune.toLower()
else:
rune
if charAdd in tableBuilder:
tableBuilder[charAdd] += percentage
else:
tableBuilder[charAdd] = percentage
result[key] = tableBuilder
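#worked example with hypothetical counts {а: 1995, б: 4, в: 1}: total is
#2000, so в sits at (1/2000)*100 = 0.05% and is skipped by the
#"if 0.1 > percentage" guard, while б survives at 0.2%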
when not defined(release) or not defined(danger):
let absoluteCounts = allJsonToRuneAbsolute parseJson(resultText)
let allValidChars = createValidChars(parseJson(charactersJson), absoluteCounts)
let usedCharacters = toHashSet allValidChars.toSeq().filter(x=> isUsedChar x)
let languages* = absoluteCounts.keys.toSeq().filter(x=> x in mostCommonWords)
let statistics = allJsonToRuneAbsolute parseJson(resultText)
let allValidChars = createValidChars(parseJson(charactersJson), statistics)
let languages* = statistics.keys.toSeq().filter(x=> x in mostCommonWords)
else:
const absoluteCounts = allJsonToRuneAbsolute parseJson(resultText)
const allValidChars = createValidChars(parseJson(charactersJson), absoluteCounts)
const usedCharacters = toHashSet allValidChars.toSeq().filter(x=> isUsedChar x)
const languages* = absoluteCounts.keys.toSeq().filter(x=> x in mostCommonWords)
const statistics = allJsonToRuneAbsolute parseJson(resultText)
const allValidChars = createValidChars(parseJson(charactersJson), statistics)
const languages* = statistics.keys.toSeq().filter(x=> x in mostCommonWords)
proc createStringSlope(a : string, runeHolder : var seq[Rune]) : (Table[Rune, float], int) =
let stringRunes = a.toRunes()
var stepOne = initCountTable[Rune]()
var runeLength = 0
for (i, char) in enumerate stringRunes:
let isUsed = isUsedChar(char, allValidChars)
if not isUsed[0]: continue
stepOne.inc(isUsed[1])
runeHolder[i] = isUsed[1]
runeLength+=1
result[1] = runeLength
var total = 0
var pairs = stepOne.pairs.toSeq()
for (key,val) in pairs:
total+=val
for (key,val) in pairs:
let percentage = (val / total) * 100
if 0.1 > percentage:
continue
result[0][key] = percentage
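#createStringSlope also writes each kept, case-folded rune back into
#runeHolder, presumably so callers can reuse the normalized sample
#without a second pass over the string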
proc createStatisticalTable(a : Table[Rune, int]) : Table[Rune, float] =
let total = a.values.toSeq().foldl(a+b)
let extraCharacters = toHashSet a.keys.toSeq().filter(x=> isUsedChar x)
for character in allValidChars + extraCharacters:
let percentage =
if character in a:
(a[character] / total)*100
else:
0.0
let charAdd =
if character.isLower():
character.toUpper()
else:
character
if charAdd in result:
result[charAdd] += percentage
else:
result[charAdd] = percentage
proc allJsonToRuneStatistic(a : JsonNode) : Table[string, Table[Rune, float]] =
for key, val in a.pairs:
result[key] = createStatisticalTable jsonToRune val
#proc getLangWords(lang : string) : HashSet[string] =
# for word in db.instantRows(sql"select * from words where language = ?", lang):
# result.incl(word[1])
proc createStringSlope(a : string) : Table[Rune, float] =
var stepOne = initTable[Rune, int]()
let extraCharacters = toHashSet a.toRunes().filter(x=> isUsedChar x)
let validExtra = allValidChars + extraCharacters
for char in a.toRunes():
if char in validExtra:
if char notin stepOne:
stepOne[char] = 1
else:
stepOne[char] += 1
result = createStatisticalTable stepOne
proc reduceNoise(a : seq[Table[Rune, float]]) : seq[Table[Rune, float]]=
result.setLen(a.len())
for char in usedCharacters:
let collected = collect(for i in a: i[char])
if collected.any(x=> x > 0.1):
for i in 0 .. a.high:
result[i][char] = a[i][char]
proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
var resultBuffer : seq[float]
for char in a.keys:
resultBuffer.add(abs(a[char]-b[char]))
let distance =
if char in b:
b[char]
else:
-10
resultBuffer.add(abs(a[char]-distance))
return resultBuffer.foldl(a+b)
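#example with hypothetical tables a = {к: 5.0} and b = {}: к is missing
#from b, so it contributes abs(5.0 - (-10)) = 15.0, a far harsher hit
#than any plain frequency mismatch could produce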
when not defined(release) or not defined(danger):
let statistics = allJsonToRuneStatistic parseJson(resultText)
else:
const statistics = allJsonToRuneStatistic parseJson(resultText)
proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
var deNoised = reduceNoise(comparisonLangs.map(x => statistics[x]) & createStringSlope sample)
let stringSlope = deNoised[^1]
deNoised = deNoised[0 .. ^2]
let deNoised = comparisonLangs.map(x => statistics[x])
var runeStr = newSeq[Rune](sample.len())
let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
var i = 0
var runeStr : seq[Rune]
for i, rune in enumerate toRunes sample:
let char =
if rune.isUpper():
rune.toLower()
else:
rune
runeStr.add(char)
let sample = runeStr.join("")
let subsample =
if sample.high > 100:
sample[0 .. 100]
else:
sample
let distinctCharacters = sample.toRunes().deduplicate().map(x=> x.toUpper()).filter(x=> isUsedChar(x))
let characters = sample.toRunes().deduplicate().map(x=> isUsedChar(x, allValidChars)).map(x=>x[1])
let distinctCharacters = characters.filter(x=> int(x) != 0)
for (language, slope) in zip(comparisonLangs, deNoised):
result[language] = neighborDistance(slope, stringSlope)
#We check based on the keys in each language
#If we put slope first, we check if each char in the language is found in the slope
#And vice versa
#if a language is logographic, we should compare the sample to the language
#because it is less specialized. Wikipedia's Chinese has a lot of characters
#that your average sample will not have
if language notin ["zh", "zh-yue"]:
result[language] = neighborDistance(slope, stringSlope)
else:
result[language] = neighborDistance(stringSlope, slope)
if language notin mostCommonWords:
continue
@@ -171,9 +182,9 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
continue
#if it's this low, it's probably irredeemable
if result[language] >= 1000: break
let notWithin = c notin absoluteCounts[language]
let notWithin = c notin statistics[language]
if notWithin:
result[language] *= 1.1
result[language] *= 1.2
scriptBuffer.add(notWithin)
if scriptBuffer.high != -1 and scriptBuffer.all(x=> x):
result[language] *= 1000
@@ -187,10 +198,7 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
result["ko"] *= 1.05
else:
result["ko"] *= 0.95
proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
result = oldResult.pairs().toSeq()
result.sort((x,y)=> cmp(x[1], y[1]))
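#distances sort ascending, so result[0][0] is the best-guess language
#(the test harness below relies on exactly that)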
let sample = "蝴蝶係一種完全變態嘅昆蟲,即係話一隻蝴蝶一世蟲會經過膥、幼蟲、蛹同埋成蟲四個階段:一隻大咗肚嘅蝴蝶乸會喺啲植物嘅葉上面產卵;跟手啲幼蟲(毛蟲)孵咗出嚟之後就會靠食嗰啲葉嚟維生,啲幼蟲生到咁上下就會結蛹;當變態嘅過程完成咗之後,個蛹會爆開,隻成蟲(蝴蝶)就會由個蛹嗰度捐出嚟;等兩對翼乾咗之後,佢就會飛去搵嘢食同伴侶;交配完咗之後,啲蝴蝶乸就會產卵;而無論公定乸,蝴蝶通常喺交配嘅過程完咗之後冇幾耐就會死。佢哋嘅下一代跟住就會由頭噉經歷過呢個由生到死嘅過程。呢個過程做一次要幾耐係睇物種嘅:熱帶嗰頭啲蝴蝶物種好多時一年閒閒地生成兩三代咁多,而响凍啲地區嘅蝴蝶物種就好多時就要成幾年先至生到一代"
echo statistics["zh"]
echo makeResult doThing(languages & @["zh-yue"], sample)

BIN
nim/tests

Binary file not shown.

View file

@@ -4,6 +4,8 @@ import sequtils
import sugar
import tables
import times
import pretty
let db = open("../data/testing/testingData.db", "", "", "")
type Accuracy = object
correct : int
@@ -16,8 +18,7 @@ var results = initTable[string, Accuracy]()
for lang in main.languages:
results[lang] = Accuracy()
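#one Accuracy record per language: correct hits, failures, and a count
#of which languages it gets mistaken for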
for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'en'"):
let t1 = cpuTime()
for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'is'"):
try:
let result = makeResult doThing(main.languages, row[1])
if result[0][0] == row[0]:
@@ -27,6 +28,5 @@ for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'e
results[row[0]].languagesConfusedFor.inc(result[0][0])
except:
results[row[0]].faliures+=1
echo cpuTime()-t1
echo results
print results

View file

@@ -7,10 +7,11 @@ use std::collections::HashSet;
use std::thread::available_parallelism;
use std::fs::DirEntry;
use std::io::prelude::*;
use rusqlite::{Connection, Result};
use rusqlite::{Connection};
use std::sync::atomic::{AtomicU32,Ordering};
use rand::{thread_rng, Rng};
use std::fs;
use std::env;
fn gen_chars() -> HashSet<char>{
let json = std::fs::read_to_string("../data/alphabets.json").unwrap();
@@ -22,7 +23,7 @@ fn gen_chars() -> HashSet<char>{
return chars
}
fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashMap<String, u64>, HashMap<char, u64>){
fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool, language : &str) -> (HashMap<String, u64>, HashMap<char, u64>){
let mut map : HashMap<char, u64> = HashMap::new();
let mut word_map : HashMap<String, u64> = HashMap::new();
@@ -31,6 +32,7 @@ fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashM
let mut iter = reader.get_row_iter(None).unwrap();
const MAX_WORD_LENGTH : usize = 22;
let skippable_chars : Vec<char> = vec![',', '.', '!', '?', '\n', '\\', '\'', '"', ';', '<', '>'];
let chinese_langs : Vec<&str> = vec!["zh", "zh-yue"];
while let Some(record) = iter.next() {
if record.is_err(){ continue; }
let mut array: [char; MAX_WORD_LENGTH] = ['0'; MAX_WORD_LENGTH];
@@ -74,9 +76,19 @@
};
}
}
if chars.contains(&chary){
*map.entry(chary).or_insert(0) += 1;
};
if !chinese_langs.iter().any(|x| x == &language){
if chars.contains(&chary){
*map.entry(chary).or_insert(0) += 1;
};
}
else{
let c_code = chary as u32;
let range = 0x4E00..=0x9FFF;
if range.contains(&c_code){
*map.entry(chary).or_insert(0) += 1;
}
}
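// For zh/zh-yue only runes inside the CJK Unified Ideographs block
// (U+4E00..=U+9FFF) are counted, presumably so embedded Latin or
// punctuation in the dumps never enters the Chinese frequency tables.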
}
}
return (word_map, map);
@@ -114,10 +126,43 @@ fn get_wikipedia_paths() -> (HashSet<String>,HashMap<String, Vec<String>>) {
return (languages, language_paths)
}
struct Actions{
gen_words : bool,
gen_test_data : bool,
}
fn gen_actions() -> Actions {
let mut result = Actions{
gen_words : false,
gen_test_data : false
};
let args: Vec<String> = env::args().collect();
let length = args.len();
if length == 1{
return result;
}
for arg in args[1 .. length].iter(){
println!("{}", arg);
if arg == "--chars" {
result.gen_words = true
}
else if arg == "--test_data" {
result.gen_test_data = true
}
else{
panic!("UNKOWN ARG")
}
}
return result;
}
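// With no flags only character counts are gathered; "--chars" also
// collects words for non-blacklisted languages, "--test_data" regenerates
// the testing DB, and any unknown flag panics.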
fn main(){
let actions = gen_actions();
//These languages don't have spaces and thus flood memory, and they don't make any sense to analyze in this way
generate_data();
panic!();
if actions.gen_test_data{
generate_data();
}
let cpu_count = available_parallelism().unwrap().get()*2;
let blacklisted_languages: Vec<String> =
vec![
@@ -156,14 +201,21 @@ fn main(){
let mut char_occurrences : HashMap<char, u64> = HashMap::new();
let mut word_occurrences : HashMap<String, u64> = HashMap::new();
let paquet_paths = paths.get(&lang).unwrap();
let do_words = !b_lang.iter().any(|x| x == &lang);
let do_words : bool;
if actions.gen_words {
do_words = !b_lang.iter().any(|x| x == &lang);
}
else{
do_words = false
}
for path in paquet_paths.iter(){
if path.find(".parquet").is_none(){
println!("{}", path);
continue;
};
let result = do_work(path, &chars, do_words);
let result = do_work(path, &chars, do_words, &lang);
for (key,val) in result.0.into_iter(){
*word_occurrences.entry(key).or_insert(0) += val;
@@ -190,19 +242,21 @@ fn main(){
let db_insert = String::from_utf8(query_builder).unwrap();
//takes the lock but does nothing with it. You cannot easily share SQL connections across threads.
let _dblocked = dblock.lock().unwrap();
let connection = Connection::open("../data/words/words.db").unwrap();
connection.execute_batch("PRAGMA journal_mode = wal; PRAGMA synchronous = extra;").unwrap();
if do_words{
let _dblocked = dblock.lock().unwrap();
let connection = Connection::open("../data/words/words.db").unwrap();
connection.execute_batch("PRAGMA journal_mode = wal; PRAGMA synchronous = extra;").unwrap();
let potential_pain = connection.execute(&db_insert, ());
let potential_pain = connection.execute(&db_insert, ());
if potential_pain.is_err(){
println!("LANG {} FAILED, DATA: {}", lang, db_insert);
if potential_pain.is_err(){
println!("LANG {} FAILED, DATA: {}", lang, db_insert);
}
std::mem::drop(db_insert);
connection.execute_batch("PRAGMA analysis_limit=400; PRAGMA optimize").unwrap();
connection.close().unwrap();
}
std::mem::drop(db_insert);
connection.execute_batch("PRAGMA analysis_limit=400; PRAGMA optimize").unwrap();
connection.close().unwrap();
//decrements the counter so this thread no longer blocks another lang from starting.
count.fetch_sub(1, Ordering::Relaxed);
@@ -273,8 +327,6 @@ fn generate_data() -> bool{
lang,path,i,sample);
query_builder.write_all(formatted.as_bytes()).unwrap();
i+=1;
}
};

File diff suppressed because one or more lines are too long