Search didn't work in testing. Tests finished, final phase. Making genetic algorithm

This commit is contained in:
user 2024-08-02 08:22:19 -04:00
parent d739d6aa29
commit 0ebe6f65fe
10 changed files with 1030 additions and 224 deletions

View file

@ -1082,5 +1082,869 @@
"ײ",
"׳",
"״"
],
"Malayalam": [
"അ",
"ആ",
"ഇ",
"ഈ",
"ഉ",
"ഊ",
"ഋ",
"ഌ",
"എ",
"ഏ",
"ഐ",
"ഒ",
"ഓ",
"ഔ",
"ക",
"ഖ",
"ഗ",
"ഘ",
"ങ",
"ച",
"ഛ",
"ജ",
"ഝ",
"ഞ",
"ട",
"ഠ",
"ഡ",
"ഢ",
"ണ",
"ത",
"ഥ",
"ദ",
"ധ",
"ന",
"ഩ",
"പ",
"ഫ",
"ബ",
"ഭ",
"മ",
"യ",
"ര",
"റ",
"ല",
"ള",
"ഴ",
"വ",
"ശ",
"ഷ",
"സ",
"ഹ",
"ഺ",
"ഽ",
"൏",
"ൔ",
"ൕ",
"ൖ",
"൘",
"൙",
"൚",
"൛",
"൜",
"൝",
"൞",
"ൟ",
"ൠ",
"ൡ",
"൦",
"൧",
"൨",
"൩",
"൪",
"൫",
"൬",
"൭",
"൮",
"൯",
"൰",
"൱",
"൲",
"൳",
"൴",
"൵",
"൶",
"൷",
"൸",
"൹",
"ൺ",
"ൻ",
"ർ",
"ൽ",
"ൾ",
"ൿ"
],
"bengali": [
"অ",
"আ",
"ই",
"ঈ",
"উ",
"ঊ",
"ঋ",
"ঌ",
"এ",
"ঐ",
"ও",
"ঔ",
"ক",
"খ",
"গ",
"ঘ",
"ঙ",
"চ",
"ছ",
"জ",
"ঝ",
"ঞ",
"ট",
"ঠ",
"ড",
"ঢ",
"ণ",
"ত",
"থ",
"দ",
"ধ",
"ন",
"প",
"ফ",
"ব",
"ভ",
"ম",
"য",
"র",
"ল",
"শ",
"ষ",
"স",
"হ",
"ড়",
"ঢ়",
"য়",
"ৠ",
"ৡ",
"১",
"২",
"৩",
"৪",
"৫",
"৬",
"৭",
"৮",
"৯",
"ৰ",
"ৱ",
"৲",
"৳",
"৴",
"৵",
"৶",
"৷",
"৸",
"৹",
"৺",
"৻",
"ৼ"
],
"Punjabi": [
"ਅ",
"ਆ",
"ਇ",
"ਈ",
"ਉ",
"ਊ",
"ਏ",
"ਐ",
"ਓ",
"ਔ",
"ਕ",
"ਖ",
"ਗ",
"ਘ",
"ਙ",
"ਚ",
"ਛ",
"ਜ",
"ਝ",
"ਞ",
"ਟ",
"ਠ",
"ਡ",
"ਢ",
"ਣ",
"ਤ",
"ਥ",
"ਦ",
"ਧ",
"ਨ",
"ਪ",
"ਫ",
"ਬ",
"ਭ",
"ਮ",
"ਯ",
"ਰ",
"ਲ",
"ਲ਼",
"ਵ",
"ਸ਼",
"ਸ",
"ਹ",
"ਖ਼",
"ਗ਼",
"ਜ਼",
"ੜ",
"ਫ਼",
"੦",
"੧",
"੨",
"੩",
"੪",
"੫",
"੬",
"੭",
"੮",
"੯",
"ੲ",
"ੳ",
"ੴ"
],
"AramaicEtheopian": [
"ሀ",
"ሁ",
"ሂ",
"ሃ",
"ሄ",
"ህ",
"ሆ",
"ሇ",
"ለ",
"ሉ",
"ሊ",
"ላ",
"ሌ",
"ል",
"ሎ",
"ሏ",
"ሐ",
"ሑ",
"ሒ",
"ሓ",
"ሔ",
"ሕ",
"ሖ",
"ሗ",
"መ",
"ሙ",
"ሚ",
"ማ",
"ሜ",
"ም",
"ሞ",
"ሟ",
"ሠ",
"ሡ",
"ሢ",
"ሣ",
"ሤ",
"ሥ",
"ሦ",
"ሧ",
"ረ",
"ሩ",
"ሪ",
"ራ",
"ሬ",
"ር",
"ሮ",
"ሯ",
"ሰ",
"ሱ",
"ሲ",
"ሳ",
"ሴ",
"ስ",
"ሶ",
"ሷ",
"ሸ",
"ሹ",
"ሺ",
"ሻ",
"ሼ",
"ሽ",
"ሾ",
"ሿ",
"ቀ",
"ቁ",
"ቂ",
"ቃ",
"ቄ",
"ቅ",
"ቆ",
"ቇ",
"ቈ",
"ቊ",
"ቋ",
"ቌ",
"ቍ",
"ቐ",
"ቑ",
"ቒ",
"ቓ",
"ቔ",
"ቕ",
"ቖ",
"ቘ",
"ቚ",
"ቛ",
"ቜ",
"ቝ",
"በ",
"ቡ",
"ቢ",
"ባ",
"ቤ",
"ብ",
"ቦ",
"ቧ",
"ቨ",
"ቩ",
"ቪ",
"ቫ",
"ቬ",
"ቭ",
"ቮ",
"ቯ",
"ተ",
"ቱ",
"ቲ",
"ታ",
"ቴ",
"ት",
"ቶ",
"ቷ",
"ቸ",
"ቹ",
"ቺ",
"ቻ",
"ቼ",
"ች",
"ቾ",
"ቿ",
"ኀ",
"ኁ",
"ኂ",
"ኃ",
"ኄ",
"ኅ",
"ኆ",
"ኇ",
"ኈ",
"ኊ",
"ኋ",
"ኌ",
"ኍ",
"ነ",
"ኑ",
"ኒ",
"ና",
"ኔ",
"ን",
"ኖ",
"ኗ",
"ኘ",
"ኙ",
"ኚ",
"ኛ",
"ኜ",
"ኝ",
"ኞ",
"ኟ",
"አ",
"ኡ",
"ኢ",
"ኣ",
"ኤ",
"እ",
"ኦ",
"ኧ",
"ከ",
"ኩ",
"ኪ",
"ካ",
"ኬ",
"ክ",
"ኮ",
"ኯ",
"ኰ",
"ኲ",
"ኳ",
"ኴ",
"ኵ",
"ኸ",
"ኹ",
"ኺ",
"ኻ",
"ኼ",
"ኽ",
"ኾ",
"ዀ",
"ዂ",
"ዃ",
"ዄ",
"ዅ",
"ወ",
"ዉ",
"ዊ",
"ዋ",
"ዌ",
"ው",
"ዎ",
"ዏ",
"ዐ",
"ዑ",
"ዒ",
"ዓ",
"ዔ",
"ዕ",
"ዖ",
"ዘ",
"ዙ",
"ዚ",
"ዛ",
"ዜ",
"ዝ",
"ዞ",
"ዟ",
"ዠ",
"ዡ",
"ዢ",
"ዣ",
"ዤ",
"ዥ",
"ዦ",
"ዧ",
"የ",
"ዩ",
"ዪ",
"ያ",
"ዬ",
"ይ",
"ዮ",
"ዯ",
"ደ",
"ዱ",
"ዲ",
"ዳ",
"ዴ",
"ድ",
"ዶ",
"ዷ",
"ዸ",
"ዹ",
"ዺ",
"ዻ",
"ዼ",
"ዽ",
"ዾ",
"ዿ",
"ጀ",
"ጁ",
"ጂ",
"ጃ",
"ጄ",
"ጅ",
"ጆ",
"ጇ",
"ገ",
"ጉ",
"ጊ",
"ጋ",
"ጌ",
"ግ",
"ጎ",
"ጏ",
"ጐ",
"ጒ",
"ጓ",
"ጔ",
"ጕ",
"ጘ",
"ጙ",
"ጚ",
"ጛ",
"ጜ",
"ጝ",
"ጞ",
"ጟ",
"ጠ",
"ጡ",
"ጢ",
"ጣ",
"ጤ",
"ጥ",
"ጦ",
"ጧ",
"ጨ",
"ጩ",
"ጪ",
"ጫ",
"ጬ",
"ጭ",
"ጮ",
"ጯ",
"ጰ",
"ጱ",
"ጲ",
"ጳ",
"ጴ",
"ጵ",
"ጶ",
"ጷ",
"ጸ",
"ጹ",
"ጺ",
"ጻ",
"ጼ",
"ጽ",
"ጾ",
"ጿ",
"ፀ",
"ፁ",
"ፂ",
"ፃ",
"ፄ",
"ፅ",
"ፆ",
"ፇ",
"ፈ",
"ፉ",
"ፊ",
"ፋ",
"ፌ",
"ፍ",
"ፎ",
"ፏ",
"ፐ",
"ፑ",
"ፒ",
"ፓ",
"ፔ",
"ፕ",
"ፖ",
"ፗ",
"ፘ",
"ፙ",
"ፚ",
"፩",
"፪",
"፫",
"፬",
"፭",
"፮",
"፯",
"፰",
"፱",
"፲",
"፳",
"፴",
"፵",
"፶",
"፷",
"፸",
"፹",
"፺",
"፻",
"፼"
],
"Kannada": [
"಄",
"ಅ",
"ಆ",
"ಇ",
"ಈ",
"ಉ",
"ಊ",
"ಋ",
"ಌ",
"ಎ",
"ಏ",
"ಐ",
"ಒ",
"ಓ",
"ಔ",
"ಕ",
"ಖ",
"ಗ",
"ಘ",
"ಙ",
"ಚ",
"ಛ",
"ಜ",
"ಝ",
"ಞ",
"ಟ",
"ಠ",
"ಡ",
"ಢ",
"ಣ",
"ತ",
"ಥ",
"ದ",
"ಧ",
"ನ",
"ಪ",
"ಫ",
"ಬ",
"ಭ",
"ಮ",
"ಯ",
"ರ",
"ಱ",
"ಲ",
"ಳ",
"ವ",
"ಶ",
"ಷ",
"ಸ",
"ಹ",
"ಽ",
"ಾ",
"ಿ",
"ೀ",
"ು",
"ೂ",
"ೊ",
"ೋ",
"೦",
"೧",
"೨",
"೩",
"೪",
"೫",
"೬",
"೭",
"೮",
"೯"
],
"Tamil": [
"ஃ",
"அ",
"ஆ",
"இ",
"ஈ",
"உ",
"ஊ",
"எ",
"ஏ",
"ஐ",
"ஒ",
"ஓ",
"ஔ",
"க",
"ங",
"ச",
"ஜ",
"ஞ",
"ட",
"ண",
"த",
"ந",
"ன",
"ப",
"ம",
"ய",
"ர",
"ற",
"ல",
"ள",
"ழ",
"வ",
"ஶ",
"ஷ",
"ஸ",
"ஹ",
"ௐ",
"ௗ",
"௦",
"௧",
"௨",
"௩",
"௪",
"௫",
"௬",
"௭",
"௮",
"௯",
"௰",
"௱",
"௲",
"௳",
"௴",
"௵",
"௶",
"௷",
"௸",
"௹",
"௺"
],
"Gujarati": [
"અ",
"આ",
"ઇ",
"ઈ",
"ઉ",
"ઊ",
"ઋ",
"ઌ",
"ઍ",
"એ",
"ઐ",
"ઑ",
"ઓ",
"ઔ",
"ક",
"ખ",
"ગ",
"ઘ",
"ઙ",
"ચ",
"છ",
"જ",
"ઝ",
"ઞ",
"ટ",
"ઠ",
"ડ",
"ઢ",
"ણ",
"ત",
"થ",
"દ",
"ધ",
"ન",
"પ",
"ફ",
"બ",
"ભ",
"મ",
"ય",
"ર",
"લ",
"ળ",
"વ",
"શ",
"ષ",
"સ",
"હ",
"૦",
"૧",
"૨",
"૩",
"૪",
"૫",
"૬",
"૭",
"૮",
"૯",
"ૠ",
"ૡ",
"૰",
"૱",
"ૹ"
],
"Georgian": [
"Ⴁ",
"Ⴂ",
"Ⴃ",
"Ⴄ",
"Ⴅ",
"Ⴆ",
"Ⴇ",
"Ⴈ",
"Ⴉ",
"Ⴊ",
"Ⴋ",
"Ⴌ",
"Ⴍ",
"Ⴎ",
"Ⴏ",
"Ⴐ",
"Ⴑ",
"Ⴒ",
"Ⴓ",
"Ⴔ",
"Ⴕ",
"Ⴖ",
"Ⴗ",
"Ⴘ",
"Ⴙ",
"Ⴚ",
"Ⴛ",
"Ⴜ",
"Ⴝ",
"Ⴞ",
"Ⴟ",
"Ⴠ",
"Ⴡ",
"Ⴢ",
"Ⴣ",
"Ⴤ",
"Ⴥ",
"Ⴧ",
"Ⴭ",
"ა",
"ბ",
"გ",
"დ",
"ე",
"ვ",
"ზ",
"თ",
"ი",
"კ",
"ლ",
"მ",
"ნ",
"ო",
"პ",
"ჟ",
"რ",
"ს",
"ტ",
"უ",
"ფ",
"ქ",
"ღ",
"ყ",
"შ",
"ჩ",
"ც",
"ძ",
"წ",
"ჭ",
"ხ",
"ჯ",
"ჰ",
"ჱ",
"ჲ",
"ჳ",
"ჴ",
"ჵ",
"ჶ",
"ჷ",
"ჸ",
"ჹ",
"ჺ",
"჻",
"ჼ",
"ჽ",
"ჾ",
"ჿ"
]
}

File diff suppressed because one or more lines are too long

BIN
nim/createMostCommonWords Executable file

Binary file not shown.

View file

@ -32,12 +32,12 @@ for file in walkDir("../words/data/wordfrequency.info/"):
for word in words:
if 2 >= word.len():
continue
if wordsAdded.high == 99: break
if wordsAdded.high == 199: break
if word == language: continue
if word in wordsAdded: continue
wordsAdded.add word
if wordsAdded.high != 99:
if wordsAdded.high != 199:
raise new CatchableError
result[language] = %* wordsAdded

BIN
nim/main

Binary file not shown.

View file

@ -14,38 +14,21 @@ import sets
import algorithm
import math
type LPS = object
lpsArray : seq[int]
pattern : seq[Rune]
const wordCount* = 30
proc createLps(a : string) =
  ## Compute and print the KMP "longest proper prefix that is also a suffix"
  ## (LPS / failure-function) table for the pattern `a`.
  ## NOTE(review): the table is only `echo`ed — nothing returns or stores it,
  ## and the `LPS` object type declared above is never populated; presumably
  ## work in progress — confirm intended use.
  let runified = a.toRunes()
  var lps = newSeq[uint](runified.len())  # lps[0] stays 0 by default
  var j : uint = 0  # length of the currently matched prefix
  for i in 1 .. runified.high:
    # On mismatch, fall back through shorter candidate prefixes until a
    # match is found or j reaches 0.
    while runified[j] != runified[i] and j != 0:
      j = lps[j-1]
    if runified[j] == runified[i]:
      j+=1
    lps[i] = j
  echo lps
proc wtvr(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
proc generateMostCommonWords(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
for (key,val) in a.pairs:
result[key] = toHashSet val
result[key] = toHashSet (val[0 .. wordCount-1])
const resultText = staticRead("../data/mostCommonCharacters.json")
const charactersJson = staticRead("../data/alphabets.json")
const wikiToEnglish = staticRead("../data/wikiToEng.json")
const mostCommonWords = wtvr((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
const mostCommonWords = generateMostCommonWords((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
const forbiddenChars = @[","].join("").toRunes().toSeq()
proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
if a in forbiddenChars:
return (false, Rune(0))
if a notin allValidChars:
return (false, Rune(0))
#Hangul, unused
if int(a) in 0xAC00..0xD7AF:
return (true, a)
@ -53,6 +36,9 @@ proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
if int(a) in 0x4E00..0x9FFF:
return (true, a)
if a notin allValidChars:
return (false, Rune(0))
if a.isUpper():
if a in forbiddenChars:
return (false, Rune(0))
@ -166,7 +152,9 @@ proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
resultBuffer[i] = (cubic(abs(a[char]-distance)))
return resultBuffer.foldl(a+b)
proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
proc doThing*(comparisonLangs : seq[string], sample : string,
wordCounter : TableRef[string, CountTable[string]] = nil,
words : Table[string, HashSet[string]] = mostCommonWords) : Table[string, float] =
let deNoised = comparisonLangs.map(x => statistics[x])
var runeStr = newSeq[Rune](sample.len())
let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
@ -189,8 +177,7 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
let subsample = subsampleRunes.join("")
let characters = sample.toRunes().deduplicate().map(x=> isUsedChar(x, allValidChars)).map(x=>x[1])
let distinctCharacters = characters.filter(x=> int(x) != 0)
let characters = runeStr.deduplicate()
var potentialLanguages = toHasHSet comparisonLangs
#This is needed for identifying chinese and not mixing up japanese
@ -225,18 +212,25 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
else:
result[language] = neighborDistance(stringSlope, slope)
if language notin mostCommonWords:
for c in characters:
if c notin slope:
result[language] *= 1.05
if language notin words:
echo language
continue
let mostCommon = mostCommonWords[language]
if wordCounter != nil and language notin wordCounter:
wordCounter[language] = initCountTable[string]()
let mostCommon = words[language]
#TODO: Make this not be insane
for word in mostCommon:
if subsample.contains(word):
result[language] *= 0.80
result[language] *= 0.7
if wordCounter != nil:
wordCounter[language].inc(word)
for c in distinctCharacters:
if c notin slope:
result[language] *= 1.05
if "ko" in potentialLanguages:
if "ko" notin result:
@ -253,5 +247,3 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
result = oldResult.pairs().toSeq()
result.sort((x,y)=> cmp(x[1], y[1]))
createLps("abacus")

Binary file not shown.

View file

@ -1,172 +0,0 @@
#[ This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8
bytes instead of creating a 0x10FFFF-sized table.
]#
import std/[algorithm, sequtils, strutils, unicode]
type PreProcessed = object
  ## Precomputed Boyer-Moore search tables for a single pattern.
  r : seq[seq[int]]  # bad-character table "R" (built by badCharacterTable)
  l : seq[int]       # strong good-suffix table "L" (built by goodSuffixTable)
  f : seq[int]       # full-shift table "F" (built by fullShiftTable)
  text : seq[Rune]   # the pattern itself, as runes
# First-dimension size of the bad-character table; characters are mapped
# into it by raw code point (see alphabetIndex).
const AlphabetSize = 256
func reversed(s: string): string =
  ## Return `s` with its characters in reverse order (byte-wise; intended
  ## for ASCII strings).
  result = newStringOfCap(s.len)
  var i = s.high
  while i >= 0:
    result.add s[i]
    dec i
func alphabetIndex(c: Rune): int =
  ## Map a rune to its slot in the character tables (its raw code point).
  ## NOTE(review): code points >= AlphabetSize (256) exceed the 256-entry
  ## tables built in `badCharacterTable` — confirm inputs are ASCII/Latin-1.
  int(c)
func matchLength(s: string; idx1, idx2: int): int =
  ## Length of the common prefix of the two suffixes of `s` that begin at
  ## `idx1` and `idx2`.
  if idx1 == idx2:
    # Identical start: the whole remaining suffix matches itself.
    return s.len - idx1
  var a = idx1
  var b = idx2
  result = 0
  while a < s.len and b < s.len and s[a] == s[b]:
    inc result
    inc a
    inc b
proc fundamentalPreprocess(s: string): seq[int] =
  ## Return "z", the Fundamental Preprocessing (Z-array) of "s".
  # "z[i]" is the length of the substring beginning at "i" which is also a prefix of "s".
  # This preprocessing is done in O(n) time, where n is the length of "s".
  let length = s.len()
  if s.len == 0: return
  if s.len == 1: return @[1]
  result = newSeq[int](length)
  result[0] = length
  result[1] = s.matchLength(0, 1)
  # Positions covered by the initial z-box at index 1 can be filled directly.
  for i in 2..result[1]:
    result[i] = result[1] - i + 1
  # Defines lower and upper limits of z-box.
  var l, r = 0
  for i in (2 + result[1])..s.high:
    if i <= r: # "i" falls within existing z-box.
      let k = i - l
      let b = result[k]
      let a = r - i + 1
      if b < a: # "b" ends within existing z-box.
        result[i] = b
      else: # "b" ends at or after the end of the z-box.
        # We need to do an explicit match to the right of the z-box.
        result[i] = a + s.matchLength(a, r + 1)
        l = i
        r = i + result[i] - 1
    else: # "i" does not reside within existing z-box.
      result[i] = s.matchLength(0, i)
      if result[i] > 0:
        l = i
        r = i + result[i] - 1
proc badCharacterTable(s: seq[Rune]): seq[seq[int]] =
  ## Generates "R" for "s", which is an array indexed by the position of some character "c"
  ## in the ASCII table.
  # At that index in "R" is an array of length |s|+1, specifying for each index "i" in "s"
  # (plus the index after "s") the next location of character "c" encountered when traversing
  # "S" from right to left starting at "i". This is used for a constant-time lookup for the bad
  # character rule in the Boyer-Moore string search algorithm, although it has a much larger
  # size than non-constant-time solutions.
  # NOTE(review): `alphabetIndex` returns the raw code point, so runes with
  # code points >= AlphabetSize (256) would index `alpha` out of bounds —
  # confirm patterns are restricted to Latin-1.
  if s.len == 0: return newSeqWith(AlphabetSize, newSeq[int]())
  result = repeat(@[-1], AlphabetSize)
  var alpha = repeat(-1, AlphabetSize)
  for i, c in s:
    alpha[alphabetIndex(c)] = i
    # Snapshot the rightmost-occurrence array after every pattern position.
    for j, a in alpha:
      result[j].add a
proc goodSuffixTable(s: string): seq[int] =
  ## Generates "L" for "s", an array used in the implementation of the strong good suffix rule.
  # "L[i] = k", the largest position in S such that "s[i:]" (the suffix of "s" starting at "i")
  # matches a suffix of "s[:k]" (a substring in "s" ending at "k"). Used in Boyer-Moore, "L"
  # gives an amount to shift "P" relative to "T" such that no instances of "P" in "T" are skipped
  # and a suffix of "P[:L[i]]" matches the substring of "T" matched by a suffix of "P" in the
  # previous match attempt.
  # Specifically, if the mismatch took place at position "i-1" in "P", the shift magnitude is
  # given by the formula "len(P) - L[i]". In the case that "L[i] = -1", the full shift table
  # is used. Since only proper suffixes matter, "L[0] = -1".
  result = repeat(-1, s.len)
  # Z-values of the reversed pattern, re-reversed, give the classic "N" array.
  var n = fundamentalPreprocess(reversed(s))
  n.reverse()
  for j in 0..(s.len - 2):
    let i = s.len - n[j]
    if i != s.len:
      result[i] = j
proc fullShiftTable(s: string): seq[int] =
  ## Generates "F" for "s", an array used in a special case of the good suffix rule in the
  ## Boyer-Moore string search algorithm.
  # "F[i]" is the length of the longest suffix of "s[i:]" that is also a prefix of "s". In
  # the cases it is used, the shift magnitude of the pattern "P" relative to the text "T" is
  # "len(P) - F[i]" for a mismatch occurring at "i-1".
  result = repeat(0, s.len)
  let z = fundamentalPreprocess(s)
  var longest = 0
  # Walk z from the end; a z-value equal to its distance from the end marks a
  # suffix that is also a prefix, and `longest` carries it leftward.
  for i, zv in reversed(z):
    if zv == i + 1:
      longest = max(zv, longest)
    result[^(i + 1)] = longest
proc prePrcoess*(a : string) : PreProcessed =
  ## Run every Boyer-Moore preprocessing step for pattern `a` and bundle the
  ## resulting tables into a `PreProcessed`.
  ## NOTE(review): the name is a typo for "preProcess", but it is exported,
  ## so renaming would break callers.
  let runes = a.toRunes()
  PreProcessed(
    text: runes,
    r: badCharacterTable(runes),
    l: goodSuffixTable(a),
    f: fullShiftTable(a))
proc stringSearch*(processed : PreProcessed, t: seq[Rune]) : bool=
  ## Implementation of the Boyer-Moore string search algorithm: returns
  ## whether the preprocessed pattern occurs in `t`.
  # This incorporates numerous ways of preprocessing the pattern to determine
  # the optimal amount to shift the string and skip comparisons. In practice
  # it runs in O(m) (and even sublinear) time, where "m" is the length of "t".
  # NOTE(review): despite operating on runes, the bad-character table only
  # covers code points < AlphabetSize (256); larger runes in "t" would index
  # out of bounds in "r" — confirm inputs, or widen the table.
  # NOTE(review): the guard below returns `true` (match) for an empty pattern,
  # an empty text, AND a text shorter than the pattern — the last two look
  # like they should be `false`; confirm intent.
  let p = processed.text
  let r = processed.r
  let l = processed.l
  let f = processed.f
  if p.len == 0 or t.len == 0 or t.len < p.len: return true
  var k = p.len - 1 # Represents alignment of end of "p" relative to "t".
  var prevk = -1 # Represents alignment in previous phase (Galil's rule).
  while k < t.len:
    var i = p.len - 1 # Character to compare in "p".
    var h = k # Character to compare in "t".
    while i >= 0 and h > prevk and p[i] == t[h]: # Matches starting from end of "p".
      dec i
      dec h
    if i == -1 or h == prevk: # Match has been found (Galil's rule).
      return true
    else: # No match: shift by max of bad character and good suffix rules.
      let charShift = i - r[alphabetIndex(t[h])][i]
      let suffixShift =
        if i + 1 == p.len: # Mismatch happened on first attempt.
          1
        elif l[i + 1] == -1: # Matched suffix does not appear anywhere in "p".
          p.len - f[i + 1]
        else: # Matched suffix appears in "p".
          p.len - 1 - l[i + 1]
      let shift = max(charShift, suffixShift)
      if shift >= i + 1: prevk = k # Galil's rule
      inc k, shift
  # Implicit return: `result` stays false when no occurrence was found.
const
  # Sample texts, converted to runes at compile time.
  Text1 = """InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley
"DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassembly
"languagestoillustratetheconceptsandalgorithmsastheyarepresented""".toRunes()
  Text2 = """Nearby farms grew a half acre of alfalfa on the dairy's behalf,
with bales of all that alfalfa exchanged for milk.""".toRunes()
  # Patterns preprocessed once at compile time.
  (Pat1, Pat2, Pat3) = (prePrcoess "put", prePrcoess "hello", prePrcoess "alfalfa")
# Smoke test: print whether each pattern occurs in its text.
echo Pat1.stringSearch(Text1)
echo Pat2.stringSearch(Text1)
echo Pat3.stringSearch(Text2)

BIN
nim/tests

Binary file not shown.

View file

@ -5,31 +5,153 @@ import sugar
import tables
import times
import algorithm
import strutils
import pretty
import stats
let db = open("../data/testing/testingData.db", "", "", "")
type Accuracy = object
correct : int
incorrect : int
faliures : int
languagesConfusedFor : CountTable[string]
type
  Accuracy = object
    ## Raw classification tallies for one language.
    correct : int       # samples whose top result matched the true language
    incorrect : int     # samples attributed to some other language
    faliures : int      # (sic: "failures") hard failures; spelling kept for interface consistency
    wordCount: int      # size of the common-word list used (main.wordCount)
    languagesConfusedFor : CountTable[string]  # languages this one was mistaken for
    correctWordCounts : CountTable[string]     # word hits recorded on correct classifications
    incorrectWordCounts : CountTable[string]   # word hits credited to this language on wrong classifications
  Score = object
    ## Derived metrics for one language (computed by `score`).
    accuracy : Accuracy            # the raw tallies this score was derived from
    faliureRate : float            # percent of samples misclassified or failed
    totalWordUtilization : int     # good + bad word hits combined
    utlizationPerWord : float      # (sic: "utilization") total hits / wordCount
    totalGoodWordUtilization : int
    goodUtilizationPerWord : float
    percentGoodUtilization : float
    totalBadWordUtilization : int
    badUtilizationPerWord : float
    percentBadUtilization : float
    usedWordPercentGood : float    # percent of the word list seen on correct runs
    usedWordPercentBad : float     # percent of the word list seen on wrong runs
  MacroScore = object
    ## Cross-language aggregation of Scores (built by `makeMacroScore`).
    scores : Table[string, Score]  # per-language scores, keyed by language
    faliureRates : RunningStat
    totalWordUtilizations : RunningStat
    utlizationPerWords : RunningStat
    totalGoodWordUtilizations : RunningStat
    goodUtilizationPerWords : RunningStat
    percentGoodUtilizations : RunningStat
    totalBadWordUtilizations : RunningStat
    badUtilizationPerWords : RunningStat
    percentBadUtilizations : RunningStat
    usedWordPercentsBad : RunningStat
    usedWordPercentsGood : RunningStat
let langs = db.fastRows(sql"select distinct(lang) from TrainingData").toSeq().map(x=> x[0])
var results = initTable[string, Accuracy]()
for lang in main.languages:
results[lang] = Accuracy()
results[lang].wordCount = main.wordCount
for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'en'"):
echo row[0]
if row[0] notin main.languages:
for row in db.fastRows(sql"select Lang, Sample, Rowid from TrainingData"):
var wordCounts = newTable[string, CountTable[string]]()
let language = row[0]
if language notin main.languages:
continue
let result = makeResult doThing(main.languages, row[1])
if result[0][0] == row[0]:
results[row[0]].correct+=1
let result = makeResult doThing(main.languages, row[1], wordCounter = wordCounts)
let correct = result[0][0] == language
if correct:
results[language].correct+=1
else:
results[row[0]].incorrect+=1
results[row[0]].languagesConfusedFor.inc(result[0][0])
results[language].incorrect+=1
results[language].languagesConfusedFor.inc(result[0][0])
proc score(a : Accuracy) : float =
((a.faliures + a.incorrect) / (a.correct + a.incorrect + a.faliures)) * 100
echo results
echo results.pairs.toSeq().map(x=> (x[0], score x[1])).sorted((x,y)=> cmp(x[1], y[1])).filter(x=> x[1] > 0)
for (key,val) in wordCounts.pairs:
if key == language and correct:
results[language].correctWordCounts.merge(val)
else:
results[key].incorrectWordCounts.merge(val)
proc score(a : Table[string, Accuracy]) : Table[string, Score] =
  ## Turn raw per-language Accuracy tallies into derived Score metrics:
  ## word-utilization totals, per-word rates, and failure percentages.
  for (key,val) in a.pairs:
    # NOTE(review): `accuracy` is computed but never used — the failure rate
    # is recomputed at the bottom of the loop; dead code?
    let accuracy = ((val.faliures + val.incorrect) / (val.correct + val.incorrect + val.faliures)) * 100
    var correct = val.correctWordCounts
    var incorrect = val.incorrectWordCounts
    var icount = incorrect.values.toSeq()
    var ccount = correct.values.toSeq()
    # Totals default to 1 (not 0) when the count list is empty — presumably
    # to avoid division by zero below; confirm this bias is intended.
    var goodTotal =
      if ccount.high == -1:
        1
      else:
        ccount.foldl(a+b)
    var badTotal =
      if icount.high == -1:
        1
      else:
        icount.foldl(a+b)
    let total = goodTotal + badTotal
    let totalPerWord = (total) / val.wordCount
    let badPerWord = badTotal / val.wordCount
    let goodPerWord = goodTotal / val.wordCount
    let goodUtilizationPercent = (goodTotal / total)*100
    let badUtilizationPercent = (badTotal / total)*100
    let percentWordsUsedGood = (ccount.len() / val.wordCount) * 100
    let percentWordsUsedBad = (icount.len() / val.wordCount) * 100
    echo icount  # NOTE(review): debug output left in; consider removing
    result[key] = Score()
    result[key].usedWordPercentGood = percentWordsUsedGood
    result[key].usedWordPercentBad = percentWordsUsedBad
    result[key].accuracy = val
    result[key].totalWordUtilization = total
    result[key].utlizationPerWord = totalPerWord
    result[key].totalGoodWordUtilization = goodTotal
    result[key].goodUtilizationPerWord = goodPerWord
    result[key].percentGoodUtilization = goodUtilizationPercent
    result[key].totalBadWordUtilization = badTotal
    result[key].badUtilizationPerWord = badPerWord
    result[key].percentBadUtilization = badUtilizationPercent
    # Failure rate counts both misclassifications and hard failures.
    let faliures = (val.incorrect + val.faliures)
    let totalEntries = faliures + val.correct
    result[key].faliureRate = (faliures / totalEntries)*100
proc makeMacroScore(a : Table[string, Score]) : MacroScore =
  ## Fold every per-language Score into cross-language RunningStat
  ## distributions, keeping the originals available in `scores`.
  result.scores = a
  for (key,val) in a.pairs:
    result.faliureRates.push(val.faliureRate)
    result.totalWordUtilizations.push(val.totalWordUtilization)
    result.utlizationPerWords.push(val.utlizationPerWord)
    result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
    result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
    result.percentGoodUtilizations.push(val.percentGoodUtilization)
    result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
    result.badUtilizationPerWords.push(val.badUtilizationPerWord)
    result.percentBadUtilizations.push(val.percentBadUtilization)
    result.usedWordPercentsBad.push(val.usedWordPercentBad)
    result.usedWordPercentsGood.push(val.usedWordPercentGood)
proc tallyScores(a : MacroScore) : float =
  ## Collapse a MacroScore into a single fitness number: 80% weighted on
  ## classification failure, 20% on bad-word usage.
  ## NOTE(review): presumably lower is better (it aggregates failure rates
  ## and bad-utilization terms) — confirm against the GA driver.
  # Biggest portion, how successful it is
  var successScore = 0.0;  # NOTE(review): initializer is immediately overwritten
  successScore = a.faliureRates.mean + (a.faliureRates.max / 3)
  # Word score penalizes bad-word usage, softened by how much good usage there is.
  var wordScore = ((a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean) + a.percentBadUtilizations.max)
  wordScore *= (110-(a.percentGoodUtilizations.mean)) / 100
  wordScore *= (110-(a.usedWordPercentsGood.mean)) / 100
  result = (successScore*0.80)+(wordScore*0.20)
echo tallyScores makeMacroScore score results