Search didn't work in testing. Tests finished, final phase. Making genetic algorithm

This commit is contained in:
user 2024-08-02 08:22:19 -04:00
parent d739d6aa29
commit 0ebe6f65fe
10 changed files with 1030 additions and 224 deletions

View file

@ -1082,5 +1082,869 @@
"ײ",
"׳",
"״"
],
"Malayalam": [
"അ",
"ആ",
"ഇ",
"ഈ",
"ഉ",
"ഊ",
"ഋ",
"ഌ",
"എ",
"ഏ",
"ഐ",
"ഒ",
"ഓ",
"ഔ",
"ക",
"ഖ",
"ഗ",
"ഘ",
"ങ",
"ച",
"ഛ",
"ജ",
"ഝ",
"ഞ",
"ട",
"ഠ",
"ഡ",
"ഢ",
"ണ",
"ത",
"ഥ",
"ദ",
"ധ",
"ന",
"ഩ",
"പ",
"ഫ",
"ബ",
"ഭ",
"മ",
"യ",
"ര",
"റ",
"ല",
"ള",
"ഴ",
"വ",
"ശ",
"ഷ",
"സ",
"ഹ",
"ഺ",
"ഽ",
"൏",
"ൔ",
"ൕ",
"ൖ",
"൘",
"൙",
"൚",
"൛",
"൜",
"൝",
"൞",
"ൟ",
"ൠ",
"ൡ",
"൦",
"൧",
"൨",
"൩",
"൪",
"൫",
"൬",
"൭",
"൮",
"൯",
"൰",
"൱",
"൲",
"൳",
"൴",
"൵",
"൶",
"൷",
"൸",
"൹",
"ൺ",
"ൻ",
"ർ",
"ൽ",
"ൾ",
"ൿ"
],
"bengali": [
"অ",
"আ",
"ই",
"ঈ",
"উ",
"ঊ",
"ঋ",
"ঌ",
"এ",
"ঐ",
"ও",
"ঔ",
"ক",
"খ",
"গ",
"ঘ",
"ঙ",
"চ",
"ছ",
"জ",
"ঝ",
"ঞ",
"ট",
"ঠ",
"ড",
"ঢ",
"ণ",
"ত",
"থ",
"দ",
"ধ",
"ন",
"প",
"ফ",
"ব",
"ভ",
"ম",
"য",
"র",
"ল",
"শ",
"ষ",
"স",
"হ",
"ড়",
"ঢ়",
"য়",
"ৠ",
"ৡ",
"১",
"২",
"৩",
"৪",
"৫",
"৬",
"৭",
"৮",
"৯",
"ৰ",
"ৱ",
"৲",
"৳",
"৴",
"৵",
"৶",
"৷",
"৸",
"৹",
"৺",
"৻",
"ৼ"
],
"Punjabi": [
"ਅ",
"ਆ",
"ਇ",
"ਈ",
"ਉ",
"ਊ",
"ਏ",
"ਐ",
"ਓ",
"ਔ",
"ਕ",
"ਖ",
"ਗ",
"ਘ",
"ਙ",
"ਚ",
"ਛ",
"ਜ",
"ਝ",
"ਞ",
"ਟ",
"ਠ",
"ਡ",
"ਢ",
"ਣ",
"ਤ",
"ਥ",
"ਦ",
"ਧ",
"ਨ",
"ਪ",
"ਫ",
"ਬ",
"ਭ",
"ਮ",
"ਯ",
"ਰ",
"ਲ",
"ਲ਼",
"ਵ",
"ਸ਼",
"ਸ",
"ਹ",
"ਖ਼",
"ਗ਼",
"ਜ਼",
"ੜ",
"ਫ਼",
"੦",
"੧",
"੨",
"੩",
"੪",
"੫",
"੬",
"੭",
"੮",
"੯",
"ੲ",
"ੳ",
"ੴ"
],
"AramaicEtheopian": [
"ሀ",
"ሁ",
"ሂ",
"ሃ",
"ሄ",
"ህ",
"ሆ",
"ሇ",
"ለ",
"ሉ",
"ሊ",
"ላ",
"ሌ",
"ል",
"ሎ",
"ሏ",
"ሐ",
"ሑ",
"ሒ",
"ሓ",
"ሔ",
"ሕ",
"ሖ",
"ሗ",
"መ",
"ሙ",
"ሚ",
"ማ",
"ሜ",
"ም",
"ሞ",
"ሟ",
"ሠ",
"ሡ",
"ሢ",
"ሣ",
"ሤ",
"ሥ",
"ሦ",
"ሧ",
"ረ",
"ሩ",
"ሪ",
"ራ",
"ሬ",
"ር",
"ሮ",
"ሯ",
"ሰ",
"ሱ",
"ሲ",
"ሳ",
"ሴ",
"ስ",
"ሶ",
"ሷ",
"ሸ",
"ሹ",
"ሺ",
"ሻ",
"ሼ",
"ሽ",
"ሾ",
"ሿ",
"ቀ",
"ቁ",
"ቂ",
"ቃ",
"ቄ",
"ቅ",
"ቆ",
"ቇ",
"ቈ",
"ቊ",
"ቋ",
"ቌ",
"ቍ",
"ቐ",
"ቑ",
"ቒ",
"ቓ",
"ቔ",
"ቕ",
"ቖ",
"ቘ",
"ቚ",
"ቛ",
"ቜ",
"ቝ",
"በ",
"ቡ",
"ቢ",
"ባ",
"ቤ",
"ብ",
"ቦ",
"ቧ",
"ቨ",
"ቩ",
"ቪ",
"ቫ",
"ቬ",
"ቭ",
"ቮ",
"ቯ",
"ተ",
"ቱ",
"ቲ",
"ታ",
"ቴ",
"ት",
"ቶ",
"ቷ",
"ቸ",
"ቹ",
"ቺ",
"ቻ",
"ቼ",
"ች",
"ቾ",
"ቿ",
"ኀ",
"ኁ",
"ኂ",
"ኃ",
"ኄ",
"ኅ",
"ኆ",
"ኇ",
"ኈ",
"ኊ",
"ኋ",
"ኌ",
"ኍ",
"ነ",
"ኑ",
"ኒ",
"ና",
"ኔ",
"ን",
"ኖ",
"ኗ",
"ኘ",
"ኙ",
"ኚ",
"ኛ",
"ኜ",
"ኝ",
"ኞ",
"ኟ",
"አ",
"ኡ",
"ኢ",
"ኣ",
"ኤ",
"እ",
"ኦ",
"ኧ",
"ከ",
"ኩ",
"ኪ",
"ካ",
"ኬ",
"ክ",
"ኮ",
"ኯ",
"ኰ",
"ኲ",
"ኳ",
"ኴ",
"ኵ",
"ኸ",
"ኹ",
"ኺ",
"ኻ",
"ኼ",
"ኽ",
"ኾ",
"ዀ",
"ዂ",
"ዃ",
"ዄ",
"ዅ",
"ወ",
"ዉ",
"ዊ",
"ዋ",
"ዌ",
"ው",
"ዎ",
"ዏ",
"ዐ",
"ዑ",
"ዒ",
"ዓ",
"ዔ",
"ዕ",
"ዖ",
"ዘ",
"ዙ",
"ዚ",
"ዛ",
"ዜ",
"ዝ",
"ዞ",
"ዟ",
"ዠ",
"ዡ",
"ዢ",
"ዣ",
"ዤ",
"ዥ",
"ዦ",
"ዧ",
"የ",
"ዩ",
"ዪ",
"ያ",
"ዬ",
"ይ",
"ዮ",
"ዯ",
"ደ",
"ዱ",
"ዲ",
"ዳ",
"ዴ",
"ድ",
"ዶ",
"ዷ",
"ዸ",
"ዹ",
"ዺ",
"ዻ",
"ዼ",
"ዽ",
"ዾ",
"ዿ",
"ጀ",
"ጁ",
"ጂ",
"ጃ",
"ጄ",
"ጅ",
"ጆ",
"ጇ",
"ገ",
"ጉ",
"ጊ",
"ጋ",
"ጌ",
"ግ",
"ጎ",
"ጏ",
"ጐ",
"ጒ",
"ጓ",
"ጔ",
"ጕ",
"ጘ",
"ጙ",
"ጚ",
"ጛ",
"ጜ",
"ጝ",
"ጞ",
"ጟ",
"ጠ",
"ጡ",
"ጢ",
"ጣ",
"ጤ",
"ጥ",
"ጦ",
"ጧ",
"ጨ",
"ጩ",
"ጪ",
"ጫ",
"ጬ",
"ጭ",
"ጮ",
"ጯ",
"ጰ",
"ጱ",
"ጲ",
"ጳ",
"ጴ",
"ጵ",
"ጶ",
"ጷ",
"ጸ",
"ጹ",
"ጺ",
"ጻ",
"ጼ",
"ጽ",
"ጾ",
"ጿ",
"ፀ",
"ፁ",
"ፂ",
"ፃ",
"ፄ",
"ፅ",
"ፆ",
"ፇ",
"ፈ",
"ፉ",
"ፊ",
"ፋ",
"ፌ",
"ፍ",
"ፎ",
"ፏ",
"ፐ",
"ፑ",
"ፒ",
"ፓ",
"ፔ",
"ፕ",
"ፖ",
"ፗ",
"ፘ",
"ፙ",
"ፚ",
"፩",
"፪",
"፫",
"፬",
"፭",
"፮",
"፯",
"፰",
"፱",
"፲",
"፳",
"፴",
"፵",
"፶",
"፷",
"፸",
"፹",
"፺",
"፻",
"፼"
],
"Kannada": [
"಄",
"ಅ",
"ಆ",
"ಇ",
"ಈ",
"ಉ",
"ಊ",
"ಋ",
"ಌ",
"ಎ",
"ಏ",
"ಐ",
"ಒ",
"ಓ",
"ಔ",
"ಕ",
"ಖ",
"ಗ",
"ಘ",
"ಙ",
"ಚ",
"ಛ",
"ಜ",
"ಝ",
"ಞ",
"ಟ",
"ಠ",
"ಡ",
"ಢ",
"ಣ",
"ತ",
"ಥ",
"ದ",
"ಧ",
"ನ",
"ಪ",
"ಫ",
"ಬ",
"ಭ",
"ಮ",
"ಯ",
"ರ",
"ಱ",
"ಲ",
"ಳ",
"ವ",
"ಶ",
"ಷ",
"ಸ",
"ಹ",
"ಽ",
"ಾ",
"ಿ",
"ೀ",
"ು",
"ೂ",
"ೊ",
"ೋ",
"೦",
"೧",
"೨",
"೩",
"೪",
"೫",
"೬",
"೭",
"೮",
"೯"
],
"Tamil": [
"ஃ",
"அ",
"ஆ",
"இ",
"ஈ",
"உ",
"ஊ",
"எ",
"ஏ",
"ஐ",
"ஒ",
"ஓ",
"ஔ",
"க",
"ங",
"ச",
"ஜ",
"ஞ",
"ட",
"ண",
"த",
"ந",
"ன",
"ப",
"ம",
"ய",
"ர",
"ற",
"ல",
"ள",
"ழ",
"வ",
"ஶ",
"ஷ",
"ஸ",
"ஹ",
"ௐ",
"ௗ",
"௦",
"௧",
"௨",
"௩",
"௪",
"௫",
"௬",
"௭",
"௮",
"௯",
"௰",
"௱",
"௲",
"௳",
"௴",
"௵",
"௶",
"௷",
"௸",
"௹",
"௺"
],
"Gujarati": [
"અ",
"આ",
"ઇ",
"ઈ",
"ઉ",
"ઊ",
"ઋ",
"ઌ",
"ઍ",
"એ",
"ઐ",
"ઑ",
"ઓ",
"ઔ",
"ક",
"ખ",
"ગ",
"ઘ",
"ઙ",
"ચ",
"છ",
"જ",
"ઝ",
"ઞ",
"ટ",
"ઠ",
"ડ",
"ઢ",
"ણ",
"ત",
"થ",
"દ",
"ધ",
"ન",
"પ",
"ફ",
"બ",
"ભ",
"મ",
"ય",
"ર",
"લ",
"ળ",
"વ",
"શ",
"ષ",
"સ",
"હ",
"૦",
"૧",
"૨",
"૩",
"૪",
"૫",
"૬",
"૭",
"૮",
"૯",
"ૠ",
"ૡ",
"૰",
"૱",
"ૹ"
],
"Georgian": [
"Ⴁ",
"Ⴂ",
"Ⴃ",
"Ⴄ",
"Ⴅ",
"Ⴆ",
"Ⴇ",
"Ⴈ",
"Ⴉ",
"Ⴊ",
"Ⴋ",
"Ⴌ",
"Ⴍ",
"Ⴎ",
"Ⴏ",
"Ⴐ",
"Ⴑ",
"Ⴒ",
"Ⴓ",
"Ⴔ",
"Ⴕ",
"Ⴖ",
"Ⴗ",
"Ⴘ",
"Ⴙ",
"Ⴚ",
"Ⴛ",
"Ⴜ",
"Ⴝ",
"Ⴞ",
"Ⴟ",
"Ⴠ",
"Ⴡ",
"Ⴢ",
"Ⴣ",
"Ⴤ",
"Ⴥ",
"Ⴧ",
"Ⴭ",
"ა",
"ბ",
"გ",
"დ",
"ე",
"ვ",
"ზ",
"თ",
"ი",
"კ",
"ლ",
"მ",
"ნ",
"ო",
"პ",
"ჟ",
"რ",
"ს",
"ტ",
"უ",
"ფ",
"ქ",
"ღ",
"ყ",
"შ",
"ჩ",
"ც",
"ძ",
"წ",
"ჭ",
"ხ",
"ჯ",
"ჰ",
"ჱ",
"ჲ",
"ჳ",
"ჴ",
"ჵ",
"ჶ",
"ჷ",
"ჸ",
"ჹ",
"ჺ",
"჻",
"ჼ",
"ჽ",
"ჾ",
"ჿ"
]
}

File diff suppressed because one or more lines are too long

BIN
nim/createMostCommonWords Executable file

Binary file not shown.

View file

@ -32,12 +32,12 @@ for file in walkDir("../words/data/wordfrequency.info/"):
for word in words:
if 2 >= word.len():
continue
if wordsAdded.high == 99: break
if wordsAdded.high == 199: break
if word == language: continue
if word in wordsAdded: continue
wordsAdded.add word
if wordsAdded.high != 99:
if wordsAdded.high != 199:
raise new CatchableError
result[language] = %* wordsAdded

BIN
nim/main

Binary file not shown.

View file

@ -14,38 +14,21 @@ import sets
import algorithm
import math
type LPS = object
lpsArray : seq[int]
pattern : seq[Rune]
const wordCount* = 30
proc createLps(a : string) =
  ## Compute and print the KMP "longest proper prefix that is also a suffix"
  ## (LPS / failure-function) table for the pattern `a`.
  ## NOTE(review): the table is only `echo`ed — nothing returns or stores it,
  ## and the `LPS` object type declared above is never populated; presumably
  ## work in progress — confirm intended use.
  let runified = a.toRunes()
  var lps = newSeq[uint](runified.len())  # lps[0] stays 0 by default
  var j : uint = 0  # length of the currently matched prefix
  for i in 1 .. runified.high:
    # On mismatch, fall back through shorter candidate prefixes until a
    # match is found or j reaches 0.
    while runified[j] != runified[i] and j != 0:
      j = lps[j-1]
    if runified[j] == runified[i]:
      j+=1
    lps[i] = j
  echo lps
proc wtvr(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
proc generateMostCommonWords(a : Table[string, seq[string]]) : Table[string, HashSet[string]]=
for (key,val) in a.pairs:
result[key] = toHashSet val
result[key] = toHashSet (val[0 .. wordCount-1])
const resultText = staticRead("../data/mostCommonCharacters.json")
const charactersJson = staticRead("../data/alphabets.json")
const wikiToEnglish = staticRead("../data/wikiToEng.json")
const mostCommonWords = wtvr((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
const mostCommonWords = generateMostCommonWords((parseJson staticRead("../data/mostCommonWords.json")).to(Table[string, seq[string]]))
const forbiddenChars = @[","].join("").toRunes().toSeq()
proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
if a in forbiddenChars:
return (false, Rune(0))
if a notin allValidChars:
return (false, Rune(0))
#Hangul, unused
if int(a) in 0xAC00..0xD7AF:
return (true, a)
@ -53,6 +36,9 @@ proc isUsedChar(a : Rune, allValidChars : HashSet[Rune]) : (bool, Rune) =
if int(a) in 0x4E00..0x9FFF:
return (true, a)
if a notin allValidChars:
return (false, Rune(0))
if a.isUpper():
if a in forbiddenChars:
return (false, Rune(0))
@ -166,7 +152,9 @@ proc neighborDistance(a : Table[Rune, float], b : Table[Rune, float]) : float =
resultBuffer[i] = (cubic(abs(a[char]-distance)))
return resultBuffer.foldl(a+b)
proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, float] =
proc doThing*(comparisonLangs : seq[string], sample : string,
wordCounter : TableRef[string, CountTable[string]] = nil,
words : Table[string, HashSet[string]] = mostCommonWords) : Table[string, float] =
let deNoised = comparisonLangs.map(x => statistics[x])
var runeStr = newSeq[Rune](sample.len())
let (stringSlope, runeLength) = createStringSlope(sample, runeStr)
@ -189,8 +177,7 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
let subsample = subsampleRunes.join("")
let characters = sample.toRunes().deduplicate().map(x=> isUsedChar(x, allValidChars)).map(x=>x[1])
let distinctCharacters = characters.filter(x=> int(x) != 0)
let characters = runeStr.deduplicate()
var potentialLanguages = toHasHSet comparisonLangs
#This is needed for identifying chinese and not mixing up japanese
@ -225,18 +212,25 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
else:
result[language] = neighborDistance(stringSlope, slope)
if language notin mostCommonWords:
for c in characters:
if c notin slope:
result[language] *= 1.05
if language notin words:
echo language
continue
let mostCommon = mostCommonWords[language]
if wordCounter != nil and language notin wordCounter:
wordCounter[language] = initCountTable[string]()
let mostCommon = words[language]
#TODO: Make this not be insane
for word in mostCommon:
if subsample.contains(word):
result[language] *= 0.80
result[language] *= 0.7
if wordCounter != nil:
wordCounter[language].inc(word)
for c in distinctCharacters:
if c notin slope:
result[language] *= 1.05
if "ko" in potentialLanguages:
if "ko" notin result:
@ -253,5 +247,3 @@ proc doThing*(comparisonLangs : seq[string], sample : string) : Table[string, fl
proc makeResult*(oldResult : Table[string, float]) : seq[(string, float)] =
result = oldResult.pairs().toSeq()
result.sort((x,y)=> cmp(x[1], y[1]))
createLps("abacus")

Binary file not shown.

View file

@ -1,172 +0,0 @@
#[ This version uses ASCII for case-sensitive matching. For Unicode you may want to match in UTF-8
bytes instead of creating a 0x10FFFF-sized table.
]#
import std/[algorithm, sequtils, strutils, unicode]
type PreProcessed = object
  ## Precomputed Boyer-Moore search tables for a single pattern.
  r : seq[seq[int]]  # bad-character table "R" (built by badCharacterTable)
  l : seq[int]       # strong good-suffix table "L" (built by goodSuffixTable)
  f : seq[int]       # full-shift table "F" (built by fullShiftTable)
  text : seq[Rune]   # the pattern itself, as runes
# First-dimension size of the bad-character table; characters are mapped
# into it by raw code point (see alphabetIndex).
const AlphabetSize = 256
func reversed(s: string): string =
  ## Return `s` with its characters in reverse order (byte-wise; intended
  ## for ASCII strings).
  result = newStringOfCap(s.len)
  var i = s.high
  while i >= 0:
    result.add s[i]
    dec i
func alphabetIndex(c: Rune): int =
  ## Map a rune to its slot in the character tables (its raw code point).
  ## NOTE(review): code points >= AlphabetSize (256) exceed the 256-entry
  ## tables built in `badCharacterTable` — confirm inputs are ASCII/Latin-1.
  int(c)
func matchLength(s: string; idx1, idx2: int): int =
  ## Length of the common prefix of the two suffixes of `s` that begin at
  ## `idx1` and `idx2`.
  if idx1 == idx2:
    # Identical start: the whole remaining suffix matches itself.
    return s.len - idx1
  var a = idx1
  var b = idx2
  result = 0
  while a < s.len and b < s.len and s[a] == s[b]:
    inc result
    inc a
    inc b
proc fundamentalPreprocess(s: string): seq[int] =
  ## Return "z", the Fundamental Preprocessing (Z-array) of "s".
  # "z[i]" is the length of the substring beginning at "i" which is also a prefix of "s".
  # This preprocessing is done in O(n) time, where n is the length of "s".
  let length = s.len()
  if s.len == 0: return
  if s.len == 1: return @[1]
  result = newSeq[int](length)
  result[0] = length
  result[1] = s.matchLength(0, 1)
  # Positions covered by the initial z-box at index 1 can be filled directly.
  for i in 2..result[1]:
    result[i] = result[1] - i + 1
  # Defines lower and upper limits of z-box.
  var l, r = 0
  for i in (2 + result[1])..s.high:
    if i <= r: # "i" falls within existing z-box.
      let k = i - l
      let b = result[k]
      let a = r - i + 1
      if b < a: # "b" ends within existing z-box.
        result[i] = b
      else: # "b" ends at or after the end of the z-box.
        # We need to do an explicit match to the right of the z-box.
        result[i] = a + s.matchLength(a, r + 1)
        l = i
        r = i + result[i] - 1
    else: # "i" does not reside within existing z-box.
      result[i] = s.matchLength(0, i)
      if result[i] > 0:
        l = i
        r = i + result[i] - 1
proc badCharacterTable(s: seq[Rune]): seq[seq[int]] =
  ## Generates "R" for "s", which is an array indexed by the position of some character "c"
  ## in the ASCII table.
  # At that index in "R" is an array of length |s|+1, specifying for each index "i" in "s"
  # (plus the index after "s") the next location of character "c" encountered when traversing
  # "S" from right to left starting at "i". This is used for a constant-time lookup for the bad
  # character rule in the Boyer-Moore string search algorithm, although it has a much larger
  # size than non-constant-time solutions.
  # NOTE(review): `alphabetIndex` returns the raw code point, so runes with
  # code points >= AlphabetSize (256) would index `alpha` out of bounds —
  # confirm patterns are restricted to Latin-1.
  if s.len == 0: return newSeqWith(AlphabetSize, newSeq[int]())
  result = repeat(@[-1], AlphabetSize)
  var alpha = repeat(-1, AlphabetSize)
  for i, c in s:
    alpha[alphabetIndex(c)] = i
    # Snapshot the rightmost-occurrence array after every pattern position.
    for j, a in alpha:
      result[j].add a
proc goodSuffixTable(s: string): seq[int] =
  ## Generates "L" for "s", an array used in the implementation of the strong good suffix rule.
  # "L[i] = k", the largest position in S such that "s[i:]" (the suffix of "s" starting at "i")
  # matches a suffix of "s[:k]" (a substring in "s" ending at "k"). Used in Boyer-Moore, "L"
  # gives an amount to shift "P" relative to "T" such that no instances of "P" in "T" are skipped
  # and a suffix of "P[:L[i]]" matches the substring of "T" matched by a suffix of "P" in the
  # previous match attempt.
  # Specifically, if the mismatch took place at position "i-1" in "P", the shift magnitude is
  # given by the formula "len(P) - L[i]". In the case that "L[i] = -1", the full shift table
  # is used. Since only proper suffixes matter, "L[0] = -1".
  result = repeat(-1, s.len)
  # Z-values of the reversed pattern, re-reversed, give the classic "N" array.
  var n = fundamentalPreprocess(reversed(s))
  n.reverse()
  for j in 0..(s.len - 2):
    let i = s.len - n[j]
    if i != s.len:
      result[i] = j
proc fullShiftTable(s: string): seq[int] =
  ## Generates "F" for "s", an array used in a special case of the good suffix rule in the
  ## Boyer-Moore string search algorithm.
  # "F[i]" is the length of the longest suffix of "s[i:]" that is also a prefix of "s". In
  # the cases it is used, the shift magnitude of the pattern "P" relative to the text "T" is
  # "len(P) - F[i]" for a mismatch occurring at "i-1".
  result = repeat(0, s.len)
  let z = fundamentalPreprocess(s)
  var longest = 0
  # Walk z from the end; a z-value equal to its distance from the end marks a
  # suffix that is also a prefix, and `longest` carries it leftward.
  for i, zv in reversed(z):
    if zv == i + 1:
      longest = max(zv, longest)
    result[^(i + 1)] = longest
proc prePrcoess*(a : string) : PreProcessed =
  ## Run every Boyer-Moore preprocessing step for pattern `a` and bundle the
  ## resulting tables into a `PreProcessed`.
  ## NOTE(review): the name is a typo for "preProcess", but it is exported,
  ## so renaming would break callers.
  let runes = a.toRunes()
  PreProcessed(
    text: runes,
    r: badCharacterTable(runes),
    l: goodSuffixTable(a),
    f: fullShiftTable(a))
proc stringSearch*(processed : PreProcessed, t: seq[Rune]) : bool=
  ## Implementation of the Boyer-Moore string search algorithm: returns
  ## whether the preprocessed pattern occurs in `t`.
  # This incorporates numerous ways of preprocessing the pattern to determine
  # the optimal amount to shift the string and skip comparisons. In practice
  # it runs in O(m) (and even sublinear) time, where "m" is the length of "t".
  # NOTE(review): despite operating on runes, the bad-character table only
  # covers code points < AlphabetSize (256); larger runes in "t" would index
  # out of bounds in "r" — confirm inputs, or widen the table.
  # NOTE(review): the guard below returns `true` (match) for an empty pattern,
  # an empty text, AND a text shorter than the pattern — the last two look
  # like they should be `false`; confirm intent.
  let p = processed.text
  let r = processed.r
  let l = processed.l
  let f = processed.f
  if p.len == 0 or t.len == 0 or t.len < p.len: return true
  var k = p.len - 1 # Represents alignment of end of "p" relative to "t".
  var prevk = -1 # Represents alignment in previous phase (Galil's rule).
  while k < t.len:
    var i = p.len - 1 # Character to compare in "p".
    var h = k # Character to compare in "t".
    while i >= 0 and h > prevk and p[i] == t[h]: # Matches starting from end of "p".
      dec i
      dec h
    if i == -1 or h == prevk: # Match has been found (Galil's rule).
      return true
    else: # No match: shift by max of bad character and good suffix rules.
      let charShift = i - r[alphabetIndex(t[h])][i]
      let suffixShift =
        if i + 1 == p.len: # Mismatch happened on first attempt.
          1
        elif l[i + 1] == -1: # Matched suffix does not appear anywhere in "p".
          p.len - f[i + 1]
        else: # Matched suffix appears in "p".
          p.len - 1 - l[i + 1]
      let shift = max(charShift, suffixShift)
      if shift >= i + 1: prevk = k # Galil's rule
      inc k, shift
  # Implicit return: `result` stays false when no occurrence was found.
const
  # Sample texts, converted to runes at compile time.
  Text1 = """InhisbookseriesTheArtofComputerProgrammingpublishedbyAddisonWesley
"DKnuthusesanimaginarycomputertheMIXanditsassociatedmachinecodeandassembly
"languagestoillustratetheconceptsandalgorithmsastheyarepresented""".toRunes()
  Text2 = """Nearby farms grew a half acre of alfalfa on the dairy's behalf,
with bales of all that alfalfa exchanged for milk.""".toRunes()
  # Patterns preprocessed once at compile time.
  (Pat1, Pat2, Pat3) = (prePrcoess "put", prePrcoess "hello", prePrcoess "alfalfa")
# Smoke test: print whether each pattern occurs in its text.
echo Pat1.stringSearch(Text1)
echo Pat2.stringSearch(Text1)
echo Pat3.stringSearch(Text2)

BIN
nim/tests

Binary file not shown.

View file

@ -5,31 +5,153 @@ import sugar
import tables
import times
import algorithm
import strutils
import pretty
import stats
let db = open("../data/testing/testingData.db", "", "", "")
type Accuracy = object
correct : int
incorrect : int
faliures : int
languagesConfusedFor : CountTable[string]
type
  Accuracy = object
    ## Raw classification tallies for one language.
    correct : int       # samples whose top result matched the true language
    incorrect : int     # samples attributed to some other language
    faliures : int      # (sic: "failures") hard failures; spelling kept for interface consistency
    wordCount: int      # size of the common-word list used (main.wordCount)
    languagesConfusedFor : CountTable[string]  # languages this one was mistaken for
    correctWordCounts : CountTable[string]     # word hits recorded on correct classifications
    incorrectWordCounts : CountTable[string]   # word hits credited to this language on wrong classifications
  Score = object
    ## Derived metrics for one language (computed by `score`).
    accuracy : Accuracy            # the raw tallies this score was derived from
    faliureRate : float            # percent of samples misclassified or failed
    totalWordUtilization : int     # good + bad word hits combined
    utlizationPerWord : float      # (sic: "utilization") total hits / wordCount
    totalGoodWordUtilization : int
    goodUtilizationPerWord : float
    percentGoodUtilization : float
    totalBadWordUtilization : int
    badUtilizationPerWord : float
    percentBadUtilization : float
    usedWordPercentGood : float    # percent of the word list seen on correct runs
    usedWordPercentBad : float     # percent of the word list seen on wrong runs
  MacroScore = object
    ## Cross-language aggregation of Scores (built by `makeMacroScore`).
    scores : Table[string, Score]  # per-language scores, keyed by language
    faliureRates : RunningStat
    totalWordUtilizations : RunningStat
    utlizationPerWords : RunningStat
    totalGoodWordUtilizations : RunningStat
    goodUtilizationPerWords : RunningStat
    percentGoodUtilizations : RunningStat
    totalBadWordUtilizations : RunningStat
    badUtilizationPerWords : RunningStat
    percentBadUtilizations : RunningStat
    usedWordPercentsBad : RunningStat
    usedWordPercentsGood : RunningStat
let langs = db.fastRows(sql"select distinct(lang) from TrainingData").toSeq().map(x=> x[0])
var results = initTable[string, Accuracy]()
for lang in main.languages:
results[lang] = Accuracy()
results[lang].wordCount = main.wordCount
for row in db.fastRows(sql"select Lang, Sample from TrainingData where lang = 'en'"):
echo row[0]
if row[0] notin main.languages:
for row in db.fastRows(sql"select Lang, Sample, Rowid from TrainingData"):
var wordCounts = newTable[string, CountTable[string]]()
let language = row[0]
if language notin main.languages:
continue
let result = makeResult doThing(main.languages, row[1])
if result[0][0] == row[0]:
results[row[0]].correct+=1
let result = makeResult doThing(main.languages, row[1], wordCounter = wordCounts)
let correct = result[0][0] == language
if correct:
results[language].correct+=1
else:
results[row[0]].incorrect+=1
results[row[0]].languagesConfusedFor.inc(result[0][0])
results[language].incorrect+=1
results[language].languagesConfusedFor.inc(result[0][0])
proc score(a : Accuracy) : float =
((a.faliures + a.incorrect) / (a.correct + a.incorrect + a.faliures)) * 100
echo results
echo results.pairs.toSeq().map(x=> (x[0], score x[1])).sorted((x,y)=> cmp(x[1], y[1])).filter(x=> x[1] > 0)
for (key,val) in wordCounts.pairs:
if key == language and correct:
results[language].correctWordCounts.merge(val)
else:
results[key].incorrectWordCounts.merge(val)
proc score(a : Table[string, Accuracy]) : Table[string, Score] =
  ## Turn raw per-language Accuracy tallies into derived Score metrics:
  ## word-utilization totals, per-word rates, and failure percentages.
  for (key,val) in a.pairs:
    # NOTE(review): `accuracy` is computed but never used — the failure rate
    # is recomputed at the bottom of the loop; dead code?
    let accuracy = ((val.faliures + val.incorrect) / (val.correct + val.incorrect + val.faliures)) * 100
    var correct = val.correctWordCounts
    var incorrect = val.incorrectWordCounts
    var icount = incorrect.values.toSeq()
    var ccount = correct.values.toSeq()
    # Totals default to 1 (not 0) when the count list is empty — presumably
    # to avoid division by zero below; confirm this bias is intended.
    var goodTotal =
      if ccount.high == -1:
        1
      else:
        ccount.foldl(a+b)
    var badTotal =
      if icount.high == -1:
        1
      else:
        icount.foldl(a+b)
    let total = goodTotal + badTotal
    let totalPerWord = (total) / val.wordCount
    let badPerWord = badTotal / val.wordCount
    let goodPerWord = goodTotal / val.wordCount
    let goodUtilizationPercent = (goodTotal / total)*100
    let badUtilizationPercent = (badTotal / total)*100
    let percentWordsUsedGood = (ccount.len() / val.wordCount) * 100
    let percentWordsUsedBad = (icount.len() / val.wordCount) * 100
    echo icount  # NOTE(review): debug output left in; consider removing
    result[key] = Score()
    result[key].usedWordPercentGood = percentWordsUsedGood
    result[key].usedWordPercentBad = percentWordsUsedBad
    result[key].accuracy = val
    result[key].totalWordUtilization = total
    result[key].utlizationPerWord = totalPerWord
    result[key].totalGoodWordUtilization = goodTotal
    result[key].goodUtilizationPerWord = goodPerWord
    result[key].percentGoodUtilization = goodUtilizationPercent
    result[key].totalBadWordUtilization = badTotal
    result[key].badUtilizationPerWord = badPerWord
    result[key].percentBadUtilization = badUtilizationPercent
    # Failure rate counts both misclassifications and hard failures.
    let faliures = (val.incorrect + val.faliures)
    let totalEntries = faliures + val.correct
    result[key].faliureRate = (faliures / totalEntries)*100
proc makeMacroScore(a : Table[string, Score]) : MacroScore =
  ## Fold every per-language Score into cross-language RunningStat
  ## distributions, keeping the originals available in `scores`.
  result.scores = a
  for (key,val) in a.pairs:
    result.faliureRates.push(val.faliureRate)
    result.totalWordUtilizations.push(val.totalWordUtilization)
    result.utlizationPerWords.push(val.utlizationPerWord)
    result.totalGoodWordUtilizations.push(val.totalGoodWordUtilization)
    result.goodUtilizationPerWords.push(val.goodUtilizationPerWord)
    result.percentGoodUtilizations.push(val.percentGoodUtilization)
    result.totalBadWordUtilizations.push(val.totalBadWordUtilization)
    result.badUtilizationPerWords.push(val.badUtilizationPerWord)
    result.percentBadUtilizations.push(val.percentBadUtilization)
    result.usedWordPercentsBad.push(val.usedWordPercentBad)
    result.usedWordPercentsGood.push(val.usedWordPercentGood)
proc tallyScores(a : MacroScore) : float =
  ## Collapse a MacroScore into a single fitness number: 80% weighted on
  ## classification failure, 20% on bad-word usage.
  ## NOTE(review): presumably lower is better (it aggregates failure rates
  ## and bad-utilization terms) — confirm against the GA driver.
  # Biggest portion, how successful it is
  var successScore = 0.0;  # NOTE(review): initializer is immediately overwritten
  successScore = a.faliureRates.mean + (a.faliureRates.max / 3)
  # Word score penalizes bad-word usage, softened by how much good usage there is.
  var wordScore = ((a.percentBadUtilizations.mean + a.badUtilizationPerWords.mean) + a.percentBadUtilizations.max)
  wordScore *= (110-(a.percentGoodUtilizations.mean)) / 100
  wordScore *= (110-(a.usedWordPercentsGood.mean)) / 100
  result = (successScore*0.80)+(wordScore*0.20)
echo tallyScores makeMacroScore score results