# Zipfs-Law-Language-Detector/nim/createMostCommonWords.nim
#
# 44 lines
# 1 KiB
# Nim

import db_connector/db_sqlite
import bitops
import streams
import os
import strutils
import sugar
import sequtils
import times
import tables
import json
# Builds ../data/mostCommonWords.json: for every word list found in
# ../words/data/wordfrequency.info/, collect a fixed number of distinct
# words and store them under the language key looked up through
# ../data/wikiToMulti.json.

const
  wordsPerLanguage = 200 # exact number of words written per language
  linesToScan = 301      # only this many leading lines of a list are read
  minWordLen = 3         # shorter entries (by byte length) are skipped

let w2m = parseJson readFile("../data/wikiToMulti.json")

# Invert the JSON object so that each string value looks up its key; the
# word-list file names are matched against those values below.
var backwards = initTable[string, string]()
for key, val in w2m.pairs:
  backwards[val.getStr()] = key

var mostCommon = newJObject()
for file in walkDir("../words/data/wordfrequency.info/"):
  # walkDir also yields directories, which readFile cannot open.
  if file.kind notin {pcFile, pcLinkToFile}:
    continue

  # extractFilename honours the platform's path separator; the part of the
  # file name before the first dot is the lookup key.
  let languageRaw = file.path.extractFilename.split(".")[0]
  if languageRaw notin backwards:
    #TODO: FIX CHINESE (ZH)
    echo languageRaw
    continue
  let language = backwards[languageRaw]

  # splitLines copes with both LF and CRLF line endings, so no stray '\r'
  # ends up attached to a word.
  let lines = readFile(file.path).splitLines()

  var wordsAdded = newSeqOfCap[string](wordsPerLanguage)
  for i, word in lines:
    # Stop at the scan window or as soon as the quota is filled. Bounding by
    # the index avoids an IndexDefect on lists shorter than the window.
    if i >= linesToScan or wordsAdded.len == wordsPerLanguage:
      break
    if word.len < minWordLen: continue
    if word == language: continue
    if word in wordsAdded: continue
    wordsAdded.add word

  if wordsAdded.len != wordsPerLanguage:
    raise newException(ValueError,
      "expected " & $wordsPerLanguage & " words for '" & language &
      "' but found " & $wordsAdded.len & " in " & file.path)
  mostCommon[language] = %wordsAdded

writeFile("../data/mostCommonWords.json", $mostCommon)