Added word Occurrences

This commit is contained in:
user 2024-07-27 13:48:44 -04:00
parent 15c2a11ce1
commit b302ddcbf3
12 changed files with 254 additions and 825 deletions

3
.gitmodules vendored
View file

@ -1,3 +0,0 @@
[submodule "words"]
path = words
url = https://github.com/frekwencja/most-common-words-multilingual

View file

@ -1,659 +0,0 @@
{
"alphabets:": [
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"J",
"K",
"L",
"M",
"N",
"O",
"P",
"Q",
"R",
"S",
"T",
"U",
"V",
"W",
"X",
"Y",
"Z",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z"
],
"greek": [
"Α",
"Β",
"Γ",
"Δ",
"Ε",
"Ζ",
"Η",
"Θ",
"Ι",
"Κ",
"Λ",
"Μ",
"Ν",
"Ξ",
"Ο",
"Π",
"Ρ",
"Σ",
"Τ",
"Υ",
"Φ",
"Χ",
"Ψ",
"Ω",
"α",
"β",
"γ",
"δ",
"ε",
"ζ",
"η",
"θ",
"ι",
"κ",
"λ",
"μ",
"ν",
"ξ",
"ο",
"π",
"ρ",
"σ",
"τ",
"υ",
"φ",
"χ",
"ψ",
"ω"
],
"arabic": [
"ا",
"ب",
"ت",
"ث",
"ج",
"ح",
"خ",
"د",
"ذ",
"ر",
"ز",
"س",
"ش",
"ص",
"ض",
"ط",
"ظ",
"ع",
"غ",
"ف",
"ق",
"ك",
"ل",
"م",
"ن",
"ه",
"و",
"ي",
"أ",
"إ",
"آ",
"ى",
"ة",
"ﻉ",
"ﻍ",
"ﺯ",
"ﺱ",
"ﺷ",
"ﺹ",
"ﺽ",
"ﻁ",
"ﻅ",
"ﻋ",
"ﻏ",
"ﻓ",
"ﻗ",
"ﻛ",
"ﻝ",
"ﻡ",
"ﻥ",
"",
"ﻭ",
"ﻳ",
"ـ",
"ﻻ",
"ﻷ",
"ﻹ",
"ﻵ",
"ﻺ",
"",
"ﺏ",
"ﺕ",
"ﺙ",
"ﺟ",
"ﺣ",
"ﺧ",
"ﺩ",
"ﺫ",
"ﺭ",
"ﺯ",
"ﺱ",
"ﺷ",
"ﺹ",
"ﺽ",
"ﻁ",
"ﻇ",
"ﻉ",
"ﻍ",
"ﺑ",
"",
"ﺕ",
"ﺙ",
"ﺝ",
"ﺣ",
"ﺧ",
"ﺩ",
"ﺫ",
"ﺭ",
"ﺯ",
"ﺱ",
"ﺷ",
"ﺹ",
"ﺽ",
"ﻁ",
"ﻇ",
"ﻉ",
"ﻍ",
"ﻑ",
"ﻕ",
"ﻙ",
"ﻝ",
"ﻡ",
"ﻥ",
"",
"ﻭ",
"ﻳ"
],
"extended": [
"À",
"Á",
"Â",
"Ã",
"Ä",
"Å",
"Æ",
"Ç",
"È",
"É",
"Ê",
"Ë",
"Ì",
"Í",
"Î",
"Ï",
"Ð",
"Ñ",
"Ò",
"Ó",
"Ô",
"Õ",
"Ö",
"Ø",
"Ù",
"Ú",
"Û",
"Ü",
"Ý",
"Þ",
"ß",
"à",
"á",
"â",
"ã",
"ä",
"å",
"æ",
"ç",
"è",
"é",
"ê",
"ë",
"ì",
"í",
"î",
"ï",
"ð",
"ñ",
"ò",
"ó",
"ô",
"õ",
"ö",
"ø",
"ù",
"ú",
"û",
"ü",
"ý",
"þ",
"ÿ",
"Ē",
"ē",
"Ĕ",
"ĕ",
"Ė",
"ė",
"Ę",
"ę",
"Ě",
"ě",
"Ĝ",
"ĝ",
"Ğ",
"ğ",
"Ġ",
"ġ",
"Ģ",
"ģ",
"Ĥ",
"ĥ",
"Ħ",
"ħ",
"Ĩ",
"ĩ",
"Ī",
"ī",
"Ĭ",
"ĭ",
"Į",
"į",
"IJ",
"ij",
"Ĵ",
"ĵ",
"Ķ",
"ķ",
"ĸ",
"Ĺ",
"ĺ",
"Ļ",
"ļ",
"Ľ",
"ľ",
"Ŀ",
"ŀ",
"Ł",
"ł",
"Ń",
"ń",
"Ņ",
"ņ",
"Ň",
"ň",
"ʼn",
"Ŋ",
"ŋ",
"Ō",
"ō",
"Ŏ",
"ŏ",
"Ő",
"ő",
"Œ",
"œ",
"Ŕ",
"ŕ",
"Ŗ",
"ŗ",
"Ř",
"ř",
"Ś",
"ś",
"Ŝ",
"ŝ",
"Ş",
"ş",
"Š",
"š",
"Ţ",
"ţ",
"Ť",
"ť",
"Ŧ",
"ŧ",
"Ũ",
"ũ",
"Ū",
"ū",
"Ŭ",
"ŭ",
"Ů",
"ů",
"Ű",
"ű",
"Ų",
"ų",
"Ŵ",
"ŵ",
"Ŷ",
"ŷ",
"Ÿ",
"Ź",
"ź",
"Ż",
"ż",
"Ž",
"ž",
"Ɓ",
"Ƃ",
"ƃ",
"Ƅ",
"ƅ",
"Ɔ",
"Ƈ",
"ƈ",
"Ɖ",
"Ɗ",
"Ƌ",
"ƌ",
"ƍ",
"Ǝ",
"Ə",
"Ɲ",
"ƞ",
"Ɵ",
"Ơ",
"ơ",
"Ơ",
"ơ",
"Ƣ",
"ƣ",
"Ƥ",
"ƥ",
"Ʀ",
"Ƨ",
"ƨ",
"Ʃ",
"ƪ",
"ƫ",
"Ƭ",
"ƭ",
"Ʈ",
"Ư",
"ư",
"Ʊ",
"Ʋ",
"Ƴ",
"ƴ",
"Ƶ",
"ƶ",
"Ʒ",
"Ƹ",
"ƹ",
"ƺ",
"ƻ",
"Ƽ",
"ƽ",
"ƾ",
"ƿ",
"Ǎ",
"ǎ",
"Ǐ",
"ǐ",
"Ǔ",
"ǔ",
"Ǖ",
"ǖ",
"Ǘ",
"ǘ",
"Ǚ",
"ǚ",
"Ǜ",
"ǜ",
"ǝ",
"Ǟ",
"ǟ",
"Ǡ",
"ǡ",
"Ǣ",
"ǣ",
"Ǥ",
"ǥ",
"Ǧ",
"ǧ",
"Ǩ",
"ǩ",
"Ǫ",
"ǫ",
"Ǭ",
"ǭ",
"Ǯ",
"ǯ",
"ǰ",
"Ǵ",
"ǵ",
"Ƕ",
"Ƿ",
"Ǹ",
"ǹ",
"Ǻ",
"ǻ",
"Ǽ",
"ǽ",
"Ǿ",
"ǿ",
"Ȁ",
"ȁ",
"Ȃ",
"ȃ",
"Ȅ",
"ȅ",
"Ȇ",
"ȇ",
"Ȉ",
"ȉ",
"Ȋ",
"ȋ",
"Ȍ",
"ȍ",
"Ȏ",
"ȏ",
"Ȑ",
"ȑ",
"Ȓ",
"ȓ",
"Ȕ",
"ȕ",
"Ȗ",
"ȗ",
"Ș",
"ș",
"Ț",
"ț",
"Ȝ",
"ȝ",
"Ȟ",
"ȟ",
"Ƞ",
"ȡ",
"Ȣ",
"ȣ",
"Ȥ",
"ȥ",
"Ȧ",
"ȧ",
"Ȩ",
"ȩ",
"Ȫ",
"ȫ",
"Ȭ",
"ȭ",
"Ȯ",
"ȯ",
"Ȱ",
"ȱ",
"Ȳ",
"ȳ",
"ȴ",
"ȵ",
"ȶ",
"ȷ",
"ȸ",
"ȹ",
"Ⱥ",
"Ȼ",
"ȼ",
"Ƚ",
"Ⱦ",
"ɀ",
"ɂ",
"Ƀ",
"Ʉ",
"Ʌ",
"Ɇ",
"ɇ",
"Ɉ",
"ɉ",
"Ɋ",
"ɋ",
"Ɍ",
"ɍ",
"Ɏ",
"ɏ",
"ɐ",
"ɑ",
"ɒ",
"ɓ",
"ɔ",
"ƃ",
"ƅ",
"ɖ",
"ɗ",
"ɘ",
"ə",
"ɚ",
"ɛ",
"ɜ",
"ɝ",
"ɞ",
"ɟ",
"ɠ",
"ɡ",
"ɢ",
"ɣ",
"ɤ",
"ɥ",
"ɦ",
"ɧ",
"ɨ",
"ɩ",
"ɪ",
"ɫ",
"ɬ",
"ɭ",
"ɮ",
"ɯ",
"ɰ",
"ɱ",
"ɲ",
"ɳ",
"ŋ",
"ɴ",
"ɵ",
"ɶ",
"ɷ",
"ɸ",
"ɹ",
"ɺ",
"ɻ",
"ɼ",
"ɽ",
"ɾ",
"ɿ",
"ʀ",
"ʁ",
"ʂ",
"ʃ",
"ʄ",
"ʅ",
"ʆ",
"ʇ",
"ʈ",
"ʉ",
"ʊ",
"ʋ",
"ʌ",
"ʍ",
"ʎ",
"ʏ",
"ʐ",
"ʑ",
"ʒ",
"ʓ",
"ʔ",
"ʕ",
"ʖ",
"ʗ",
"ʘ",
"ʙ",
"ʚ",
"ʛ",
"ʜ",
"ʝ",
"ʞ",
"ʟ",
"ʠ",
"ʡ",
"ʢ",
"ʣ",
"ʤ",
"ʥ",
"ʦ",
"ʧ",
"ʨ",
"ʩ",
"ʪ",
"ʫ",
"ʬ",
"ʭ",
"ʮ",
"ʯ"
]
}

1
data/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
mostCommonWords.json

File diff suppressed because one or more lines are too long

1
data/words/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
words.*

5
data/words/schema.sql Normal file
View file

@ -0,0 +1,5 @@
-- Word-frequency table populated by the Rust scanner, one row per
-- (language, word) pair via "INSERT INTO Words(Language, Word, Occurrences)".
-- NOTE(review): no primary key or uniqueness constraint is declared, so
-- duplicate (Language, Word) rows are possible — confirm the writer only
-- inserts each pair once per run.
create table Words(
Language text, -- language code, taken from the dataset directory name
Word text, -- upper-cased word as extracted from the corpus
Occurrences integer -- total number of times the word was seen
);

View file

@ -1,20 +0,0 @@
# Post-processing script: reads the JSON emitted by the Rust pass
# ("./result.json", mapping language -> { letter -> count }), folds each
# lower-case letter into its upper-case form, and prints one sorted
# letter-frequency table per language to stdout.
# NOTE(review): Nim is indentation-sensitive; the leading whitespace here
# appears to have been stripped by the diff rendering, so this text is not
# valid Nim exactly as shown.
import json
import tables
import std/jsonutils
import unicode
# Slurp the whole Rust result file and parse it as JSON.
let rustOutput = readFile("./result.json")
let parsed = parseJson(rustOutput)
for key in parsed.keys:
# One CountTable per language, keyed by Unicode rune.
var outputType : CountTable[Rune]
let bloc = parsed[key]
# Inner `key` shadows the outer language key: here it is a letter string.
for key in bloc.keys:
# Only the first rune of the JSON key is counted.
let letter = toRunes(key)[0]
# Case-fold so lower- and upper-case occurrences are counted together.
let addLetter =
if letter.isLower():
letter.toUpper()
else:
letter
outputType.inc(addLetter, bloc[key].getInt())
# CountTable.sort orders by descending count before printing.
outputType.sort()
echo key
echo outputType

File diff suppressed because one or more lines are too long

171
rust/Cargo.lock generated
View file

@ -37,6 +37,12 @@ dependencies = [
"alloc-no-stdlib",
]
[[package]]
name = "allocator-api2"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f"
[[package]]
name = "android-tzdata"
version = "0.1.1"
@ -173,15 +179,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "brotli"
version = "3.5.0"
name = "bitflags"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d640d25bc63c50fb1f0b545ffd80207d2e10a4c965530809b40ba3386825c391"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor 2.5.1",
]
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "brotli"
@ -191,17 +192,7 @@ checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor 4.0.1",
]
[[package]]
name = "brotli-decompressor"
version = "2.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e2e4afe60d7dd600fdd3de8d0f08c2b7ec039712e3b6137ff98b7004e82de4f"
dependencies = [
"alloc-no-stdlib",
"alloc-stdlib",
"brotli-decompressor",
]
[[package]]
@ -301,6 +292,12 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
@ -313,7 +310,7 @@ version = "24.3.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"rustc_version",
]
@ -354,6 +351,19 @@ name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashlink"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [
"hashbrown",
]
[[package]]
name = "iana-time-zone"
@ -484,32 +494,22 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "libsqlite3-sys"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716"
dependencies = [
"pkg-config",
"vcpkg",
]
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "lz4"
version = "1.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6eab492fe7f8651add23237ea56dbf11b3c4ff762ab83d40a47f11433421f91"
dependencies = [
"libc",
"lz4-sys",
]
[[package]]
name = "lz4-sys"
version = "1.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9764018d143cc854c9f17f0b907de70f14393b1f502da6375dce70f00514eb3"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "lz4_flex"
version = "0.11.3"
@ -632,7 +632,7 @@ dependencies = [
"arrow-schema",
"arrow-select",
"base64",
"brotli 6.0.0",
"brotli",
"bytes",
"chrono",
"flate2",
@ -646,39 +646,16 @@ dependencies = [
"snap",
"thrift",
"twox-hash",
"zstd 0.13.2",
"zstd",
"zstd-sys",
]
[[package]]
name = "parquet-format-safe"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1131c54b167dd4e4799ce762e1ab01549ebb94d5bdd13e6ec1b467491c378e1f"
[[package]]
name = "parquet2"
version = "0.17.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "579fe5745f02cef3d5f236bfed216fd4693e49e4e920a13475c6132233283bce"
dependencies = [
"brotli 3.5.0",
"flate2",
"lz4",
"parquet-format-safe",
"seq-macro",
"snap",
"streaming-decompression",
"xxhash-rust",
"zstd 0.12.4",
]
[[package]]
name = "parquet_thing"
version = "0.1.0"
dependencies = [
"parquet",
"parquet2",
"rusqlite",
"serde_json",
]
@ -712,6 +689,20 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rusqlite"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"
dependencies = [
"bitflags 2.6.0",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
]
[[package]]
name = "rustc_version"
version = "0.4.0"
@ -770,6 +761,12 @@ dependencies = [
"serde",
]
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "snap"
version = "1.1.1"
@ -782,15 +779,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "streaming-decompression"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf6cc3b19bfb128a8ad11026086e31d3ce9ad23f8ea37354b31383a187c44cf3"
dependencies = [
"fallible-streaming-iterator",
]
[[package]]
name = "syn"
version = "2.0.71"
@ -838,6 +826,12 @@ version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
@ -977,12 +971,6 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "xxhash-rust"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "63658493314859b4dfdf3fb8c1defd61587839def09582db50b8a4e93afca6bb"
[[package]]
name = "zerocopy"
version = "0.7.35"
@ -1003,32 +991,13 @@ dependencies = [
"syn",
]
[[package]]
name = "zstd"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
dependencies = [
"zstd-safe 6.0.6",
]
[[package]]
name = "zstd"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
dependencies = [
"zstd-safe 7.2.0",
]
[[package]]
name = "zstd-safe"
version = "6.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
dependencies = [
"libc",
"zstd-sys",
"zstd-safe",
]
[[package]]

View file

@ -7,5 +7,5 @@ edition = "2021"
[dependencies]
parquet = "52.1.0"
parquet2 = "0.17.2"
rusqlite = "0.30.0"
serde_json = "1.0.120"

View file

@ -1 +0,0 @@
target/release/parquet_thing

View file

@ -1,13 +1,16 @@
use std::fs::File;
use std::fs;
use std::sync::{Arc, Mutex};
use std::thread;
use std::{thread, time};
use parquet::file::reader::{FileReader, SerializedFileReader};
use std::collections::HashMap;
use std::collections::HashSet;
use serde_json::Value;
use std::thread::available_parallelism;
use std::fs::DirEntry;
use std::io::prelude::*;
use rusqlite::{Connection, Result};
use std::sync::atomic::{AtomicU32,Ordering};
use std::mem::drop;
fn gen_chars() -> HashSet<char>{
let json = std::fs::read_to_string("../data/alphabets.json").unwrap();
@ -18,75 +21,210 @@ fn gen_chars() -> HashSet<char>{
}
return chars
}
fn do_work(path : &str, chars : &HashSet<char>) -> HashMap<char, u64>{
fn do_work(path : &str, chars : &HashSet<char>, do_word_search : bool) -> (HashMap<String, u64>, HashMap<char, u64>){
let mut map : HashMap<char, u64> = HashMap::new();
let mut word_map : HashMap<String, u64> = HashMap::new();
let file = File::open(path).unwrap();
let reader = SerializedFileReader::new(file).unwrap();
let mut iter = reader.get_row_iter(None).unwrap();
const MAX_WORD_LENGTH : usize = 22;
let skippable_chars : Vec<char> = vec![',', '.', '!', '?', '\n', '\\', '\'', '"', ';', '<', '>'];
while let Some(record) = iter.next() {
if record.is_ok(){
if record.is_err(){ continue; }
let mut array: [char; MAX_WORD_LENGTH] = ['0'; MAX_WORD_LENGTH];
let mut too_long = false;
let mut reading = false;
let mut current_letter = 0;
let columns = record.unwrap().into_columns();
for chary in columns[3].1.to_string().chars(){
if do_word_search {
if
(chary == ' ' || skippable_chars.contains(&chary)) && reading == true &&
too_long == false && current_letter != 0 {
let word = array[0 .. current_letter].iter().map(|x| x).collect::<String>();
let upper_case = &word.to_uppercase().to_owned();
*word_map.entry(upper_case.clone()).or_insert(0) += 1;
reading = false;
current_letter = 0;
};
if chary == ' ' && too_long == true{
array = ['0'; MAX_WORD_LENGTH];
current_letter = 0;
too_long = false;
};
if chars.contains(&chary) && reading == false && too_long == false{
array = ['0'; MAX_WORD_LENGTH];
current_letter = 0;
reading = true;
};
if (chary != ' ') && reading == true && too_long == false {
if (current_letter == MAX_WORD_LENGTH) || !chars.contains(&chary) {
too_long = true;
reading = false;
} else{
array[current_letter] = chary;
current_letter+=1;
};
}
}
if chars.contains(&chary){
*map.entry(chary).or_insert(0) += 1;
}
};
}
}
}
return map;
return (word_map, map);
}
fn file_to_string(a : DirEntry) -> String{
let path = a.path().into_os_string();
return path.clone().to_str().unwrap().to_string();
}
fn path_to_language(a : &String) -> String {
let start = a.find("1.").unwrap();
let end = a.find("train").unwrap();
return a[start+2 .. end-1].to_owned().clone()
}
fn main(){
//These languages don't have spaces and thus flood memory, and they don't make any sense to analyze in this way
let blacklisted_languages: Vec<String> =
vec![
"simple".to_string(), "ja".to_string(), "th".to_string(),
"la".to_string(), "zh".to_string(), "zh-yue".to_string(),
"ko".to_string(), "lo".to_string(), "km".to_string(),
"zh-min-nan".to_string(), "zh-classical".to_string()
];
let cpu_count = available_parallelism().unwrap().get()*2;
let languages : Vec<_> = std::fs::read_dir("../../downloads/wikimedia_wikipedia/").unwrap().collect();
let mut paths : Vec<_> = vec![];
for language in languages{
let dir = std::fs::read_dir(language.unwrap().path());
if dir.is_err(){continue;}
for file in dir.unwrap() {
paths.push(file.unwrap().path());
paths.push(file_to_string(file.unwrap()));
}
}
let result : Arc<Mutex<HashMap<String, HashMap<char, u64>>>> = Arc::new(Mutex::new(HashMap::new()));
let blacklist_shared : Arc<Vec<String>> = Arc::new(blacklisted_languages);
let character_count_result : Arc<Mutex<HashMap<String, HashMap<char, u64>>>> = Arc::new(Mutex::new(HashMap::new()));
let db_lock : Arc<Mutex<bool>> = Arc::new(Mutex::new(false));
let languages : HashSet<_> = paths.iter().map(|x| path_to_language(x)).collect();
println!("{:?}", languages);
let mut threads = Vec::new();
let chunked : Vec<_> = paths.chunks(cpu_count).map(|chunk| chunk.to_vec()).collect();
let shared_chunks = Arc::new(chunked);
for cpu in 0 .. cpu_count{
let chunk = Arc::clone(&shared_chunks);
let cloned_result = result.clone();
let mut language_paths : HashMap<String, Vec<String>> = HashMap::new();
for lang in &languages{
let pathy = paths.clone();
let languages : Vec<String> = pathy.iter().filter(|x| path_to_language(x) == lang.clone()).cloned().collect();
language_paths.insert(lang.clone(), languages);
}
//maps lang -> bool for blocking while executing
let thread_exit_scheduler : Arc<Mutex<HashMap<String, bool>>> = Arc::new(Mutex::new(HashMap::new()));
for lang in &languages{
thread_exit_scheduler.lock().unwrap().insert(lang.clone(), false);
}
let lang_paths : Arc<HashMap<String, Vec<String>>> = Arc::new(language_paths);
let counter = Arc::new(AtomicU32::new(0));
for lang in languages{
let paths = Arc::clone(&lang_paths);
let b_lang = Arc::clone(&blacklist_shared);
let character_result_clone = Arc::clone(&character_count_result);
let dblock = Arc::clone(&db_lock);
let t_e_s_c = Arc::clone(&thread_exit_scheduler);
//Should you not limit the threads, it will fill up your memory and break
let count = Arc::clone(&counter);
threads.push(thread::spawn(move || {
while count.load(Ordering::Relaxed) > cpu_count.try_into().unwrap(){
let skip = time::Duration::from_millis(200);
thread::sleep(skip);
println!("blocking still, {}", count.load(Ordering::Relaxed));
}
count.fetch_add(1, Ordering::Relaxed);
//sets the current lang to running
let t_e_s = Arc::clone(&t_e_s_c);
let mut t_e_s_handler = t_e_s.lock().unwrap();
*t_e_s_handler.get_mut(&lang).unwrap() = true;
//drops the handler so its no longer locking
std::mem::drop(t_e_s_handler);
let chars = gen_chars();
let chunk = &chunk[cpu];
for path in chunk{
let path_str = path.to_str().unwrap();
if ! path_str.contains("parquet"){
continue
let mut char_occurrences : HashMap<char, u64> = HashMap::new();
let mut word_occurrences : HashMap<String, u64> = HashMap::new();
let paquet_paths = paths.get(&lang).unwrap();
let do_words = !b_lang.iter().any(|x| x == &lang);
for path in paquet_paths.iter(){
if path.find(".parquet").is_none(){
println!("{}", path);
continue;
};
let result = do_work(path, &chars, do_words);
for (key,val) in result.0.into_iter(){
*word_occurrences.entry(key).or_insert(0) += val;
}
let result = do_work(path_str, &chars);
let path_split : Vec<_> = path_str.split("/").collect();
//e.g ../{date}.{lang} / {n}.parquet
let language = path_split[path_split.len()-2].split(".").last().unwrap().to_string();
let mut map = cloned_result.lock().unwrap();
if !map.contains_key(&language){
map.insert(language, result);
}
else{
for char in &chars{
if ! result.contains_key(char) {continue;}
*map.get_mut(&language).unwrap().entry(*char).or_insert(0) += result.get(char).unwrap();
}
for (key,val) in result.1.into_iter(){
*char_occurrences.entry(key).or_insert(0) += val;
}
}
}));
let mut word_occurence_handler = character_result_clone.lock().unwrap();
word_occurence_handler.insert(lang.clone(), char_occurrences);
std::mem::drop(word_occurence_handler);
let mut query_builder : Vec<u8> = Vec::new();
query_builder.write_all(b"INSERT INTO Words(Language, Word, Occurrences) VALUES ").unwrap();
for (key,val) in word_occurrences.clone().into_iter(){
let query_sgement = format!("('{}','{}',{}),", lang, key, val);
query_builder.write_all(query_sgement.as_bytes()).unwrap();
}
let last = query_builder.len()-1;
query_builder[last] = b";"[0];
let db_insert = String::from_utf8(query_builder).unwrap();
let dblocked = dblock.lock().unwrap();
let connection = Connection::open("../data/words/words.db").unwrap();
connection.execute_batch("PRAGMA journal_mode = wal; PRAGMA synchronous = extra;").unwrap();
let potential_pain = connection.execute(&db_insert, ());
if potential_pain.is_err(){
println!("LANG {} FAILED, DATA: {}", lang, db_insert);
}
std::mem::drop(db_insert);
connection.execute_batch("PRAGMA analysis_limit=400; PRAGMA optimize").unwrap();
connection.close().unwrap();
//frees the thread from blocking another lang from starting.
count.fetch_sub(1, Ordering::Relaxed);
}))
}
for thread in threads {
for thread in threads{
thread.join().unwrap();
}
let value = serde_json::to_value(&*result).unwrap();
let value = serde_json::to_value(&*character_count_result).unwrap();
let output = serde_json::to_string(&value).unwrap();
let mut file = File::create("../data/result.json").unwrap();
file.write_all(output.as_bytes()).unwrap();
}