This commit is contained in:
retkid 2022-09-02 12:09:27 -04:00
commit c06a04ed72
15 changed files with 1275 additions and 0 deletions

101
README.md Normal file
View file

@ -0,0 +1,101 @@
# ArbitraryFileVideoEncoder
# What is this?
This project is intended to allow you to encode files to videos, and then upload those videos to anywhere,
and use them in the same way you would files.
Simple concept, and before encoding, perfectly doable, easily.
Example output:
![exampleoutput](https://cdn.discordapp.com/attachments/1003008044474564618/1015264426414329906/in0.png)
# Machine Learning
A machine learning training method for reducing transmission corruption was made in training.nim. Due to limitations in arraymancer currently with serialization, the ability to load the model is very buggy and thus not implemented.
# Encoding Standard
The encoding is relatively simple right now.
The first 256 blocks are individual colors, they are used as the key for the image, 0 - 255 respectively.
Each time an image of that color is identified, it can be referenced with its place in the first 256.
This means the data retention is dependent on how distinct each individual color can be between each other, and how well they can be preserved to look like each other.
# Serialization standard
Currently debug data can be saved into binary files via tensorCeral. The .bin encoding is in tensorCeral.nim and works like this:
The first 9 values are encoded as 32-bit unsigned ints (36 bytes in total)
[Dimension X, Dimension Y, The length of the following array [as a redundancy]] * 3
The following data, until the end of the file is data in bytes, equal to the dimensions first specified.
# Usage
- Implementation
Used to decode a .bin file to an output, has graphing when compiled with different settings, and an interactive pager. To get graph outputs use -d:colordebug -d:graph, and for pager -d:pager
note output can be "" or "-" for stdout
use ./implementation [input.bin] [output] [originalfile]
for training and cl help, they have built in help messages
- TensorCeral, its purposes are only used within LDPC
an example usage to test cl encoding is:
```bash
./cl -e yourFile.zip outFile.bin bestpallet.plte; ./implementation outFile.bin yourFileClone.zip
```
# Unit-Tests
Some unit tests have been implemented in unit-test.nim
If everything returns positive, then everything should be working
# What is bestpallet.plte?
It is the greatest pallet I have generated randomly, used mostly as a reference and a starting ground for all comparisons.
# Corruption
Compressing these files then using a simple color comparison algorithm is going to cause transmission data corruption, which can be pretty significant unless you have a good form of data redundancy algorithm.
Currently with the included pallet, we can get around ~2.7% - 6% corruption; that is the pallet I used in the example image. I included some basic statistical sorting, but it only increases corruption, and LDPC codes seem to make it skyrocket.
2.7% is not enough to maintain most corruption-redundant data formats' integrity, and thus is insufficient to be used for practical purposes.
# Future
In the future, I would like to include some very important stuff
1. R statistical systems, to better identify the incorrect colors
2. Custom LDPC codes, this is the key
3. Hand made color pallets
4. Refine machine learning into a classification algorithm feedback through implementation
Once we can achieve data corruption rates low enough for an archive to survive 'transmission' we can start to look at more complex data structures.
# Graphs
The graph output of trainingdata. This graph shows various models at different points in their development, showing their accuracy and efficiencies.
![trainingdata](https://media.discordapp.net/attachments/1003008044474564618/1015256595061555220/visual.png?width=1057&height=561)
This is a graph output of implementation, showing the inefficiencies of the current systems:
![implementation](https://media.discordapp.net/attachments/1003008044474564618/1015256595485175808/implementation.png?width=1037&height=561)
# Special credits
All of the following friends and family helped me throughout the time I worked on this. No technical help was given.
Cassie
Morgan
Max
Lonk
Brungo
Waka
Microgravity
Ilrasso
Ela
Jacko
JDL

Binary file not shown.

After

(image error) Size: 118 KiB

BIN
archivedata/visual.png Normal file

Binary file not shown.

After

(image error) Size: 90 KiB

BIN
bestpallet.plte Normal file

Binary file not shown.

298
cl.nim Normal file
View file

@ -0,0 +1,298 @@
import strutils
import sugar
import sequtils
import math
import arraymancer
import random
import itertools
import simplepng
import nimPNG
import system
import times
import tables
import streams
import os
import json
import osproc
import tensorCeral
import strformat
proc itPad(pad, width: int, start = 0) : iterator() : seq[(int, int)] {.gcsafe.} =
  ## Returns a closure iterator that walks a `width`-wide square image in
  ## (pad+1) x (pad+1) blocks, column by column, yielding each block's
  ## (row, col) coordinate pairs as one seq.
  ## `start` is accepted for interface compatibility but is unused.
  let step = pad + 1
  return iterator(): seq[(int, int)] =
    for x in countup(pad, width, step):
      for y in countup(pad, width, step):
        var coords = newSeqOfCap[(int, int)](step * step)
        for row in (y - pad) .. y:
          for col in (x - pad) .. x:
            coords.add((row, col))
        yield coords
proc genColorMatrix*() : Tensor[byte] =
  ## Builds a random 256-entry color palette as a 256x3 (RGB) byte tensor.
  ## Seeded from the CPU clock so successive runs differ.
  randomize(cpuTime().int64)
  var flat = newSeq[byte](256 * 3)
  for i in 0 ..< flat.len:
    flat[i] = byte(rand(0 .. 255))
  result = flat.toTensor().reshape(256, 3)
proc writeRandomImage(width, height : int, path : string) =
##Writes a width x height PNG of random colors to `path` (debug helper).
# NOTE(review): chunky holds exactly 256 RGB triples, so this assumes
# width*height <= 256 — confirm before calling with larger canvases.
var colorTensor = genColorMatrix()
var p = initPixels(width, height)
# flatten the tensor, then regroup into 3-byte RGB chunks
let chunky = collect(for x in chunked(collect(for x in colorTensor: x), 3): x)
var n = 0
for color in p.mitems:
let
r = chunky[n][0]
g = chunky[n][1]
b = chunky[n][2]
color.setColor(r, g, b, 255)
n+=1
simplePNG(path, p)
proc fromPng*(input : string, pad = 0, total : int, multiple = true) : Tensor[byte]=
## Decodes PNG frames back into a block tensor.
## Input = folder which it is in. eg ./out/0/
## uses input/1.png as a benchmark for resolution; it is intended to be used within the confines of the rest of the code
## total = the total amount of files you want to scan
## if you want to use just one, multiple = false
## Returns a tensor of data, organized by blocks. First block, second block, third block, etc
##
var png : PNGResult[string]
var size : int
if multiple:
png = loadPNG32(&"{input}1.png")
size = png.width
else:
png = loadPNG32(input)
size = png.width
var filesRead = 0
var final : seq[byte]
proc decode(x : seq[char]) : seq[byte] =
#converts one RGBA pixel (4 chars) to bytes via hex, then drops the alpha channel
return x.map(x=>($x).toHex()).map(x=>byte(parseHexInt(x)))[0 .. 2]
proc flatten(a : seq[seq[byte]]) : seq[byte] =
for x in a:
result.add(x)
#png.data is RGBA per pixel; decode each pixel, flatten, then tensor the data
#back to the "original" form
if pad == 0:
# no padding: one pixel per data byte triple, size^2 rows of RGB
return collect(for x in chunked(png.data,4): decode(x)).flatten().toTensor().reshape(size^2, 3)
else:
#it iterates to the end amount of files given in total
#if multiple is false it'll only read the single input file
proc decodeImage(png : PNGResult[string], final : var seq[byte]) =
doAssert(png.height == size, "NON SQUARE SHAPE PADS NOT SUPPORTED")
#IO operation to organize
#one line is RGBA * vertical pad length, to get each block
for bigchunk in chunked(png.data, (size*4)*(pad+1)):
#organize this into horizontal length blocks
#used as a vertical iterator
let start = bigchunk.distribute(pad+1, spread=false)
for x in countup(0, start[0].high, ((pad+1)*4)):
for y in 0 .. start.high:
#this seems inefficient, i should look at this again..
#later.
# slice one block row, then strip every 4th byte (the alpha channel)
var a = start[y][x .. (((pad+1)*4)+x)-1]
for delete in countup(3, a.high, 4): a.delete(delete-(int((delete+1)/4)-1))
final.add(a.map(x=>x.byte))
if multiple:
for file in 1 .. total:
let png = loadPNG32(input & $file & ".png")
decodeImage(png, final)
filesRead+=1
else:
decodeImage(png, final)
filesRead = 1
# mody = bytes per block; test = total number of blocks across all frames
let mody = (((pad+1)^2)*3)
let test = int(((size^2)*3)/mody)*(filesRead)
return final.toTensor().reshape(test,mody)
iterator writeMultipleFiles(file : Stream, size, filesize, pad : int, outdir : string, reference : ref seq[byte]) : (proc(a: (byte,byte,byte)){.gcsafe.}, byte) {.gcsafe.} =
## Walks `file` one byte at a time and, for each byte, yields a pair of
## (painter closure, data byte). The caller picks the palette color for the
## byte and invokes the closure, which paints the current (pad+1)^2 block of
## the in-flight image and records the RGB into `reference`.
## The first 256 blocks of the stream encode the palette key bytes 0..255.
## Completed frames are written as PNGs to outdir/inN.png.
let byteSize = (pad+1)^2
# frames needed for the payload plus the 256-byte palette key, one block per byte
let imagesNumber = (((filesize+256) * byteSize) / size^2).ceil().int
var written = 0
#threading this would be so simple, thats kinda the point of the design.
#previous form was a nightmare
while true:
var cbyte : byte
var starty = 0
for imageIndex in 1 .. imagesNumber:
var image = initPixels(size,size)
var it = itPad(pad, size)
for current in it():
if file.atEnd():
break
if starty < 256:
# first 256 blocks: emit the palette key bytes 0..255
cbyte = byte(starty)
starty.inc()
else:
cbyte = file.readChar.byte
yield ((proc(input : (byte,byte,byte)) {.gcsafe.} =
for x in current:
image[x[0],x[1]].setColor(input[0],input[1],input[2], 255)
reference[].add(input[0])
reference[].add(input[1])
reference[].add(input[2])
),
cbyte)
simplePNG(outdir & "/in" & $written & ".png", image)
written.inc()
break
proc rowToArray(a : Tensor[byte]) : (byte,byte,byte) {.gcsafe.} =
  ## Pulls the RGB triple out of the first row of a palette-row tensor.
  let r = a[0, 0]
  let g = a[0, 1]
  let b = a[0, 2]
  result = (r, g, b)
proc encodeImage*(pad = 0, path : int, inputFile : string | Stream, pltePath : string | Tensor[byte]) : (Tensor[byte], Tensor[byte]) {.gcsafe.} =
## Encodes `inputFile` into 600x600 PNG frames under in/<path>/ using the
## palette from `pltePath` ("" generates a random one).
## Returns (palette tensor, reference tensor of every RGB actually painted).
var refout = new seq[byte]
##not for release
#let rand = genColorMatrix()
#temp debug matrix
var colorPallet : Tensor[byte]
when pltePath is string:
if pltePath == "":
colorPallet = genColorMatrix()
else:
colorPallet = deSerializeColorPallate(newFileStream(pltePath))
else:
colorPallet = pltePath
# fixed frame resolution; tovideo.sh also hard-codes 600x600
let size = 600
var file : Stream
var fileSize : BiggestInt
when inputFile is string:
if inputFile == "" or inputFile == "-":
# "" / "-" means read the payload from stdin
while sizeof(stdin) == 0:
continue
file = newStringStream(newFileStream(stdin).readAll())
#dirty code, I don't know how else to get the proper size of stdin
fileSize = file.readAll().len()
file.setPosition(0)
else:
file = (newFileStream(inputFile))
fileSize = getFileSize(inputFile)
else:
file = inputFile
for buff in writeMultipleFiles(file, size, fileSize.int, pad, "in/" & $path, refout):
#wMF yields a function to apply a color to a block, and a byte of data
#this system allows for a greater flexibility of color choices and
#potential future complexity if needed
let row = rowToArray(colorPallet[buff[1].int, 0 .. 2])
buff[0](row)
# mody = bytes per block; reshape the painted reference into block rows
let mody = (((pad+1)^2)*3)
return (colorPallet, refout[].toTensor().reshape(int(refout[].len()/mody) , mody))
# proc decodeImagePerfect(pad : int, simple = true, path : string) {.gcsafe.} =
#to be programmed
proc outMatrix(pad : int, simple = true, path : int, multiple = true) : Tensor[byte]{.gcsafe.} =
  ## Thin wrapper around `fromPng`: decodes every frame ffmpeg wrote to
  ## out/<path>/ back into a block tensor. `simple` is currently unused.
  let frameDir = "out/" & $path
  var frameCount = 0
  for _ in os.walkDir(frameDir):
    inc frameCount
  result = fromPng(frameDir & "/out", pad, frameCount, multiple)
type
train* = ref object of RootObj
## Bundle of serialized tensors for one training sample:
## each field is (flat byte data, original tensor shape metadata).
pre* : (seq[uint8], Metadata)
post* : (seq[uint8], Metadata)
colors* : (seq[uint8], Metadata)
proc convertFrames*(input : (int, string, string | Stream, bool, string)) {.gcsafe, thread.}=
  ## The main CL convert proc, everything put together: encodes a file into
  ## PNG frames, runs tovideo.sh (encode + re-decode through H.264), reads
  ## the surviving frames back, and optionally serializes the tensors.
  ## input[0] = index of the working dirs 'in/x' and 'out/x'
  ## input[1] = output .bin path ("" routes the serialized data to stdout)
  ## input[2] = a stream containing encoding data or a path string to it
  ## input[3] = writey; false if you don't wish to write the cerial output
  ## input[4] = path of a .plte palette file, keep blank to generate one
  let
    doWrite = input[3]
    inputFile = input[2]
    path = input[0]
    pltePath = input[4]
  let file = "in/" & $input[0]
  # BUGFIX: the working path is a directory, so probe with dirExists —
  # fileExists is always false for directories, so stale dirs were kept.
  if dirExists(file):
    removeDir(file)
  discard existsOrCreateDir(file)
  let one = encodeImage(9, path, inputFile, pltePath)
  discard os.execShellCmd("./tovideo.sh " & $input[0])
  # BUGFIX: read back this job's frames (out/<path>), not hard-coded job 0.
  let outy = outMatrix(9, true, path)
  if doWrite:
    var outsize = [
      one[0].shape, one[1].shape, outy.shape
    ]
    var writtey = [toFlatSeq(one[0]), toFlatSeq(one[1]), toFlatSeq(outy)]
    var outputFile : FileStream
    if input[1] == "":
      outputFile = newFileStream(stdout)
    else:
      outputFile = newFileStream(input[1], fmWrite)
    serializeTensor(outputFile, outsize, writtey)
# module-level copy of the CLI arguments; {.global.} so it survives reuse in procs
var params {.global.} = commandLineParams()
proc printHelp() =
## Writes the CLI usage text to stderr and terminates with exit code 1.
stderr.writeline "This program encodes data!"
stderr.writeline "use -h to display this!"
stderr.writeline "-e to encode a file:"
stderr.writeline "    cl -e [filetoencode] [tensorOutPath.bin] [pallet.plte]"
stderr.writeline "    to use in pipe mode: cl -e - -"
stderr.writeline "    to note release tensors: cl [input] x"
stderr.writeline "-rf to encode a file forever, generating new pallets"
stderr.writeline "    -cl -rf [file]"
quit(1)
when isMainModule:
  # Channel allocated for future cross-thread reporting; only opened here.
  var channel = createShared(Channel[train], sizeof(Channel[train]))
  channel[].open()
  # next free index for trainingdata/<n>.bin outputs
  var files = (collect(for x in os.walkdir("./trainingdata"): x).len())
  # pad params with "" so the positional accesses below never raise
  params.setLen(4)
  if params[0] == "-e":
    let fileToEncode = params[1]
    let tensorOutPath = params[2]
    let pltePath = params[3]
    # output path "x" means "do not write the serialized tensors"
    let doWrite = tensorOutPath != "x"
    if params[1] == "" and params[2] == "":
      printHelp()
    convertFrames((0, tensorOutPath, fileToEncode, doWrite, pltePath))
  elif params[0] == "-rf":
    let fileToEncode = params[1]
    if fileToEncode == "":
      printHelp()
    var cores = countProcessors()
    var trainThreads = newSeq[Thread[(int, string, string, bool, string)]](cores)
    while true:
      # BUGFIX: the original iterated `0 .. cores`, indexing one past the
      # end of trainThreads (which has exactly `cores` slots) every round.
      for x in 0 ..< cores:
        createThread(trainThreads[x], convertFrames, (x, &"trainingdata/{files}.bin", fileToEncode, true, ""))
        files+=1
      joinThreads(trainThreads)
  elif params[0] == "-h":
    printHelp()

1
cl.nims Normal file
View file

@ -0,0 +1 @@
--threads:on

1
compile.sh Executable file
View file

@ -0,0 +1 @@
# Compile a Nim source with maximum optimizations and cblas-backed BLAS.
# "$1" is quoted so source paths containing spaces survive word splitting.
nim c -d:danger -d:release -d:blas=cblas "$1"

20
encodeLDPC.sh Normal file
View file

@ -0,0 +1,20 @@
#!/bin/sh
set -e # Stop if an error occurs
# Build a 5000x10000 LDPC parity-check matrix and its dense generator.
./ldpc/make-ldpc ./ldpc/ex-ldpcvar-5000a.pchk 5000 10000 2 evenboth 3 no4cycle
./ldpc/make-gen ./ldpc/ex-ldpcvar-5000a.pchk ./ldpc/ex-ldpcvar-5000a.gen dense
# Encode: file -> textual bits -> LDPC codewords -> bytes -> video frames.
# BUGFIX: the palette shipped with the repo is bestpallet.plte; the original
# pipeline referenced a non-existent bestpallete.plte.
./tensorCeral -b "$1" \
| ./ldpc/encode ./ldpc/ex-ldpcvar-5000a.pchk ./ldpc/ex-ldpcvar-5000a.gen - - \
| tr --delete '\n' \
| ./tensorCeral -e - - \
| ./cl -e - ./temp.bin bestpallet.plte
# Decode: frames -> bits -> LDPC extract -> bytes, then diff vs the input.
./implementation ./temp.bin \
| ./tensorCeral -b \
| ./ldpc/extract ./ldpc/ex-ldpcvar-5000a.gen - - \
| tr --delete '\n' \
| ./tensorCeral -e > out.clone
./tensorCeral -d "$1" out.clone
rm temp.bin

304
implementation.nim Normal file
View file

@ -0,0 +1,304 @@
import strformat
import tensorCeral
import streams
import arraymancer
import std/stats
import math
import sequtils
import sugar
import os
import system
import algorithm
import terminal
import illwill
import nimpy
import tables
illwillInit(fullscreen = false)
type data = ref object of RootObj
  ## Debug record for a single decoded byte.
  originalColor : int             # the byte the encoder actually wrote
  inputdata : int                 # row index of this block in the decoded tensor
  outputChoices : seq[(int,int)]  # (distance, byte) candidates, best first
  distanceFromCorrct : int        # how far down the list the correct byte sat
  stats : RunningStat             # running stats over the candidate distances
proc isCorrect(a : data) : bool =
  ## True when the top-ranked candidate matches the original byte.
  a.outputChoices[0][1] == a.originalColor
# Navigation commands for the interactive statistics pager.
type commands = enum
Forward, ForwardGood, ForwardBad, BackwardGood, BackwardBad, Backward, Break, Graph
proc getInput() : commands =
##Blocking keyboard input for the statistic Viewer (illwill key polling).
##Escape quits the whole program; unmapped keys are ignored.
while true:
var key = getKey()
case key
of Key.None: discard
of Key.Escape: quit()
of Key.Enter:
return Forward
of Key.E:
# leftover debug echo; prints before jumping to the next bad byte
echo "e"
return ForwardBad
of Key.Q:
return BackwardBad
of Key.A:
return BackwardGood
of Key.D:
return ForwardGood
of Key.Space:
return Backward
of Key.X:
return Break
of Key.W:
return Graph
else:
continue
func datainit(a : int, c : int, stats : RunningStat, output : seq[(int,int)]) : data =
  ## Builds a debug record for one decoded byte.
  ## a = the original byte, c = row index in the decoded tensor,
  ## output = candidate (distance, byte) pairs ordered best first.
  ## distanceFromCorrct = index of the correct byte within `output`.
  ## BUGFIX: the original loop ended with an unconditional `break`, so the
  ## recorded distance could never exceed 1 regardless of the real rank.
  var correct = 0
  for choice in output:
    if choice[1] == a:
      break
    inc correct
  result = data(
    originalColor : a,
    inputdata : c,
    outputChoices : output,
    distanceFromCorrct : correct,
    stats : stats
  )
proc printData(data : data, cerial : (Tensor[float32], Tensor[float32], Tensor[float32])) =
##Prints one byte's debug record for the statistics viewer.
##cerial = (palette tensor, original encoding tensor, decoded output tensor).
eraseScreen()
echo "#"
echo &"The ordered Choices {data.outputChoices}"
echo ""
echo &"Correct Byte {data.originalColor}"
echo ""
echo &"Byte Returned {data.outputChoices[0][1]}"
echo ""
echo &"input data {cerial[2][data.inputdata, _]}"
echo ""
echo &"correct reference color {cerial[0][data.originalColor, _]}"
echo ""
echo &"output's reference color {cerial[0][data.outputChoices[0][1], _]}"
echo ""
echo &"idk: {data.stats}"
echo data.originalColor
echo data.outputChoices[0][1]
echo "#"
var plt = pyImport("matplotlib.pyplot")
proc getMeanValues(a : seq[float32]) : seq[int]=
  ## Splits `a` into its three interleaved channels (R,G,B at indices
  ## 0,1,2 mod 3) and returns [meanR, meanG, meanB, kurtR, kurtG, kurtB]
  ## converted to ints.
  ## BUGFIX: the original used `kurtosis` for channels 1 and 2 but
  ## `kurtosisS` (sample kurtosis, NaN for n < 4) for channel 3; the
  ## population `kurtosis` is now used consistently for all channels.
  var chan : array[3, RunningStat]
  for i in 0 ..< a.len:
    chan[i mod 3].push a[i]
  result = @[
    chan[0].mean.int, chan[1].mean.int, chan[2].mean.int,
    chan[0].kurtosis.int, chan[1].kurtosis.int, chan[2].kurtosis.int
  ]
proc getByte(input : seq[float32], reference : seq[seq[int]]) : (char, seq[(int,int)], seq[int]) =
##Gets the most likely byte from one decoded block, by squared-distance
##against each of the 256 palette reference rows.
##currently debug and has more inputs and outputs than it needs to:
##returns (best byte, all (distance, byte) candidates, block channel means)
# lowestdif = [best distance so far, its palette index]; 10000 is a sentinel
# larger than any expected squared color distance
var lowestdif = @[10000,-1]
let mean = getMeanValues(input)
var tempDebug : seq[(int, int)]
for x in 0 .. reference.high:
var totaldif = 0
# squared euclidean distance across the three channel means
totaldif = abs(mean[0]-reference[x][0])^2+abs(mean[1]-reference[x][1])^2+abs(mean[2]-reference[x][2])^2
if totaldif < lowestdif[0]:
lowestdif[1] = x
lowestdif[0] = totaldif
tempDebug.add((totaldif, x))
return (char(lowestdif[1]), tempDebug, mean)
proc correlateBadBytes(a : seq[ptr seq[(int,int)]]) : Table[int, Table[int,int]] =
  ## For every mis-decoded block's candidate list, tallies how often each
  ## runner-up byte appeared, keyed by the winning (wrong) byte and then
  ## by the second choice.
  for choices in a:
    let winner = choices[][0][1]
    let second = choices[][1][1]
    if winner notin result:
      result[winner] = initTable[int, int]()
    result[winner][second] = result[winner].getOrDefault(second, 0) + 1
proc getGraphData(a: Table[int, Table[int,int]]) : (seq[int], seq[int], seq[int]) =
  ## Flattens the nested tally into three parallel seqs of
  ## (outer key, inner key, count) — ready for a scatter plot.
  for outerKey, inner in a.pairs:
    for innerKey, count in inner.pairs:
      result[0].add outerKey
      result[1].add innerKey
      result[2].add count
proc getBestBytes*(inputstream : Stream, output : Stream) : ((Tensor[float32],Tensor[float32],Tensor[float32]), seq[seq[int]]) =
##Takes a cerial inputstream, and a stream to output the decoded bytes to.
##Gets the most likely byte choice for every data block in the stream.
##Currently very simple, can get sub 3% corruption if lucky.
##Returns the deserialized tensors and the 256 palette reference rows.
stderr.writeLine("enter!")
let cerial = deSerializeTensors(inputstream)
var reference : seq[seq[int]]
# rows 0..255 of the decoded tensor carry the palette key blocks
for x in 0 .. 255:
let seqn = cerial[2][x, _].toFlatSeq()
reference.add getMeanValues(seqn)
let max = cerial[2].shape[0]-1
for x in 256 .. max:
# a near-black row means we ran past the payload into empty blocks
if cerial[2][x, 0 .. 2].toFlatSeq().foldl((a+b)) < 10:
break
var chary = getByte((cerial[2][x, _].toFlatSeq()), reference)
chary[1].sort((a,b)=>cmp(a[0], b[0]))
output.write chary[0]
return (cerial, reference)
when isMainModule:
## CLI: ./implementation [input.bin] [output] [originalfile]
## -d:colordebug enables corruption analysis, -d:graph plots it,
## -d:pager opens the interactive statistics viewer.
stderr.writeLine("enter!")
var params = commandLineParams()
echo params
var input = newFileStream(params[0])
var output : FileStream
# "" or "-" routes the decoded bytes to stdout
if ["", "-"].contains(params[1]):
output = newFileStream(stdout)
else:
output = newFileStream(params[1], fmReadWrite)
var (cerial, reference) = getBestBytes(input, output)
if defined(colordebug):
# re-decode every block and compare against the original file byte by byte
echo params[2]
var old = newFileStream(params[2], fmRead)
var outputDebugData : seq[data]
let max = cerial[2].shape[0]-1
output.setPosition(0)
for x in 256 .. max:
if cerial[2][x, 0 .. 2].toFlatSeq().foldl((a+b)) < 10:
break
var chary = getByte((cerial[2][x, _].toFlatSeq()), reference)
let originalByte = byte(old.readChar())
chary[1].sort((a,b)=>cmp(a[0], b[0]))
var stats : RunningStat
for x in chary[1]:
stats.push x[0]
outputDebugData.add datainit(originalByte.int, x, stats, chary[1])
old.setPosition(0)
# count mismatches between the original file and the decoded output
var dif = 0
var total = 0
var incorrect : seq[ptr seq[(int,int)]]
while not old.atEnd:
let oldint = old.readChar().int
let newint = output.readChar().int
if newint != oldint:
dif+=1
incorrect.add(addr outputDebugData[total].outputChoices)
total += 1
if defined(graph):
var dirtyIncorrect : seq[ptr seq[(int,int)]]
var dirtyTemp : seq[seq[(int,int)]]
proc getAccuracyDecay(a : seq[(int,int)]) : float =
# mean percentage growth across the first three candidate distances
return collect(for y in 0 .. 2: (a[y+1][0] / a[y][0])*100).foldl(a+b)/3.float
for x in outputDebugData:
let output = getAccuracyDecay(x.outputChoices)
# 300 is an empirical threshold isolating suspicious flat decays;
# 202/173 is a specific frequently-confused byte pair under study
if output < 300:
dirtyIncorrect.add(addr x.outputChoices)
if x.outputChoices[0][1] == 202 and x.outputChoices[1][1] == 173:
dirtyTemp.add(x.outputChoices)
var dirtyGraph = getGraphData(correlateBadBytes(dirtyIncorrect))
let fig = plt.subplots(1,2,figsize=(1,2))[1]
var pure = correlateBadBytes(incorrect)
var graph = getGraphData(pure)
let totalErrors = graph[2].foldl((a+b))
let newErrors = dirtyGraph[2].foldl((a+b))
discard fig[0].set_ylabel("Original Byte")
discard fig[0].set_xlabel("Mistaken Byte")
discard fig[0].legend(loc="upper left")
discard fig[1].set_ylabel("Original Byte")
discard fig[1].set_xlabel("Mistaken Byte")
discard fig[1].legend(loc="upper left")
discard fig[0].set_title(&"Before statistical isolation of errors:\n {totalErrors}")
discard fig[1].set_title(&"After statistical isolation of errors: \n {newErrors} \n {(newERRORS / totalErrors)* 100}% increase")
discard fig[0].scatter(graph[0], graph[1], graph[2])
discard fig[1].scatter(dirtyGraph[0], dirtyGraph[1], dirtyGraph[2])
discard plt.show()
let purey = outputDebugData.filter(x=>x.outputChoices[0][1] == 202 and x.outputChoices[1][1] == 173 and x.originalColor != 202).map(x=>x.outputChoices)
echo purey.high
echo dirtyTemp.high
var stat1 : RunningStat
for x in purey:
stat1.push getAccuracyDecay(x)
echo stat1
stat1.clear()
for x in dirtyTemp:
stat1.push getAccuracyDecay(x)
dirtyTemp = dirtyTemp.filter(x=>purey.contains(x))
echo stat1
stat1.clear()
for x in dirtyTemp:
stat1.push getAccuracyDecay(x)
echo stat1
# interactive pager over the per-byte debug records
var print = true
var pos = 0
while defined(pager):
if print:
printData(outputDebugData[pos], cerial)
else:
print = true
let command = getInput()
case command:
of Break: break
of Forward:
if outputDebugData.high > pos:
pos+=1
else:
continue
of Backward:
if pos > 0:
pos-=1
else:
continue
of ForwardBad:
# jump to the next record whose top choice was wrong
for x in pos+1 .. outputDebugData.high:
if outputDebugData[x].originalColor != outputDebugData[x].outputChoices[0][1]:
pos = x
break
of Graph:
var data = outputDebugData[pos].outputChoices.map(x=>x[0])[0 .. 100]
var x = collect(for x in 0 .. 100: x)
discard plt.scatter(x,data)
discard plt.show()
discard plt.clf()
else:
break

108
install.sh Executable file
View file

@ -0,0 +1,108 @@
#!/bin/bash
# Installer for the encoder toolchain: nim/nimble, ffmpeg, python+matplotlib
# and the LDPC-codes suite.
# BUGFIX: shebang changed from /bin/sh to /bin/bash — the script relies on
# the bash-only `function` keyword and `read -s -p`.
function getManager {
    # Echoes the first supported package manager found, or "none".
    if command -v pacman >/dev/null; then
        echo "pacman"
    elif command -v apt >/dev/null; then
        echo "apt"
    elif command -v yum >/dev/null; then
        echo "yum"
    else
        echo "none"
    fi
}
function hasCommand {
    # $1 = command to probe, $2 = consent prompt.
    # Returns 0 when $1 already exists, 1 when the user agreed to install
    # it; exits the script when the user declines.
    if ! command -v "$1" >/dev/null;
    then
        read -s -p $"$2 [N/y]" response
        case "$response" in
            [yY])
                return 1
                ;;
            *)
                echo $"cannot proceed without $1"
                exit 1
                ;;
        esac
    fi
    return 0
}
packageManager=$(getManager)
if ! hasCommand "nimble" "Nimble was not found on this computer, would you like to install nim from Choosenim?"
then
    echo "Installing choosenim via curl + bash"
    curl https://nim-lang.org/choosenim/init.sh -sSf | sh
fi
if ! hasCommand "ffmpeg" "FFMPEG not found, would you like to try and install it?"; then
    case $packageManager in
        pacman)
            echo 'install ffmpeg via pacman core repo'
            sudo pacman -S ffmpeg
            ;;
        yum)
            # ffmpeg lives in rpmfusion, so show the commands and ask first
            echo "FFMPEG is not avalible in the yum core repo"
            echo 'sudo yum install epel-release'
            echo 'sudo yum localinstall --nogpgcheck https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm'
            echo 'sudo yum install ffmpeg ffmpeg-devel'
            read -s -p $'are you ok with the following install commands? [N/y]\n' response
            case "$response" in
                [yY])
                    sudo yum install epel-release
                    sudo yum localinstall --nogpgcheck https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm
                    sudo yum install ffmpeg ffmpeg-devel
                    ;;
                *)
                    echo "cannot proceed without ffmpeg"
                    exit 1
                    ;;
            esac
            ;;
        apt)
            echo 'install ffmpeg via apt core repo'
            sudo apt update
            sudo apt install ffmpeg
            ;;
        none)
            echo "'other' package manager. Please install it yourself before continuing!"
            exit 1
            ;;
    esac
fi
if ! hasCommand "pip" "pip not found, would you like to try and install it?"; then
    case $packageManager in
        pacman)
            sudo pacman -S python
            ;;
        yum)
            sudo yum install -y python3
            ;;
        apt)
            sudo apt install python
            ;;
        none)
            echo "please install python on your own"
            exit 1
            ;;
    esac
fi
# matplotlib is imported from Nim via nimpy for all the graphing paths
if ! python -c 'import pkgutil; print(1 if pkgutil.find_loader("matplotlib") else 0)' > /dev/null; then
    echo "installing matplotlib!"
    pip3 install "matplotlib"
fi
# Build Radford Neal's LDPC-codes suite into ./ldpc/
if ! [ -d "ldpc" ]; then
    mkdir ldpc
    echo "Installing LDPC"
    git clone https://github.com/radfordneal/LDPC-codes
    cd LDPC-codes
    make
    ./LDPC-install ../ldpc/
    cd ..
    rm -rf LDPC-codes
fi
nimble install nimpy simplepng nimpng 'arraymancer@#head' itertools -n illwill > /dev/null

154
tensorCeral.nim Normal file
View file

@ -0,0 +1,154 @@
import strutils
import sugar
import sequtils
import arraymancer
import streams
import os
# CERIALIZATION OPERATIONS
proc serializeTensor*(outy : Stream, dim : array[3, Metadata], data : array[3, seq[byte]], close = true) =
## Takes in a Tensor made of pallete, originalFileColorEncoding, outputData
## This is for training purposes, and is useful for machine learning.
## Header: for each of the 3 tensors, (dim X, dim Y, byte length) as uint32,
## followed by all three flat byte payloads in order.
## close = false rewinds the stream instead of closing it (for reuse).
for x in 0 .. 2:
outy.write(dim[x][0].uint32)
outy.write(dim[x][1].uint32)
# the payload length doubles as a redundancy check on X*Y
outy.write(len(data[x]).uint32)
for x in 0 .. 2:
for x in data[x]:
outy.write(x.char)
outy.flush()
if not close:
outy.setPosition(0)
else:
outy.close()
proc serializeColorPallete*(a : Tensor[byte] | Tensor[float32], output : Stream) =
##Takes a 256x3 palette tensor and writes it as 768 raw bytes (.plte format).
##float32 palettes are truncated to bytes on the way out.
doAssert(a.shape == [256, 3], "must be a valid shape for a pallete")
for x in a:
when a is Tensor[float32]:
output.write(x.byte)
else:
output.write(x)
output.flush()
proc deSerializeColorPallate*(inny : Stream) : Tensor[byte] =
  ## Inverse of serializeColorPallete: reads 768 raw bytes from `inny`
  ## and shapes them into a 256x3 palette tensor.
  const palletBytes = 256 * 3
  var raw = newSeq[byte](palletBytes)
  discard inny.readData(addr raw[0], palletBytes)
  result = raw.toTensor().reshape(256, 3)
proc deSerializeTensors*(outy : Stream) : (Tensor[float32], Tensor[float32], Tensor[float32]) =
## Takes in a Tensor made of pallete, originalFileColorEncoding, outputData
## This is for training purposes, and is useful for machine learning
## Organizes from disk, of arbitrary size.
## Inverse of serializeTensor: reads the nine uint32 header values, then
## the three flat byte payloads, reshaping each into a float32 tensor.
var id : seq[(uint32,uint32,uint32)]
outy.setPosition(0)
# read 3 headers of (dim X, dim Y, payload length)
for x in 1 .. 3:
var temp : seq[uint32]
var buffersize = sizeof(uint32)
var buffer = newSeq[uint32](sizeof(uint32))
for x in 1 .. 3:
discard outy.readData(buffer[0].addr, buffersize)
temp.add(buffer[0])
id.add((temp[0], temp[1], temp[2]))
for x in 0 .. 2:
var buffersize = id[x][2]
var buffer = newSeq[byte](id[x][2])
#i suspect that this code sometimes crashes if its too big
#to resolve this, simply do buffersize/10 then do it in a 10th
discard outy.readData(buffer[0].addr, buffersize.int)
let final = (buffer.map(x=>float32(x)).toTensor.reshape(id[x][0].int, id[x][1].int))
case x:
of 0:
result[0] = final
of 1:
result[1] = final
of 2:
result[2] = final
else:
# unreachable: x is bounded to 0..2 above
echo "imposibru"
template arrangeIo(a,b : string) =
  ## Injects `input`/`output` FileStreams into the calling scope, mapping
  ## "" or "-" to stdin/stdout respectively so every CLI helper can be
  ## used in a shell pipeline.
  var input {.inject.} : FileStream
  var output {.inject.} : FileStream
  if a in ["", "-"]:
    input = newFileStream(stdin)
  else:
    input = newFileStream(a, fmRead)
  if b in ["", "-"]:
    output = newFileStream(stdout)
  else:
    output = newFileStream(b, fmWrite)
proc convertFileToBinary*(a,b = "") =
  ## Expands every input byte into its 8-character "01" text form.
  ## Used by the LDPC shell pipeline, whose tools insist on this textual
  ## binary representation.
  arrangeIo(a, b)
  while not input.atEnd():
    let ch = input.readChar()
    output.write(toBin(int(ch), 8))
  output.flush()
proc convertFileToHex*(a,b = "") =
  ## Inverse of convertFileToBinary: packs each group of 8 "01" characters
  ## read from the input back into a single byte.
  arrangeIo(a, b)
  var bits = newSeq[char](8)
  while not input.atEnd():
    discard input.readData(bits[0].addr, 8)
    output.write(char(parseBinInt(bits.join(""))))
  output.flush()
proc diffCheck*(a : Stream, b : Stream) =
  ## Compares stream `a` against `b` at byte offsets 0..40 of `b` and
  ## reports the lowest mismatch percentage found on stderr (the sweep
  ## compensates for leading-byte misalignment after transmission).
  ## BUGFIX: `incorrect`/`total` are now reset per offset — the original
  ## accumulated them across offsets, so every offset after the first was
  ## diluted by the previous totals.
  var best = 100.0
  for offset in 0 .. 40:
    var incorrect = 0
    var total = 0
    b.setPosition(offset)
    a.setPosition(0)
    while not a.atEnd():
      inc total
      if a.readChar() != b.readChar():
        inc incorrect
    if total > 0:
      let pct = (incorrect / total) * 100
      if pct < best:
        best = pct
  stderr.writeLine best
when isMainModule:
## CLI: -b = bytes -> "01" text, -e = "01" text -> bytes, -d = diff check.
## Blank/absent path arguments mean stdin/stdout.
var params = commandLineParams()
var command = ""
if params.len() == 0:
quit(1)
else:
command = params[0]
# pad with "" so the positional accesses below never raise
params.setlen(3)
if params[0] == "-b":
convertFileToBinary(params[1],params[2])
elif params[0] == "-e":
convertFileToHex(params[1],params[2])
elif command == "-d":
diffCheck(newFileStream(params[1]), newFileStream(params[2]))

6
testCerials.sh Executable file
View file

@ -0,0 +1,6 @@
#!/bin/bash
# Re-runs the decoder over every serialized training tensor and diff-checks
# each reconstruction against the reference file given as $1.
# All expansions are quoted so filenames with spaces survive word splitting.
for filename in trainingdata/*.bin; do
    echo "$filename"
    ./implementation "$filename" out.clone "$1"
    ./tensorCeral -d "$1" out.clone
done

11
tovideo.sh Executable file
View file

@ -0,0 +1,11 @@
#!/bin/sh
# Packs the PNG frames in in/$1/ into an H.264 video, then explodes the
# video back into PNG frames under out/$1/ (simulating a lossy upload).
if ! [ -d "outvideos/" ]; then
    mkdir outvideos
fi
# BUGFIX: this branch used to run `mkdir outvideos` again, so out/ was
# never created here (only papered over by the `mkdir -p` below).
if ! [ -d "out/" ]; then
    mkdir out
fi
# `> /dev/null 2>&1` instead of bash-only `&>`, since the shebang is sh
ffmpeg -r 60 -f image2 -s 600x600 -i "in/$1/in%0d.png" -vcodec libx264 -profile:v high -bf 2 -g 30 -crf 18 -pix_fmt yuv420p -y "outvideos/$1temp.mp4" > /dev/null 2>&1
mkdir -p "out/$1/"
ffmpeg -i "outvideos/$1temp.mp4" -vf fps=60/1 -y "out/$1/out%0d.png" > /dev/null 2>&1

209
training.nim Normal file
View file

@ -0,0 +1,209 @@
import tensorCeral
import arraymancer
import os
import streams
import strformat
import json
import std.jsonutils
import random
import math
import sequtils
import tables
import sugar
import stats
import nimpy
#This program handles the training and serialization of a machine learning algorthym
#Due to current technical limitations around serialization, It cannot be saved unfortunately
#This code is currently not used.
# Maps "number of unique values in a training sample" -> (loss, iteration)
# pairs collected during training; indices 0..301 are pre-seeded empty so
# the training loop can append without key checks.
var outdata = initTable[int, seq[(float32, int)]]()
for x in 0 .. 301:
outdata[x] = @[]
randomize()
proc getNumberOfFiles(path : string) : int =
  ## Counts the directory entries in `path` (used to number the next
  ## stats/model output file).
  ## Cleanup: the leftover debug `echo` of the full listing is removed and
  ## the directory is walked once instead of twice.
  result = 0
  for _ in os.walkDir(path):
    inc result
proc getUnique(a : Tensor[float32]) : int=
  ## Counts the distinct values appearing in `a`. Kept as a linear scan
  ## over a seq to preserve the original float-equality semantics.
  var seen : seq[float]
  for v in a:
    if not seen.contains(v):
      seen.add(v)
  result = seen.len()
# Arraymancer network DSL: a 300 -> 42 -> 300 fully-connected net with a
# ReLU hidden activation, trained to reconstruct 300-value block rows.
network TwoLayersNet:
layers:
fc1: Linear(300, 42)
fc2: Linear(42, 300)
forward x:
x.fc1.relu.fc2
proc save(network: TwoLayersNet[float32], outy : int) =
## Persists all four weight/bias tensors to model/*<outy>.npy files —
## the workaround for arraymancer's missing whole-model serialization.
network.fc1.weight.value.write_npy(&"model/hiddenweight{$outy}.npy")
network.fc1.bias.value.write_npy(&"model/hiddenbias{$outy}.npy")
network.fc2.weight.value.write_npy(&"model/outputweight{$outy}.npy")
network.fc2.bias.value.write_npy(&"model/outputbias{$outy}.npy")
proc load*(ctx: Context[Tensor[float32]], inny : int): TwoLayersNet[float32] =
## Inverse of `save`: rebuilds a TwoLayersNet from model/*<inny>.npy files.
## NOTE(review): `ctx` is unused here — presumably kept for API symmetry
## with `init`; the README also flags model loading as buggy. Verify.
result.fc1.weight.value = read_npy[float32](&"model/hiddenweight{inny}.npy")
result.fc1.bias.value = read_npy[float32](&"model/hiddenbias{inny}.npy")
result.fc2.weight.value = read_npy[float32](&"model/outputweight{inny}.npy")
result.fc2.bias.value = read_npy[float32](&"model/outputbias{inny}.npy")
proc echoUsage() =
## Prints the trainer's CLI usage summary to stdout.
echo "This program requires stdinputs"
echo "    To train a model and save it:"
echo "        -t [tensor1.bin] [tensor2.bin] ..."
echo "    To anaylize its outputs:"
echo "        -s [stats1.json] [stats2.json] ..."
echo "each program can take between 1 and an infinite number of inputs"
when isMainModule:
  var params = commandLineParams()
  # BUG FIX: the original called `params.setlen(1)` here, which threw away
  # every argument after the mode flag (and manufactured an empty-string arg
  # when none were given), so both modes always bailed out with the usage
  # message.  Dropping it lets the real arguments through.
  if params.len() == 0 or not ["-t", "-s"].contains(params[0]):
    echoUsage()
    quit(1)
  if params[0] == "-s":
    # ---- statistics / plotting mode ----
    if params.len() == 1:
      echoUsage()
      quit(1)
    var plt = pyImport("matplotlib.pyplot")
    # subplots() returns (figure, axes); only the axes object(s) are needed
    let fig = plt.subplots(1, len(params)-1)[1]
    var figit = 0
    for x in params[1 .. ^1]:
      var newplot: PyObject
      if len(params) == 2:
        # if plt is given 1,1 it is a different type than 1,2+ because python...
        newplot = fig
      else:
        newplot = fig[figit]
      # stats json maps unique-color-count -> seq of (loss, iteration)
      var table: Table[int, seq[(float, int)]]
      fromJson(table, parseJson(readFile(x)))
      var decomp: seq[int]
      var means: seq[float]
      var full: RunningStat
      var tempith: RunningStat
      var rawTemptih: seq[int]
      var rawstats: seq[float]
      # first pass: occurrence count per unique-color bucket (3..301)
      for x in 3 .. 301:
        if table[x].len() == 0:
          continue
        rawTemptih.add(table[x].len())
        tempith.push(table[x].len().float)
      # second pass: per-bucket mean loss plus an occurrence-normalized mean
      for x in 3 .. 301:
        # 15000 is a magic number, but filters out early training abnormalities
        let temp = table[x].map(x => x[0]).filter(x => x < 15000)
        if temp.len() == 0:
          continue
        else:
          var statistics: RunningStat # must be var
          statistics.push(temp)
          full.push(temp)
          decomp.add(x)
          rawstats.add(statistics.mean)
          # weight buckets by how often they occurred relative to the busiest one
          let percent = 1 - ((((tempith.max - temp.len().float) + tempith.min) * (1 / tempith.max)))
          means.add(statistics.mean*percent)
      # flat line at the global mean, one point per x value so it plots cleanly
      let fullMean = collect(for x in 0 .. decomp.high: full.mean)
      discard newplot.scatter(decomp, rawstats, label="Raw mean value at each occurance")
      discard newplot.scatter(decomp, means, label="Occurance normalized mean")
      discard newplot.plot(decomp, fullMean, label="Global mean")
      discard newplot.set_ylim(0, 15000)
      # NOTE(review): `x[1]` titles the subplot with the 2nd character of the
      # stats filename — looks like it was meant to be the filename `x`; kept
      # as-is pending confirmation.
      discard newplot.set_title(x[1])
      discard newplot.set_ylabel("loss")
      discard newplot.set_xlabel("Amount of different variables")
      discard newplot.legend(loc="upper left")
      figit += 1
    discard plt.show()
    quit()
  if params[0] == "-t":
    # ---- training mode ----
    if params.len() == 1:
      echoUsage()
      quit(1)
    var
      ctx = newContext Tensor[float32]
      model = ctx.init(TwoLayersNet)
      optim = model.optimizerSGD(learning_rate = 1e-5'f32)
    # ring buffer of the last 10 loss values, echoed on Ctrl-C for debugging
    var circular: seq[float32]
    proc addToCache(input: float32) =
      if circular.len() == 10:
        circular.delete(0)
      circular.add(input)
    proc writey(die = false) {.noconv.} =
      ## Saves the model; when `die` is set it also dumps the accumulated
      ## loss stats to ./trainingstats/ and terminates the process.
      model.save(0)
      if die:
        echo circular
        let outint = getNumberOfFiles("./trainingstats/")
        echo &"writingoutput to: ./trainingstats/stats{outint}.json"
        writeFile(&"./trainingstats/stats{outint}.json", $(outdata.toJson()))
        quit()
    proc exit() {.noconv.} =
      writey(true)
    setControlCHook(exit)
    var prev = -1.0'f32
    var it = 0
    # NOTE(review): the original also built a `dicty` table here that was
    # filled with empty seqs and never read (`outdata`, defined above, is the
    # live accumulator) — removed as dead code.
    proc train() =
      # BUG FIX: skip params[0] ("-t") — it is the mode flag, not a tensor
      # file; the original tried to open it as a stream.
      for tensors in params[1 .. ^1]:
        var input = newFileStream(tensors)
        let decompressed = deSerializeTensors(input)
        let max = (decompressed[1].shape[0])-1
        for county in 0 .. max:
          let
            x1 = ctx.variable(decompressed[2][county .. county+1, _])
            y1 = decompressed[1][county .. county+1, _]
            unique = (x1.value).getUnique()
          #around half
          #if unique >= 110:
          #model = models[1]
          var strike = 0
          for t in 0 .. 50:
            var
              y_pred = model.forward(x1)
              loss = y_pred.mse_loss(y1)
            if t mod 10 == 0:
              echo loss.value[0]
            outdata[unique].add((loss.value[0], it))
            # bail out once the loss has been flat for 7 consecutive steps
            if loss.value[0] == prev:
              if strike == 7:
                break
              strike += 1
            if loss.value[0] != prev:
              if strike != 0:
                strike = 0
            loss.backprop()
            optim.update()
            prev = loss.value[0]
          # in case of crashing it writes saves every time
          echo "hmm"
          addToCache(prev)
          it += 1
          writey()
    train()
    writey()

62
unit-test.nim Normal file
View file

@ -0,0 +1,62 @@
import unittest
import arraymancer
import tensorCeral
import cl
import implementation
import math
import sugar
import sequtils
import streams
import strutils
import random
import os
randomize()
suite "encoding":
echo "this is the encoding suite!"
test "Decode Test":
let resolution = 600
let pad = 9
let maxBytes = (resolution^2 / (pad+1)^2)-256
doAssert maxBytes mod 1 == 0
discard existsOrCreateDir("./in/")
discard existsOrCreateDir("./in/0/")
let pallet = genColorMatrix()
let garbage = newStringStream(collect(for x in 1 .. maxBytes.int:
(parseHexStr(toHex(rand(0 .. 255).byte)))).join(""))
let inputData = encodeImage(9, 0, garbage, pallet)[1]
let outputData = fromPng("in/0/in0.png" , 9, 0, false)
check inputData == outputData
test "tensorCerial":
var input = newStringStream("")
serializeTensor(input,
[pallet.shape, inputData.shape, outputData.shape],
[pallet.toFlatSeq(), inputData.toFlatSeq(), outputData.toFlatSeq()], false)
let serializationTest = deSerializeTensors(input)
let toFloat = (
pallet.toFlatSeq().map(x=>x.float32).toTensor().reshape(pallet.shape),
inputData.toFlatSeq().map(x=>x.float32).toTensor().reshape(inputData.shape),
outputdata.toFlatSeq().map(x=>x.float32).toTensor().reshape(outputData.shape)
)
check serializationTest == toFloat
input.setPosition(0)
test "implementation test":
var implementOutput = newStringStream("")
discard getBestBytes(input, implementOutput)
implementOutput.setPosition(0)
garbage.setPosition(0)
check garbage.readAll() == implementOutput.readAll()
test "convertFramesTest":
garbage.setPosition(0)
convertFrames((0, "outTemp.bin", garbage, true, ""))
#honestly if this works, theres no more testing needed, the rest would be implementation issues
#which are already checked....
#I guess it could be in more depth but its really hard tpo at this stage
discard deSerializeTensors(newFileStream("outTemp.bin"))[2]
removeFile("outTemp.bin")