# Provenance: recovered from the whitespace-collapsed patch of commit
# f3bd7b0207a3eba68bc05113eaa01defa90864e1 ("Initial commit",
# author joachimschmidt557, Sat Jul 13 23:08:45 2019 +0200).
# That commit added three files:
#
#   .gitignore
#     nim_word_prediction
#     data/
#
#   nim_word_prediction.nimble
#     # Package
#     version = "0.1.0"
#     author = "joachimschmidt557"
#     description = "Next word prediction"
#     license = "MIT"
#     srcDir = "src"
#     bin = @["nim_word_prediction"]
#     # Dependencies
#     requires "nim >= 0.20.0"
#     requires "neo >= 0.2.5"
#
#   src/nim_word_prediction.nim  (the module below, with fixes applied)

## Next-word prediction from bigram counts.
##
## Every `-t:FILE` text is split into whitespace-separated words. For each
## pair of adjacent words the counter `matrix[previous, next]` is incremented.
## A prediction for a word is the column with the highest count in that
## word's row.

import parseopt, tables, strutils

import neo

# Side length of the square count matrix. Words whose vocabulary index is
# at or beyond this limit are still registered (so the statistics stay
# accurate) but their transitions cannot be counted.
# NOTE: 20000 x 20000 uint16 cells is roughly 800 MB of memory.
const maxVocabulary = 20000

proc predict(word: string, matrix: Matrix[uint16],
             word2int: Table[string, int],
             int2word: Table[int, string]): string =
  ## Returns the word most often seen directly after `word`.
  ##
  ## Returns `""` when `word` is unknown, when its index lies outside the
  ## matrix, or when no successor has been counted for it. Ties are resolved
  ## in favour of the lowest vocabulary index.
  result = ""
  if word notin word2int:
    return
  let w = word2int[word]
  # A word can be in the vocabulary without owning a matrix row (see
  # `updateMatrix`); reading that row would be out of bounds.
  if w < 0 or w >= matrix.M:
    return
  let successors = matrix.row(w)
  var
    bestIndex = -1
    bestCount = 0u16
  for i, count in successors:
    if count > bestCount:
      bestIndex = i
      bestCount = count
  # bestIndex stays -1 when the whole row is zero, i.e. nothing was learned.
  if bestIndex >= 0:
    result = int2word.getOrDefault(bestIndex)

proc registerWord(word: string, word2int: var Table[string, int],
                  int2word: var Table[int, string]): int =
  ## Returns the vocabulary index of `word`, assigning the next free index
  ## and recording it in both lookup tables on first sight.
  if word in word2int:
    return word2int[word]
  result = word2int.len
  word2int[word] = result
  int2word[result] = word

proc updateMatrix(matrix: var Matrix[uint16], word1: string, word2: string,
                  word2int: var Table[string, int],
                  int2word: var Table[int, string]) =
  ## Records one observation of `word2` following `word1`.
  ##
  ## Non-empty words are added to the vocabulary tables if they are new.
  ## An empty string means "no word": it is never registered and no
  ## transition is counted for it. Transitions involving an index outside
  ## the matrix are ignored.
  let
    w1 = if word1.len > 0: registerWord(word1, word2int, int2word) else: -1
    w2 = if word2.len > 0: registerWord(word2, word2int, int2word) else: -1
  if w1 < 0 or w2 < 0:
    return
  if w1 >= matrix.M or w2 >= matrix.N:
    return
  # Unsigned arithmetic wraps silently, so saturate at the maximum instead
  # of letting a very frequent pair fall back to zero.
  let count = matrix[w1, w2]
  if count < high(uint16):
    matrix[w1, w2] = count + 1u16

proc interactive(matrix: Matrix[uint16],
                 word2int: Table[string, int],
                 int2word: Table[int, string]) =
  ## Reads words from standard input, one per line, and prints the
  ## prediction for each until end of input.
  echo "-= Interactive prediction =-"
  for line in stdin.lines:
    # Strip so that stray spaces or a trailing carriage return do not turn
    # a known word into an unknown one.
    echo predict(line.strip(), matrix, word2int, int2word)

proc main() =
  ## Command-line entry point.
  ##
  ## Options:
  ## * `-i` / `--interactive`  predict for every line read from stdin
  ## * `-s` / `--statistics`   print the vocabulary size
  ## * `-t:FILE` / `--text:FILE`  learn from FILE (may be repeated)
  ##
  ## A positional argument is the word to predict for (default `"I"`) when
  ## not running interactively.
  var
    p = initOptParser()
    interactiveMode = false
    statistics = false
    word = "I"
    texts: seq[string]

  # Parse arguments
  while true:
    p.next()
    case p.kind
    of cmdEnd: break
    of cmdShortOption, cmdLongOption:
      case p.key
      of "i", "interactive":
        interactiveMode = true
      of "s", "statistics":
        statistics = true
      of "t", "text":
        try:
          texts.add(readFile(p.val))
        except IOError:
          quit("Cannot read text file '" & p.val & "': " &
               getCurrentExceptionMsg(), QuitFailure)
      else:
        discard
    of cmdArgument:
      word = p.key

  # Allocate only after the arguments were accepted: the matrix is large.
  var
    matrix = zeros(maxVocabulary, maxVocabulary, uint16)
    word2int = initTable[string, int]()
    int2word = initTable[int, string]()

  # Generate matrix
  for text in texts:
    var prevWord = ""
    # splitWhitespace skips runs of blanks, so no empty tokens are produced.
    for token in text.splitWhitespace():
      updateMatrix(matrix, prevWord, token, word2int, int2word)
      prevWord = token

  if statistics:
    echo "-= Statistics =-"
    echo "Size of vocabulary: " & $word2int.len

  if interactiveMode:
    interactive(matrix, word2int, int2word)
  else:
    echo predict(word, matrix, word2int, int2word)

when isMainModule:
  main()