Skip to content

Commit

Permalink
Merge pull request #22 from yoeo/v2.0.1
Browse files Browse the repository at this point in the history
V2.0.1
  • Loading branch information
yoeo authored Jul 1, 2020
2 parents 4c9780f + eaf2cd4 commit 5537217
Show file tree
Hide file tree
Showing 7 changed files with 12 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/contents.rst
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ Accuracy

Guesslang deep learning model performs very well.
It was tested with 12,000 different source code files and correctly
- guessed the programming language of **93.29%** of them.
+ guessed the programming language of **93.82%** of them.

Most of the misclassifications come from a few languages
that are **compatible** with each other, like C/C++ or JavaScript/TypeScript.
Expand Down
2 changes: 1 addition & 1 deletion guesslang/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@
from guesslang.guess import Guess, GuesslangError # noqa: F401


- __version__ = '2.0.0'
+ __version__ = '2.0.1'
8 changes: 4 additions & 4 deletions guesslang/data/languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,29 @@
"C": ["c"],
"C#": ["cs"],
"C++": ["cpp", "cc"],
- "CSS": ["css"],
"CoffeeScript": ["coffee", "litcoffee"],
+ "CSS": ["css"],
"Erlang": ["erl", "hrl"],
"Go": ["go"],
- "HTML": ["html"],
"Haskell": ["hs", "lhs"],
+ "HTML": ["html"],
"Java": ["java"],
"JavaScript": ["js", "es6"],
"Jupyter Notebook": ["ipynb"],
"Lua": ["lua"],
"Markdown": ["md"],
"Matlab": ["matlab", "m"],
"Objective-C": ["mm", "m"],
- "PHP": ["php"],
"Perl": ["pl", "pm"],
+ "PHP": ["php"],
"PowerShell": ["ps1"],
"Python": ["py"],
"R": ["r", "rdata", "rds", "rda"],
"Ruby": ["rb"],
"Rust": ["rs"],
- "SQL": ["sql"],
"Scala": ["scala"],
"Shell": ["sh"],
+ "SQL": ["sql"],
"Swift": ["swift"],
"TeX": ["tex"],
"TypeScript": ["ts", "tsx"]
Expand Down
Binary file modified guesslang/data/model/saved_model.pb
Binary file not shown.
Binary file modified guesslang/data/model/variables/variables.data-00000-of-00001
Binary file not shown.
Binary file modified guesslang/data/model/variables/variables.index
Binary file not shown.
10 changes: 6 additions & 4 deletions guesslang/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@
class HyperParameter:
"""Model hyper parameters"""
BATCH_SIZE = 100
- NB_TOKENS = 1000
- VOCABULARY_SIZE = 20000
- EMBEDDING_SIZE = int(VOCABULARY_SIZE**0.5)
+ NB_TOKENS = 10000
+ VOCABULARY_SIZE = 5000
+ EMBEDDING_SIZE = max(10, int(VOCABULARY_SIZE**0.5))
DNN_HIDDEN_UNITS = [512, 32]
DNN_DROPOUT = 0.5
+ N_GRAM = 2


class Training:
Expand Down Expand Up @@ -208,7 +209,8 @@ def _preprocess(
def _preprocess_text(data: tf.Tensor) -> tf.Tensor:
"""Feature engineering"""
padding = tf.constant(['']*HyperParameter.NB_TOKENS)
- data = tf.strings.split([data]).values
+ data = tf.strings.bytes_split(data)
+ data = tf.strings.ngrams(data, HyperParameter.N_GRAM)
data = tf.concat((data, padding), axis=0)
data = data[:HyperParameter.NB_TOKENS]
return data

0 comments on commit 5537217

Please sign in to comment.