Skip to content

Commit

Permalink
embedded surface features for word encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
kariminf committed Mar 27, 2023
1 parent cb51b41 commit 1f9ce57
Show file tree
Hide file tree
Showing 83 changed files with 86 additions and 10 deletions.
Empty file modified .codeclimate.yml
100644 → 100755
Empty file.
Empty file modified .eslintignore
100644 → 100755
Empty file.
6 changes: 5 additions & 1 deletion .eslintrc.json
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"env": {
"parserOptions": {
// Required for certain syntax usages
"ecmaVersion": 2020
},
"env": {
"browser": true,
"commonjs": true,
"es6": true,
Expand Down
Empty file modified .gitignore
100644 → 100755
Empty file.
Empty file modified .nycrc
100644 → 100755
Empty file.
Empty file modified .travis.yml
100644 → 100755
Empty file.
Empty file modified CHANGELOG.md
100644 → 100755
Empty file.
Empty file modified CODE_CONVENTION.md
100644 → 100755
Empty file.
Empty file modified CODE_OF_CONDUCT.md
100644 → 100755
Empty file.
Empty file modified CONTRIBUTING.md
100644 → 100755
Empty file.
Empty file modified CREDITS.md
100644 → 100755
Empty file.
Empty file modified FCT.md
100644 → 100755
Empty file.
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified assets/design/logo.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/design/logo.svg
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/design/logo128.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/info.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/lang.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/morpho.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/morpho2.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified assets/trans.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified jsdoc.json
100644 → 100755
Empty file.
Empty file modified package.json
100644 → 100755
Empty file.
Empty file modified package/jslingua-static.js
100644 → 100755
Empty file.
Empty file modified src/_jslgraph.mjs
100644 → 100755
Empty file.
Empty file modified src/_jslml.mjs
100644 → 100755
Empty file.
Empty file modified src/ara/ara.info.mjs
100644 → 100755
Empty file.
Empty file modified src/ara/ara.lang.mjs
100644 → 100755
Empty file.
Empty file modified src/ara/ara.morpho.mjs
100644 → 100755
Empty file.
Empty file modified src/ara/ara.trans.mjs
100644 → 100755
Empty file.
Empty file modified src/eng/eng.info.mjs
100644 → 100755
Empty file.
Empty file modified src/eng/eng.lang.mjs
100644 → 100755
Empty file.
Empty file modified src/eng/eng.morpho.mjs
100644 → 100755
Empty file.
Empty file modified src/eng/eng.sem.mjs
100644 → 100755
Empty file.
81 changes: 77 additions & 4 deletions src/eng/eng.syntax.mjs
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,97 @@ function prepare_pos_tagger(list_tags){
return new BeamMEMM(5, maxent);
}

/**
 * Lowercase prefixes looked up by the surface-feature encoder.
 * An entry's position in this array is its index in the one-hot prefix code,
 * so reordering or inserting entries changes the feature layout.
 */
const prefixes = [
    "ambi", "anti", "astro", "bi", "co", "con", "de", "dis", "em", "extra", "fore", "hetero", "hind", "homo", "im", "in", "inter", "mal", "mid", "mis", "mono", "non",
    "on", "pan", "ped", "post", "pre", "pro", "re", "semi", "sub", "sur", "trans",
    "tri", "twi", "ultra", "un", "uni", "under", "up"
];

/**
 * Lowercase suffixes looked up by the surface-feature encoder.
 * An entry's position in this array is its index in the one-hot suffix code,
 * so reordering or inserting entries changes the feature layout.
 */
const suffixes = [
    "able", "ac", "ize", "age", "al", "an", "ant", "ary", "cracy", "cycle", "dom", "eer", "en",
    "er", "ess", "est", "ette", "ful", "hood", "ible", "ic", "ify", "ion", "ish", "ism", "ity",
    "less", "like", "log", "ment", "ness", "or", "ous", "ship", "th", "ure", "ward", "wise", "y"
];

/**
 * Creates the surface-word encoder from hard-coded parameters.
 *
 * The parameters are five weight rows and five biases, handed unchanged to
 * the Perceptron constructor.
 * NOTE(review): each row presumably holds one weight per surface feature
 * (prefix code + suffix code + 4 flags) — confirm against the training script.
 *
 * @return {Perceptron} the encoder instance
 */
function prepare_sword_encoder(){
    const weights = [
        [-0.019089749082922935, 0.0949365496635437, -0.2024381458759308, -0.23295924067497253, 0.056965406984090805, 0.0008755889721214771, 0.045677099376916885, 0.2468193918466568, -0.22976073622703552, -0.139898419380188, 0.21870265901088715, -0.0011770427227020264, 0.055731382220983505, -0.08659186214208603, 0.1501360833644867, 0.31037119030952454, 0.011553394608199596, 0.19828230142593384, -0.02236812375485897, 0.10368239134550095, -0.02298622764647007, 0.2368248850107193, 0.12931948900222778, -0.04970233887434006, -0.2443927824497223, 0.17874974012374878, -0.18765780329704285, 0.21122601628303528, 0.07515483349561691, -0.22255556285381317, -0.18598832190036774, 0.14200764894485474, -0.20515233278274536, 0.011510211043059826, 0.1898973435163498, 0.1438535451889038, 0.263727605342865, 0.17965008318424225, -0.17252828180789948, -0.0847819596529007, 0.1905154585838318, 0.12278065085411072, 0.05847258120775223, -0.0971096083521843, 0.3522113859653473, -0.04082566499710083, 0.25108492374420166, 0.06060123071074486, -0.179428830742836, 0.20141224563121796, 0.2114289402961731, 0.16857337951660156, -0.1863928586244583, 0.10697580128908157, -0.20252802968025208, -0.1164335384964943, 0.19597943127155304, 0.20037484169006348, -0.2186661809682846, 0.054228782653808594, 0.10660434514284134, 0.01589077152311802, 0.0165458582341671, 0.07873309403657913, -0.14789007604122162, -0.05744950845837593, 0.24231791496276855, -0.17490237951278687, -0.23341742157936096, 0.24886448681354523, 0.21717600524425507, 0.030499808490276337, -0.11108802258968353, 0.246268630027771, 0.003436007536947727, -0.13111461699008942, 0.2211570292711258, 0.11762525886297226, 0.08280894160270691, 0.07130267471075058, 0.2478734403848648, 1.3664286136627197, 0.048447586596012115],
        [-0.04175945371389389, -0.22111184895038605, -0.003240017220377922, 0.0689234733581543, -0.2967080771923065, -0.1555895358324051, 0.009935734793543816, 0.13492843508720398, 0.11727102845907211, -0.06566699594259262, 0.2182627171278, 0.15910327434539795, -0.012012762017548084, -0.2024093121290207, -0.1870095133781433, -0.03516069054603577, 0.14702071249485016, 0.14428797364234924, 0.09489299356937408, 0.0999702513217926, -0.03350687772035599, -0.12169965356588364, 0.23034371435642242, 0.24568144977092743, -0.17468777298927307, -0.1158439889550209, 0.021759746596217155, 0.0340295173227787, -0.1987764984369278, -0.0952814593911171, 0.099922314286232, 0.23957987129688263, -0.19858668744564056, -0.16877907514572144, 0.1904006451368332, -0.0368419885635376, 0.10372596979141235, -0.21279726922512054, 0.19345451891422272, -0.1518803834915161, -0.23212064802646637, 0.20405396819114685, 0.12865571677684784, 0.11202594637870789, 0.05676170065999031, -0.18689154088497162, -0.0055007850751280785, 0.11995755881071091, 0.10199176520109177, -0.045338328927755356, 0.04102403298020363, -0.08865657448768616, -0.12564070522785187, 0.08142564445734024, 0.01593283750116825, -0.21011695265769958, -0.03182750195264816, 0.0599672794342041, -0.024721937254071236, 0.0960252434015274, -0.022328708320856094, -0.17341333627700806, -0.21619555354118347, -0.06565798819065094, -0.1719098538160324, -0.24304082989692688, 0.22640453279018402, -0.07011786848306656, 0.2501624822616577, -0.21166671812534332, 0.08335322886705399, -0.025930389761924744, 0.0014217685675248504, 0.08012422919273376, -0.05592622980475426, 0.10147291421890259, -0.13311758637428284, 0.023639656603336334, -0.032730624079704285, -0.005462504457682371, -0.309675008058548, 0.2623489797115326, 0.06472814083099365],
        [-0.10274799913167953, -0.10992555320262909, -0.012204826809465885, -0.16901206970214844, 0.03708356246352196, -0.011052250862121582, 0.1840939223766327, 0.009660773910582066, -0.21836747229099274, -0.13970306515693665, 0.09355457872152328, 0.19022592902183533, -0.1240251287817955, 0.16051124036312103, -0.20495092868804932, -0.13628412783145905, -0.04342576488852501, -0.17217569053173065, 0.14771904051303864, -0.23477789759635925, -0.042405128479003906, -0.11667008697986603, 0.06800143420696259, 0.22233054041862488, -0.03950042650103569, -0.12480951845645905, 0.09537818282842636, 0.057224467396736145, 0.1336134970188141, 0.12062427401542664, -0.1838708221912384, -0.20382456481456757, -0.18219298124313354, -0.21929173171520233, -0.2538406550884247, 0.05848286673426628, -0.22234730422496796, 0.0810304656624794, 0.11931167542934418, -0.2162596732378006, -0.19250237941741943, -0.1670411378145218, -0.11298147588968277, 0.15196755528450012, 0.1363224983215332, -0.2332717776298523, -0.031796205788850784, 0.08457685261964798, -0.1796877533197403, -0.12425731122493744, -0.04532361403107643, -0.12460728734731674, 0.21209459006786346, 0.03849511221051216, -0.08514319360256195, -0.2082696259021759, 0.17261427640914917, -0.19037547707557678, 0.14501677453517914, -0.029671700671315193, 0.13684837520122528, 0.028267906978726387, 0.1963295340538025, 0.21925385296344757, 0.07807307690382004, 0.11434625834226608, -0.19107310473918915, -0.2539270520210266, -0.05733933672308922, -0.09324216842651367, -0.23545347154140472, -0.13098429143428802, -0.04924963414669037, 0.17910383641719818, 0.1672956496477127, 0.2307862639427185, -0.05820493772625923, -0.11103593558073044, 0.013496684841811657, 0.07791143655776978, 0.2797287702560425, -0.6357272267341614, -0.07104772329330444],
        [0.21987023949623108, 0.012485183775424957, -0.04876673221588135, 0.03182309493422508, -0.08927831053733826, -0.19001401960849762, 0.04178868234157562, -0.006306546740233898, -0.06270190328359604, -0.06914865970611572, 0.2303316295146942, -0.006603896617889404, -0.056390341371297836, -0.13303698599338531, 0.011619780212640762, 0.04141005128622055, -0.24393022060394287, 0.19228069484233856, -0.18440403044223785, 0.11990461498498917, 0.011132351122796535, 0.007160402834415436, 0.026231545954942703, -0.1694653034210205, 0.05806493014097214, -0.006827455013990402, 0.10754776746034622, -0.03764442354440689, -0.2896781861782074, 0.1311616748571396, 0.23938629031181335, 0.21311599016189575, -0.11922436207532883, 0.03621042147278786, 0.2001267820596695, 0.11333446204662323, 0.05146946385502815, 0.12783287465572357, -0.1573307365179062, 0.22813066840171814, -0.1576394885778427, 0.024373721331357956, 0.21810317039489746, -0.007392882835119963, 0.007976268418133259, 0.1394023299217224, 0.23066137731075287, -0.10553599894046783, -0.06927445530891418, 0.10312825441360474, -0.0908733382821083, -0.04632771015167236, -0.12198816239833832, -0.2847231924533844, -0.18160369992256165, -0.021979769691824913, 0.22066129744052887, 0.002678598277270794, -0.017257871106266975, 0.17352178692817688, -0.08019909262657166, -0.10315138846635818, 0.040415383875370026, 0.10070778429508209, -0.09801621735095978, 0.1648799180984497, -0.16464388370513916, 0.03984922915697098, 0.2345651090145111, 0.176201730966568, 0.22800213098526, 0.00150773033965379, 0.10021919757127762, -0.07123491168022156, 0.14562644064426422, -0.04310502111911774, -0.08549286425113678, 0.0870271772146225, -0.11477880924940109, -0.05467294156551361, -0.20955273509025574, -2.030499219894409, 0.263904333114624],
        [0.1737711876630783, 0.21287298202514648, 0.13703426718711853, 0.05114373192191124, 0.3090777099132538, -0.06760753691196442, 0.25383588671684265, 0.0814141258597374, 0.17700406908988953, -0.017755666747689247, -0.03538801893591881, -0.13778722286224365, 0.10023215413093567, -0.12094180285930634, 0.08061479032039642, 0.11519240587949753, -0.03853100165724754, 0.17693044245243073, 0.01114025991410017, -0.09887238591909409, -0.12937650084495544, 0.07972727715969086, -0.09904258698225021, -0.14781339466571808, -0.2328362911939621, -0.008145524188876152, -0.007910755462944508, -0.049190834164619446, 0.33650344610214233, 0.2451695203781128, 0.004534287843853235, -0.16439726948738098, 0.2302590310573578, -0.18051163852214813, -0.23098406195640564, -0.056708402931690216, 0.024921976029872894, -0.17326530814170837, -0.11249354481697083, 0.17097271978855133, 0.17457227408885956, -0.11715912818908691, -0.20786121487617493, -0.20078858733177185, 0.2041735202074051, 0.02213398925960064, 0.13261418044567108, -0.03977798670530319, -0.21977819502353668, -0.10169807821512222, 0.12011686712503433, -0.19312666356563568, -0.15392786264419556, 0.18250904977321625, 0.27822667360305786, 0.08854299038648605, 0.22531531751155853, 0.071408711373806, 0.25603216886520386, 0.06393701583147049, 0.11943496763706207, 0.07711680233478546, 0.13901108503341675, -0.030235853046178818, -0.2457531988620758, 0.1230500191450119, 0.07456441968679428, -0.03994471952319145, 0.2565605044364929, -0.16746294498443604, 0.031984321773052216, -0.05371256172657013, -0.10389533638954163, 0.033401358872652054, 0.15390899777412415, 0.03867097571492195, 0.2618665397167206, -0.2191573679447174, 0.2052084058523178, 0.36019366979599, -0.06510556489229202, -1.3831733465194702, 0.12917660176753998],
    ];
    const biases = [1.7165316343307495, -1.8890769481658936, 2.1785964965820312, -1.719642162322998, 1.9550516605377197];
    return new Perceptron(weights, biases);
}


class EngSyntax extends Syntax {
// Tag list handed to the PoS tagger factory below; empty at declaration.
static list_tags = [];
// PoS tagger built once, at class-definition time, by prepare_pos_tagger()
// (which returns a BeamMEMM) from the tag list above.
static memm = prepare_pos_tagger(this.list_tags);
// Surface-word encoder: the Perceptron returned by prepare_sword_encoder(),
// applied to the output of __surface_word_features().
static swenc = prepare_sword_encoder()

static __surface_word_features(word){
const lword = word.toLowerCase();
// encode prefix
const pref_code = new Array(prefixes.length).fill(0);
if (lword.length > 5){
for(let i=5; i> 0; i--){
idx = prefixes.indexOf(lword.slice(0, i));
if ( idx != -1){
pref_code[idx] = 1;
break;
}
}
}
// encode suffix
const suff_code = new Array(suffixes.length).fill(0);
if (lword.length > 5){
for(let i=5; i> 0; i--){
idx = suffixes.indexOf(lword.slice(-i));
if ( idx != -1){
suff_code[idx] = 1;
break;
}
}
}
// encode features
const feat_code = [];
//------contains a number
feat_code.push((/\d/.test(word))? 1: 0);
//------contains a hyphen
feat_code.push((/-/.test(word))? 1: 0);
//------contains an uppercase
feat_code.push((/[A-Z]/.test(word))? 1: 0);
//------contains a period (dot)
feat_code.push((/\./.test(word))? 1: 0);

return pref_code.concat(suff_code).concat(feat_code);

}

/**
* Encoding all sentence words
* Encoding all sentence words for PoS tagging
*
* @protected
* @final
* @static
* @param {String[]} sentence list of words in sentence
* @return {float[[]]} encoding of each word
*/
static _words_encode(sentence){
static _pos_words_encode(sentence){
const codes = [];
const N = sentence.length;
let past = new Array(10).fill(0);
sentence.forEach((word, i)=>{
const code = EngSem.word_embedding(word);
//TODO: check suffixes and add their encoding to word embedding
let current = EngSem.word_embedding(word);
current = current.concat(this.swenc.predict(this.__surface_word_features(word)));
let future;
if (i + 1 < N) future = new Array(15).fill(1);
else {
future = EngSem.word_embedding(sentence[i+1]);
future = future.concat(this.swenc.predict(this.__surface_word_features(sentence[i+1])));
}

const code = past.concat(current).concat(future);

past = current;

codes.push(code);
}, this);

Expand Down
Empty file modified src/eng/eng.trans.mjs
100644 → 100755
Empty file.
Empty file modified src/fra/fra.info.mjs
100644 → 100755
Empty file.
Empty file modified src/fra/fra.lang.mjs
100644 → 100755
Empty file.
Empty file modified src/fra/fra.morpho.mjs
100644 → 100755
Empty file.
Empty file modified src/fra/fra.trans.mjs
100644 → 100755
Empty file.
Empty file modified src/info.mjs
100644 → 100755
Empty file.
Empty file modified src/jpn/jpn.info.mjs
100644 → 100755
Empty file.
Empty file modified src/jpn/jpn.lang.mjs
100644 → 100755
Empty file.
Empty file modified src/jpn/jpn.morpho.mjs
100644 → 100755
Empty file.
Empty file modified src/jpn/jpn.trans.mjs
100644 → 100755
Empty file.
Empty file modified src/jslingua.mjs
100644 → 100755
Empty file.
Empty file modified src/lang.mjs
100644 → 100755
Empty file.
Empty file modified src/morpho.mjs
100644 → 100755
Empty file.
Empty file modified src/sem.mjs
100644 → 100755
Empty file.
9 changes: 4 additions & 5 deletions src/syntax.mjs
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,15 @@ class Syntax {
//==========================================

/**
* Encoding a word
* Encoding all sentence words for PoS tagging
*
* @protected
* @abstract
* @final
* @static
* @param {int} i current position
* @param {String[]} sentence list of words in sentence
* @return {float[[]]} encoding of each word
*/
static _words_encode(sentence){
static _pos_words_encode(sentence){
return [[]];
}

Expand Down Expand Up @@ -171,7 +170,7 @@ class Syntax {
* @return {String[]} list of tags of these words
*/
static pos_tag(sentence){
const encoded = this._words_encode(sentence);
const encoded = this._pos_words_encode(sentence);
this.memm.init(encoded[0]);
for(let i = 1; i < sentence.length; i++){
this.memm.step(encoded[i]);
Expand Down
Empty file modified src/tools.js
100644 → 100755
Empty file.
Empty file modified src/trans.mjs
100644 → 100755
Empty file.
Empty file modified test/browser/aralang_test.html
100644 → 100755
Empty file.
Empty file modified test/browser/lang_test.html
100644 → 100755
Empty file.
Empty file modified test/browser/morpho_test.html
100644 → 100755
Empty file.
Empty file modified test/browser/trans_test.html
100644 → 100755
Empty file.
Empty file modified test/helpers/isri_test.py
100644 → 100755
Empty file.
Empty file modified test/mocha.opts
100644 → 100755
Empty file.
Empty file modified test/nodejs/http_test.js
100644 → 100755
Empty file.
Empty file modified test/nodejs/test.js
100644 → 100755
Empty file.
Empty file modified test/nodejs/test_conj.js
100644 → 100755
Empty file.
Empty file modified test/nodejs/test_embedding.mjs
100644 → 100755
Empty file.
Empty file modified test/unit/ara.info_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/ara.lang_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/ara.morpho_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/ara.trans_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/eng.info_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/eng.lang_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/eng.morpho_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/eng.trans_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/fra.info_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/fra.lang_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/fra.morpho_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/fra.trans_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/jpn.info_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/jpn.lang_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/jpn.morpho_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/jpn.trans_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/lang_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/morpho_t.js
100644 → 100755
Empty file.
Empty file modified test/unit/trans_t.js
100644 → 100755
Empty file.
Empty file modified webpack.config.js
100644 → 100755
Empty file.

0 comments on commit 1f9ce57

Please sign in to comment.