ICLRandD · DeNeutoy · Jul 10, 2020 · Jul 10, 2020 · Jul 12, 2020 · Jul 12, 2020
diff --git a/data/meta_medium.json b/data/meta_medium.json
@@ -6,5 +6,7 @@
   "author":"Daniel Hoadley and Mark Neumann",
   "email": "[email protected]",
   "url":"https://github.com/ICLRandD/Blackstone",
-  "license":"Apache 2.0"
+  "license":"Apache 2.0",
+  "version": "0.5.0",
+  "spacy_version": ">=2.3.0,<3.0.0"
 }
diff --git a/data/meta_small.json b/data/meta_small.json
@@ -6,5 +6,7 @@
   "author":"Daniel Hoadley and Mark Neumann",
   "email": "[email protected]",
   "url":"https://github.com/ICLRandD/Blackstone",
-  "license":"Apache 2.0"
+  "license":"Apache 2.0",
+  "version": "0.5.0",
+  "spacy_version": ">=2.3.0,<3.0.0"
 }
diff --git a/scripts/init_model.py b/scripts/init_model.py
diff --git a/scripts/release_medium.sh b/scripts/release_medium.sh
@@ -0,0 +1,31 @@
+
+set -e
+VERSION=${1:?usage: release_medium.sh VERSION}
+
+# Notes
+# Important to install spacy-lookups-data before running this script, as otherwise
+# models don't have lemmatization and normalization data.
+# TODO:
+# - add version not from metadata files
+# - try out pretraining with raw text
+# - clean up release models after the fact
+# - explicitly clone the EWT repo
+
+mkdir -p release
+
+spacy init-model en ./release/base_medium --freqs-loc ./for_mark/word_freqs.txt -v ./word2vec.txt
+
+# Parser, starting from base model
+spacy train en ./release/parser_tagger_medium ../UD_English-EWT/en_ewt-ud-train.json ../UD_English-EWT/en_ewt-ud-dev.json -G --pipeline tagger,parser --n-iter 10 --base-model ./release/base_medium
+# NER, starting from best parsing model
+spacy train en ./release/ner_medium ./train.json ./dev.json -G --pipeline ner --n-iter 10 --base-model ./release/parser_tagger_medium/model-best
+
+# Package
+spacy package release/ner_medium/model-best release/ -m ./data/meta_medium.json
+
+# Drop down into the package directory, build the sdist, copy it up into release/ and return.
+current=$(pwd)
+cd "release/en_core_law_md-${VERSION}"
+python setup.py sdist
+cp dist/* ../
+cd "${current}"
diff --git a/scripts/release_small.sh b/scripts/release_small.sh
@@ -0,0 +1,31 @@
+
+set -e
+VERSION=${1:?usage: release_small.sh VERSION}
+
+# Notes
+# Important to install spacy-lookups-data before running this script, as otherwise
+# models don't have lemmatization and normalization data.
+# TODO:
+# - add version not from metadata files
+# - try out pretraining with raw text
+# - clean up release models after the fact
+# - explicitly clone the EWT repo
+
+mkdir -p release
+
+spacy init-model en ./release/base_small --freqs-loc ./for_mark/word_freqs.txt
+
+# Parser, starting from base model
+spacy train en ./release/parser_tagger_small ../UD_English-EWT/en_ewt-ud-train.json ../UD_English-EWT/en_ewt-ud-dev.json -G --pipeline tagger,parser --n-iter 10 --base-model ./release/base_small
+# NER, starting from best parsing model
+spacy train en ./release/ner_small ./train.json ./dev.json -G --pipeline ner --n-iter 10 --base-model ./release/parser_tagger_small/model-best
+
+# Package
+spacy package release/ner_small/model-best release/ -m ./data/meta_small.json
+
+# Drop down into the package directory, build the sdist, copy it up into release/ and return.
+current=$(pwd)
+cd "release/en_core_law_sm-${VERSION}"
+python setup.py sdist
+cp dist/* ../
+cd "${current}"
diff --git a/scripts/train_vectors.py b/scripts/train_vectors.py
@@ -24,7 +24,7 @@ def compute_vectors(input_path: Path, output_path: Path):
     sentences = LineSentence(input_path)
     bigram_transformer = Phrases(sentences)
     model = Word2Vec(
-        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
+        bigram_transformer[sentences], size=100, window=5, min_count=5, workers=4
     )
     print(f"Saving vectors to {output_path}")
     model.wv.save_word2vec_format(output_path, binary=False)

diff --git a/setup.py b/setup.py
@@ -38,7 +38,7 @@
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     license="Apache",
     install_requires=[
-        "spacy==2.1.8",
+        "spacy>=2.3.0,<3.0.0",
         "requests", # required for the legislation linker.
         "conllu",
         "numpy",