Skip to content

Commit

Permalink
adding pulls from medrt for labeling fxns
Browse files Browse the repository at this point in the history
  • Loading branch information
izzymetzger committed Apr 11, 2020
1 parent ac074a5 commit e598261
Show file tree
Hide file tree
Showing 14 changed files with 1,630 additions and 50 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,8 @@ data-orig/*.csv
*.gz
*.jsonl
data-fasttext/
docs/*.zip
docs/*.zip
tweets
prodigy-1.9.5-cp36.cp37.cp38-cp36m.cp37m.cp38-macosx_10_13_x86_64.whl
prodigy-1.9.5.zip
weights
296 changes: 296 additions & 0 deletions data-rxclass/medrt_ci_with.jsonl.txt

Large diffs are not rendered by default.

296 changes: 296 additions & 0 deletions data-rxclass/medrt_induces.jsonl.txt

Large diffs are not rendered by default.

296 changes: 296 additions & 0 deletions data-rxclass/medrt_mary_diagnose.jsonl.txt

Large diffs are not rendered by default.

296 changes: 296 additions & 0 deletions data-rxclass/medrt_mary_prevent.jsonl.txt

Large diffs are not rendered by default.

296 changes: 296 additions & 0 deletions data-rxclass/medrtrespons.jsonl.txt

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions eval-official.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Usage: eval-official.py <gt_pth> <pred_pth>
takes in the ground truth (e.g., tweetid, class)
usage: eval-official.py [-h] [gt_pth] [pred_pth] [truth_label] [pred_label]
usage: eval-official.py [-h] [gt_pth] [pred_pth] [truth_label] [pred_label]
positional arguments:
gt_pth [data-orig/validation.csv]
Expand All @@ -10,7 +10,7 @@
pred_label [Class]
optional arguments:
-h, --help show this help message and exit
-h, --help show this help message and exit
"""
import sklearn.metrics as sklm
Expand Down
Empty file added ex.py
Empty file.
159 changes: 128 additions & 31 deletions getting_spacy_preds.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -298,7 +298,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -362,7 +362,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -575,7 +575,7 @@
"[2635 rows x 8 columns]"
]
},
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -588,7 +588,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -599,7 +599,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -630,7 +630,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -646,7 +646,7 @@
" 'UNRELATED']"
]
},
"execution_count": 11,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -658,7 +658,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -667,7 +667,7 @@
"['data-orig/validation.csv', 'data-orig/train.csv']"
]
},
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -680,7 +680,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -689,7 +689,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [
{
Expand All @@ -698,7 +698,7 @@
"13172"
]
},
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -712,7 +712,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -879,7 +879,7 @@
"[13172 rows x 6 columns]"
]
},
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -903,7 +903,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -919,7 +926,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -929,14 +936,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('tagger', <spacy.pipeline.pipes.Tagger object at 0x1b5b01e10>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x15bee69f0>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x15bee62f0>), ('textcat', <spacy.pipeline.pipes.TextCategorizer object at 0x12b4ebf50>)]\n"
"[('tagger', <spacy.pipeline.pipes.Tagger object at 0x12a80d210>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x190031980>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x190031830>), ('textcat', <spacy.pipeline.pipes.TextCategorizer object at 0x150e01990>)]\n"
]
}
],
Expand All @@ -946,39 +953,129 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'spacy.tokens.doc.Doc' object has no attribute 'tokens'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-19-0fb9d39db8fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdocs\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpipe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtexts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdocs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtoken\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlemma_\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtoken\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdoc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'spacy.tokens.doc.Doc' object has no attribute 'tokens'"
"name": "stdout",
"output_type": "stream",
"text": [
"oxycontin is a great drug if you wanna be sad and constipated\n",
"\n",
"\n",
"oxycontin oxycontin PROPN NNP nsubj xxxx True False\n",
"is be AUX VBZ ROOT xx True True\n",
"a a DET DT det x True True\n",
"great great ADJ JJ amod xxxx True False\n",
"drug drug NOUN NN attr xxxx True False\n",
"if if SCONJ IN mark xx True True\n",
"you -PRON- PRON PRP nsubj xxx True True\n",
"wanna wanna VERB VBP advcl xxxx True False\n",
"be be AUX VB xcomp xx True True\n",
"sad sad ADJ JJ acomp xxx True False\n",
"and and CCONJ CC cc xxx True True\n",
"constipated constipated ADJ JJ conj xxxx True False\n"
]
}
],
"source": [
"docs= list(web.pipe(texts))\n",
"for i, doc in enumerate(docs):\n",
" text = \" \".join([token.lemma_ for token in doc.tokens])\n",
" print(text)\n",
" print(\" \".join(token.text for token in doc))\n",
" print(\"\\n\")\n",
" for token in doc:\n",
"\n",
" print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
" token.shape_, token.is_alpha, token.is_stop)\n",
" break\n",
" # print(doc.to_json())\n",
" \n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"new_data['label'] = full_df.loc[:,'class']\n",
"new_data['label'] = new_data['label'].map(str.strip)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"100 - 15\n",
"85 - 65\n",
"new_data[['label', 'unprocessed_text', 'tweetid']]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "TypeError",
"evalue": "'(array([ 1, 4, 5, ..., 13168, 13170, 13171]), slice(None, None, None))' is an invalid key",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-34-4f6f674e50dc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m#Select Validation rows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mval_rows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msetdiff1d\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrn_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mnew_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtrn_rows\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/anaconda3/envs/patenv/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2993\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnlevels\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2994\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2995\u001b[0;31m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2996\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2997\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/anaconda3/envs/patenv/lib/python3.7/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2895\u001b[0m )\n\u001b[1;32m 2896\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2897\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2898\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2899\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: '(array([ 1, 4, 5, ..., 13168, 13170, 13171]), slice(None, None, None))' is an invalid key"
]
}
],
"source": [
"np.random.seed(13)\n",
"trn_rows = np.sort(np.random.choice(num_rows, size = int(num_rows * .85), replace = False))\n",
"\n",
"#Select Validation rows\n",
"val_rows = np.setdiff1d(np.arange(num_rows), trn_rows)\n",
"\n",
"#Split dataset\n",
"\n",
"\n",
"#Select Training rows\n",
"np.random.seed(0)\n",
"trn_rows = np.sort(np.random.choice(num_rows, size = int(num_rows * .7), replace = False))\n",
"\n",
"#Select Validation rows\n",
"val_rows = np.setdiff1d(np.arange(num_rows), trn_rows)\n",
"new_data[trn_rows,:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"#Split dataset\n",
"trn_data, val_data = data[trn_rows,1:], data[val_rows,1:]\n",
"trn_Y, val_Y = data[trn_rows,0], data[val_rows,0]"
]
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion nlp_configs/text_classification.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
"type": "bert_adam",
"lr": 0.001
},
"validation_metric": "+average_F1",
"validation_metric": "+a_F1",
"num_serialized_models_to_keep": 3,
"num_epochs": 70,
"patience": 10,
Expand Down
Loading

0 comments on commit e598261

Please sign in to comment.