Skip to content

Commit

Permalink
Loading text tutorial: fixed OOV handling
Browse files Browse the repository at this point in the history
  • Loading branch information
vaharoni committed Aug 27, 2023
1 parent e4ab8e0 commit f2ea5cf
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions site/en/tutorials/load_data/text.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1140,8 +1140,9 @@
},
"outputs": [],
"source": [
"keys = vocab\n",
"values = range(2, len(vocab) + 2) # Reserve `0` for padding, `1` for OOV tokens.\n",
"# Reserve `0` for padding, `1` for OOV tokens.\n",
"keys = ['', '[UNK]'] + vocab\n",
"values = range(len(keys))\n",
"\n",
"init = tf.lookup.KeyValueTensorInitializer(\n",
" keys, values, key_dtype=tf.string, value_dtype=tf.int64)\n",
Expand Down Expand Up @@ -1171,6 +1172,8 @@
" standardized = tf_text.case_fold_utf8(text)\n",
" tokenized = tokenizer.tokenize(standardized)\n",
" vectorized = vocab_table.lookup(tokenized)\n",
" # StaticVocabularyTable maps out-of-vocabulary tokens to the ID `len(keys)` (i.e. `len(vocab) + 2`). Remap that ID to 1, the index reserved for OOV tokens.\n",
" vectorized = tf.where(vectorized == len(keys), tf.constant(1, dtype=tf.int64), vectorized)\n",
" return vectorized, label"
]
},
Expand Down

0 comments on commit f2ea5cf

Please sign in to comment.