From 6a756ed2afbe8914305e8e503da377309b7b222b Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 30 Nov 2023 03:27:48 +0000
Subject: [PATCH] Fix t5 tokenizer presets

---
 keras_nlp/models/t5/t5_presets.py   | 24 ++++++++++++------------
 keras_nlp/models/t5/t5_tokenizer.py |  7 +++++++
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/keras_nlp/models/t5/t5_presets.py b/keras_nlp/models/t5/t5_presets.py
index dd2bea7a4e..699ea1ce76 100644
--- a/keras_nlp/models/t5/t5_presets.py
+++ b/keras_nlp/models/t5/t5_presets.py
@@ -41,8 +41,8 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/t5_small_multi/v1/model.weights.h5",
         "weights_hash": "2e10b5f72405d464ee55026b07e60741",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/t5_small_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/t5_small_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
     "t5_base_multi": {
         "metadata": {
@@ -70,8 +70,8 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/t5_base_multi/v1/model.weights.h5",
         "weights_hash": "bed6ef276cfe83d1323467051211978d",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/t5_base_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/t5_base_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
     "t5_large_multi": {
         "metadata": {
@@ -99,8 +99,8 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/t5_large_multi/v1/model.weights.h5",
         "weights_hash": "7854a05c2e6812899bf6f0f104792cda",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/t5_large_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/t5_large_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
     "flan_small_multi": {
         "metadata": {
@@ -129,8 +129,8 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/flan_small_multi/v1/model.weights.h5",
         "weights_hash": "aa0fbaddb1759ef313bbc4f9e4f1e197",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/flan_small_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/flan_small_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
     "flan_base_multi": {
         "metadata": {
@@ -158,8 +158,8 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/flan_base_multi/v1/model.weights.h5",
         "weights_hash": "84a10bec83fd093931bb2a6264115d31",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/flan_base_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/flan_base_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
     "flan_large_multi": {
         "metadata": {
@@ -187,7 +187,7 @@
         "preprocessor_config": {},
         "weights_url": "https://storage.googleapis.com/keras-nlp/models/flan_large_multi/v1/model.weights.h5",
         "weights_hash": "513f530ce790efa7e261c0ef965f3697",
-        "vocabulary_url": "https://storage.googleapis.com/keras-nlp/models/flan_large_multi/v1/vocab.spm",
-        "vocabulary_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
+        "spm_proto_url": "https://storage.googleapis.com/keras-nlp/models/flan_large_multi/v1/vocab.spm",
+        "spm_proto_hash": "9d15ef55d09d5a425ceb63fa31f7cae3",
     },
 }
diff --git a/keras_nlp/models/t5/t5_tokenizer.py b/keras_nlp/models/t5/t5_tokenizer.py
index ec5f0bf324..28499ae530 100644
--- a/keras_nlp/models/t5/t5_tokenizer.py
+++ b/keras_nlp/models/t5/t5_tokenizer.py
@@ -11,8 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 
 from keras_nlp.api_export import keras_nlp_export
+from keras_nlp.utils.python_utils import classproperty
+from keras_nlp.models.t5.t5_presets import backbone_presets
 from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
 
 
@@ -96,3 +99,7 @@ def set_proto(self, proto):
             self.end_token_id = None
             self.pad_token_id = None
             self.start_token_id = None
+
+    @classproperty
+    def presets(cls):
+        return copy.deepcopy(backbone_presets)