diff --git a/src/beignet/data/tokenizers/protein_tokenizer/__init__.py b/src/beignet/data/tokenizers/3di_tokenizer/__init__.py similarity index 100% rename from src/beignet/data/tokenizers/protein_tokenizer/__init__.py rename to src/beignet/data/tokenizers/3di_tokenizer/__init__.py diff --git a/src/beignet/data/tokenizers/protein_tokenizer/special_tokens_map.json b/src/beignet/data/tokenizers/3di_tokenizer/special_tokens_map.json similarity index 100% rename from src/beignet/data/tokenizers/protein_tokenizer/special_tokens_map.json rename to src/beignet/data/tokenizers/3di_tokenizer/special_tokens_map.json diff --git a/src/beignet/data/tokenizers/3di_tokenizer/tokenizer_config.json b/src/beignet/data/tokenizers/3di_tokenizer/tokenizer_config.json new file mode 100644 index 0000000000..2c80b6fa89 --- /dev/null +++ b/src/beignet/data/tokenizers/3di_tokenizer/tokenizer_config.json @@ -0,0 +1,6 @@ +{ + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "model_max_length": 1024, + "tokenizer_class": "SequenceTokenizer" +} diff --git a/src/beignet/data/tokenizers/3di_tokenizer/vocab.txt b/src/beignet/data/tokenizers/3di_tokenizer/vocab.txt new file mode 100644 index 0000000000..cc8e0c5bcd --- /dev/null +++ b/src/beignet/data/tokenizers/3di_tokenizer/vocab.txt @@ -0,0 +1,53 @@ + + + + +L +A +G +V +S +E +R +T +I +D +P +K +Q +N +F +Y +M +H +W +C +X +B +U +Z +O +. +- + + +p +y +n +w +r +q +h +g +d +l +v +t +m +f +s +a +e +i +k +c diff --git a/src/beignet/data/tokenizers/ab_tokenizer/__init__.py b/src/beignet/data/tokenizers/ab_tokenizer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/beignet/data/tokenizers/ab_tokenizer/special_tokens_map.json b/src/beignet/data/tokenizers/ab_tokenizer/special_tokens_map.json new file mode 100644 index 0000000000..88d51c7b12 --- /dev/null +++ b/src/beignet/data/tokenizers/ab_tokenizer/special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "cls_token": "[CLS]", + "eos_token": "[SEP]", + "mask_token": "[MASK]", + "pad_token": "[PAD]", + "unk_token": "[UNK]" +} diff --git a/src/beignet/data/tokenizers/ab_tokenizer/tokenizer_config.json b/src/beignet/data/tokenizers/ab_tokenizer/tokenizer_config.json new file mode 100644 index 0000000000..5343da0614 --- /dev/null +++ b/src/beignet/data/tokenizers/ab_tokenizer/tokenizer_config.json @@ -0,0 +1,6 @@ +{ + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "model_max_length": 1000000, + "tokenizer_class": "SequenceTokenizer" +} diff --git a/src/beignet/data/tokenizers/ab_tokenizer/vocab.txt b/src/beignet/data/tokenizers/ab_tokenizer/vocab.txt new file mode 100644 index 0000000000..6aa04bee98 --- /dev/null +++ b/src/beignet/data/tokenizers/ab_tokenizer/vocab.txt @@ -0,0 +1,322 @@ +[PAD] +[UNK] +[CLS] +[SEP] +[MASK] +A +C +D +E +F +G +H +I +K +L +M +N +P +Q +R +S +T +V +W +Y +- +. +[VH] +[VL] +[AG] +[HUMAN] +[MOUSE] +[RAT] +[RABBIT] +[PIG] +[RHESUS] +[IgG] +[hcIgG] +[IGHV4-31*02] +[IGHV2-6-4*01] +[IGHV3-23*04] +[IGHV1-69*02] +[IGHV3-30*01] +[IGHV1S69*01] +[IGHV8-12*01] +[IGHV1S7*01] +[IGHV1S56*01] +[IGHV8-8*01] +[IGHV1S45*01] +[IGHV2-6-7*01] +[IGHV3-30*02] +[IGHV3-72*01] +[IGHV4-1*02] +[IGHV1-7*01] +[IGHV3-66*01] +[IGHV3-23*01] +[IGHV5-17*01] +[IGHV5-9*01] +[IGHV3-74*01] +[IGHV2-6*02] +[IGHV1S44*01] +[IGHV1-46*01] +[IGHV3-11*05] +[IGHV2-9*02] +[IGHV3-48*03] +[IGHV1S40*01] +[IGHV1S12*01] +[IGHV3-53*03] +[IGHV3-2*02] +[IGHV6-6*01] +[IGHV1-4*01] +[IGHV1S35*01] +[IGHV3-9*01] +[IGHV3-33*01] +[IGHV2-9-1*01] +[IGHV3-48*01] +[IGHV2-6-5*01] +[IGHV5-9-3*01] +[IGHV3-21*01] +[IGHV3-6*01] +[IGHV1S47*01] +[IGHV2-6-1*01] +[IGHV14-1*01] +[IGHV3-11*01] +[IGHV1-53*01] +[IGHV5-12-2*01] +[IGHV5-4*02] +[IGHV9-2-1*01] +[IGHV4-39*01] +[IGHV3-23*03] +[IGHV3-7*01] +[IGHV1-22*01] +[IGHV1-75*01] +[IGHV3-48*02] +[IGHV3-1*01] +[IGHV5-6-5*01] +[IGHV1-34*02] +[IGHV3-74*03] +[IGHV3-1*02] +[IGHV1-54*02] +[IGHV3-15*01] +[IGHV1-82*01] +[IGHV3-53*01] +[IGHV1-19*01] +[IGHV11-2*01] +[IGHV3-64*04] +[IGHV1-9*01] +[IGHV3-8*02] +[IGHV3-30-3*02] +[IGHV1-69*10] +[IGHV3-30*18] +[IGHV3-30*03] +[IGHV1-2*02] +[IGHV1S14*01] +[IGHV3-20*01] +[IGHV13-2*02] +[IGHV1-64*01] +[IGHV2-2*01] +[IGHV3-20*04] +[IGHV5-6-3*01] +[IGHV5-12*01] +[IGHV3-8*01] +[IGHV1-58*01] +[IGHV4-59*01] +[IGHV1S43*01] +[IGHV1S52*01] +[IGHV1-34*01] +[IGHV1S17*01] +[IGHV7-4*01] +[IGHV1S53*03] +[IGHV1-54*01] +[IGHV3-64*07] +[IGHV2-6*01] +[IGHV7-4*04] +[IGHV1-69*01] +[IGHV4-61*01] +[IGHV2-3*01] +[IGHV1S29*02] +[IGHV2-6-2*01] +[IGHV4-34*09] +[IGHV1S36*01] +[IGHV1-84*01] +[IGHV6-3*01] +[IGHV1-80*01] +[IGHV1-66*01] +[IGHV2-9*01] +[IGHV2-6-6*01] +[IGHV5-15*01] +[IGHV5-12-1*01] +[IGHV3-33*03] +[IGHV7-3*02] +[IGHV5-6-4*02] +[IGHV1-12*01] +[IGHV5-51*01] +[IGHV1S13*01] +[IGHV3-NL1*01] +[IGHV9-1*02] +[IGHV4-4*08] +[IGHV4-59*11] +[IGHV4-30-4*01] +[IGHV2-4*02] +[IGHV8-6*01] +[IGHV3-11*03] +[IGHV5-12*02] +[IGHV5-16*01] +[IGHV6-7*02] +[IGHV10-1*02] +[IGHV1S53*01] +[IGHV2-2*03] +[IGHV1-55*01] +[IGHV2-9-2*01] +[IGHV4-38-2*02] +[IGHV3-43*02] +[IGHV6-6*02] +[IGHV3-49*04] +[IGHV1-26*01] +[IGHV1-50*01] +[IGHV1-5*01] +[IGHV5-6-4*01] +[IGHV1-62-2*01] +[IGHV3-23*02] +[IGHV3-49*02] +[IGHV3-30*15] +[IGHV14-3*02] +[IGHV1-46*04] +[IGHV1-18*01] +[IGHV2-70*01] +[IGHV1S26*01] +[IGHV4-39*02] +[IGHV5-6*01] +[IGHV3-64D*06] +[IGHV1-36*01] +[IGHV1-69*04] +[IGHV5-9-4*01] +[IGHV3-43*01] +[IGHV4-59*02] +[IGHV1-85*01] +[IGHV3-49*01] +[IGHV2-3-1*01] +[IGHV5-9*03] +[IGHV5-9-1*01] +[IGHV2-5*08] +[IGHV5-9*02] +[IGHV2-4*01] +[IGHV10-3*01] +[IGHV3-53*02] +[IGHV6-3*02] +[IGHV10-1*01] +[IGHV2-26*01] +[IGKV1-33*01] +[IGKV3S8*01] +[IGKV1-16*01] +[IGKV2-28*01] +[IGKV12S24*01] +[IGKV3S9*01] +[IGKV1S3*02] +[IGKV6S11*01] +[IGKV22S1*01] +[IGKV1-39*01] +[IGKV1-12*01] +[IGKV1S3*01] +[IGKV4-57-1*01] +[IGKV6-21*01] +[IGKV12-38*01] +[IGKV16-104*01] +[IGKV1S2*01] +[IGKV12S17*01] +[IGKV14S14*01] +[IGKV1S6*01] +[IGKV10-94*02] +[IGKV1-5*01] +[IGKV12S14*01] +[IGKV12-46*01] +[IGLV1-44*01] +[IGKV5-43*01] +[IGKV22S4*01] +[IGKV12-98*01] +[IGKV1-117*01] +[IGKV3S1*01] +[IGKV1-5*03] +[IGKV12S16*01] +[IGLV3S2*01] +[IGKV1S5*01] +[IGKV4-1*01] +[IGKV12-44*01] +[IGKV1-8*01] +[IGKV8S6*01] +[IGKV1-117*02] +[IGLV4S1*01] +[IGKV3-4*01] +[IGKV22S2*01] +[IGLV2-23*01] +[IGKV1-13*02] +[IGKV1-17*01] +[IGKV1-9*01] +[IGLV3S1*01] +[IGKV22S7*01] +[IGKV3S19*01] +[IGKV14S15*01] +[IGKV8S5*01] +[IGKV1-27*01] +[IGKV10-94*03] +[IGKV3S18*01] +[IGKV14S8*01] +[IGKV1D-16*01] +[IGKV14S1*01] +[IGKV3-11*01] +[IGKV12-41*01] +[IGKV1-17*03] +[IGLV2-23*02] +[IGKV3-2*01] +[IGKV10-94*07] +[IGKV14S18*01] +[IGKV1-110*01] +[IGKV6-20*01] +[IGKV8-30*01] +[IGKV1-135*01] +[IGKV1S2*02] +[IGKV4-70*01] +[IGKV14-111*01] +[IGKV6S8*01] +[IGKV1-133*01] +[IGKV10-96*01] +[IGKV14S9*01] +[IGKV4-59*01] +[IGLV2-14*02] +[IGLV2-18*01] +[IGKV1-6*01] +[IGKV19-93*01] +[IGKV4-50*01] +[IGLV1-40*01] +[IGKV1-NL1*01] +[IGKV8-21*01] +[IGKV6-17*01] +[IGLV2S1*01] +[IGKV14S13*01] +[IGKV4-61*01] +[IGKV10-94*06] +[IGKV1-5*02] +[IGKV4-91*01] +[IGKV15S4*01] +[IGKV2-29*02] +[IGKV6-b*01] +[IGKV5-45*01] +[IGKV6-d*01] +[IGKV6-25*01] +[IGKV22S9*01] +[IGLV2-11*01] +[IGKV12S22*01] +[IGLV2-14*01] +[IGKV3-20*01] +[IGKV5-48*01] +[IGKV13-84*01] +[IGLV1-47*01] +[IGKV3-10*01] +[IGLV3-21*02] +[IGKV10-96*03] +[IGLV1-51*01] +[IGKV3-7*01] +[IGKV3-1*01] +[IGKV1-17*02] +[IGLV3-19*01] diff --git a/src/beignet/data/tokenizers/cdna_tokenizer/__init__.py b/src/beignet/data/tokenizers/cdna_tokenizer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/beignet/data/tokenizers/cdna_tokenizer/special_tokens_map.json b/src/beignet/data/tokenizers/cdna_tokenizer/special_tokens_map.json new file mode 100644 index 0000000000..9a725bd8b1 --- /dev/null +++ b/src/beignet/data/tokenizers/cdna_tokenizer/special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/src/beignet/data/tokenizers/cdna_tokenizer/tokenizer_config.json b/src/beignet/data/tokenizers/cdna_tokenizer/tokenizer_config.json new file mode 100644 index 0000000000..c91b690aa9 --- /dev/null +++ b/src/beignet/data/tokenizers/cdna_tokenizer/tokenizer_config.json @@ -0,0 +1,6 @@ +{ + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "model_max_length": 2048, + "tokenizer_class": "SequenceTokenizer" +} diff --git a/src/beignet/data/tokenizers/cdna_tokenizer/vocab.txt b/src/beignet/data/tokenizers/cdna_tokenizer/vocab.txt new file mode 100644 index 0000000000..fa768ad754 --- /dev/null +++ b/src/beignet/data/tokenizers/cdna_tokenizer/vocab.txt @@ -0,0 +1,15 @@ + + + + + + + + +A +C +G +T +N +U +. diff --git a/src/beignet/data/tokenizers/hyena_tokenizer/__init__.py b/src/beignet/data/tokenizers/hyena_tokenizer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/beignet/data/tokenizers/hyena_tokenizer/special_tokens_map.json b/src/beignet/data/tokenizers/hyena_tokenizer/special_tokens_map.json new file mode 100644 index 0000000000..79711389cb --- /dev/null +++ b/src/beignet/data/tokenizers/hyena_tokenizer/special_tokens_map.json @@ -0,0 +1,9 @@ +{ + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "", + "sep_token": "" + } diff --git a/src/beignet/data/tokenizers/hyena_tokenizer/tokenizer_config.json b/src/beignet/data/tokenizers/hyena_tokenizer/tokenizer_config.json new file mode 100644 index 0000000000..19cd3f1777 --- /dev/null +++ b/src/beignet/data/tokenizers/hyena_tokenizer/tokenizer_config.json @@ -0,0 +1,7 @@ +{ + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "model_max_length": 1000002, + "tokenizer_class": "HyenaTokenizer", + "padding_side": "left" + } diff --git a/src/beignet/data/tokenizers/hyena_tokenizer/vocab.txt b/src/beignet/data/tokenizers/hyena_tokenizer/vocab.txt new file mode 100644 index 0000000000..7fea6dcffb --- /dev/null +++ b/src/beignet/data/tokenizers/hyena_tokenizer/vocab.txt @@ -0,0 +1,14 @@ + + + + + + + + +A +C +G +T +N +U diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer/__init__.py b/src/beignet/data/tokenizers/pmlm_tokenizer/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer/special_tokens_map.json b/src/beignet/data/tokenizers/pmlm_tokenizer/special_tokens_map.json new file mode 100644 index 0000000000..9a725bd8b1 --- /dev/null +++ b/src/beignet/data/tokenizers/pmlm_tokenizer/special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/src/beignet/data/tokenizers/protein_tokenizer/tokenizer_config.json b/src/beignet/data/tokenizers/pmlm_tokenizer/tokenizer_config.json similarity index 71% rename from src/beignet/data/tokenizers/protein_tokenizer/tokenizer_config.json rename to src/beignet/data/tokenizers/pmlm_tokenizer/tokenizer_config.json index cb46cfb231..d8f02e33ed 100644 --- a/src/beignet/data/tokenizers/protein_tokenizer/tokenizer_config.json +++ b/src/beignet/data/tokenizers/pmlm_tokenizer/tokenizer_config.json @@ -2,5 +2,5 @@ "clean_up_tokenization_spaces": true, "do_lower_case": false, "model_max_length": 1024, - "tokenizer_class": "ProteinTokenizer" + "tokenizer_class": "PmlmTokenizer" } diff --git a/src/beignet/data/tokenizers/protein_tokenizer/vocab.txt b/src/beignet/data/tokenizers/pmlm_tokenizer/vocab.txt similarity index 100% rename from src/beignet/data/tokenizers/protein_tokenizer/vocab.txt rename to src/beignet/data/tokenizers/pmlm_tokenizer/vocab.txt diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer_32/__init__.py b/src/beignet/data/tokenizers/pmlm_tokenizer_32/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer_32/special_tokens_map.json b/src/beignet/data/tokenizers/pmlm_tokenizer_32/special_tokens_map.json new file mode 100644 index 0000000000..ba0f9b53db --- /dev/null +++ b/src/beignet/data/tokenizers/pmlm_tokenizer_32/special_tokens_map.json @@ -0,0 +1,7 @@ +{ + "cls_token": "", + "eos_token": "", + "mask_token": "", + "pad_token": "", + "unk_token": "" +} diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer_32/tokenizer_config.json b/src/beignet/data/tokenizers/pmlm_tokenizer_32/tokenizer_config.json new file mode 100644 index 0000000000..57cef61615 --- /dev/null +++ b/src/beignet/data/tokenizers/pmlm_tokenizer_32/tokenizer_config.json @@ -0,0 +1,6 @@ +{ + "clean_up_tokenization_spaces": true, + "do_lower_case": false, + "model_max_length": 1024, + "tokenizer_class": "PmlmTokenizer" +} diff --git a/src/beignet/data/tokenizers/pmlm_tokenizer_32/vocab.txt b/src/beignet/data/tokenizers/pmlm_tokenizer_32/vocab.txt new file mode 100644 index 0000000000..d44b204106 --- /dev/null +++ b/src/beignet/data/tokenizers/pmlm_tokenizer_32/vocab.txt @@ -0,0 +1,32 @@ + + + + +L +A +G +V +S +E +R +T +I +D +P +K +Q +N +F +Y +M +H +W +C +B +U +Z +O +. +- + + diff --git a/src/beignet/tokenizers/_protein_tokenizer.py b/src/beignet/tokenizers/_protein_tokenizer.py index 90f0a5107c..6f81ea5f84 100644 --- a/src/beignet/tokenizers/_protein_tokenizer.py +++ b/src/beignet/tokenizers/_protein_tokenizer.py @@ -8,7 +8,9 @@ logger = transformers.utils.logging.get_logger(__name__) -TOKENIZERS_DIRECTORY = importlib.resources.files("beignet") / "data" / "tokenizers" +TOKENIZERS_DIRECTORY = ( + importlib.resources.files("beignet") / "data" / "tokenizers" / "protein_tokenizer" +) VOCAB_PATH = TOKENIZERS_DIRECTORY / "vocab.txt"