diff --git a/CHANGE.txt b/CHANGE.txt index 386107db..1af2a717 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,8 +1,10 @@ v0.0.3: 1. update formula ast: supporting more symbols and functions defined in katex - 2. add item to vector tools, including word2vec and doc2vec using gensim + 2. add tokens to vector tools, including word2vec and doc2vec using gensim 3. sci4sif support tokenization grouped by segments 4. add special tokens: \SIFTag and \SIFSep + 5. add item to vector tools + 6. add interface for getting pretrained models, where the supported model names can be accessed by `edunlp i2v` in the command console v0.0.2: 1. fix potential ModuleNotFoundError diff --git a/EduNLP/I2V/__init__.py b/EduNLP/I2V/__init__.py new file mode 100644 index 00000000..7735252d --- /dev/null +++ b/EduNLP/I2V/__init__.py @@ -0,0 +1,5 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +from .i2v import I2V, get_pretrained_i2v +from .i2v import D2V diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py new file mode 100644 index 00000000..63295565 --- /dev/null +++ b/EduNLP/I2V/i2v.py @@ -0,0 +1,109 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +import json +from EduNLP.constant import MODEL_DIR +from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model +from ..Tokenizer import Tokenizer, get_tokenizer +from EduNLP import logger + +__all__ = ["I2V", "D2V", "get_pretrained_i2v"] + + +class I2V(object): + def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs): + """ + + Parameters + ---------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + kwargs: + the parameters passed to t2v + """ + self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {}) + if pretrained_t2v: + logger.info("Use pretrained t2v model %s" % t2v) + self.t2v = get_t2v_pretrained_model(t2v, kwargs.get("model_dir", MODEL_DIR)) + else: + self.t2v = T2V(t2v, *args, **kwargs) + self.params = { + "tokenizer": tokenizer, + "tokenizer_kwargs": tokenizer_kwargs, + "t2v": t2v, + "args": args, + "kwargs": kwargs, + "pretrained_t2v": pretrained_t2v + } + + def __call__(self, items, *args, **kwargs): + return self.infer_vector(items, *args, **kwargs) + + def tokenize(self, items, indexing=True, padding=False, *args, **kwargs) -> list: + return self.tokenizer(items, *args, **kwargs) + + def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple: + raise NotImplementedError + + def infer_item_vector(self, tokens, *args, **kwargs) -> ...: + return self.infer_vector(tokens, *args, **kwargs)[0] + + def infer_token_vector(self, tokens, *args, **kwargs) -> ...: + return self.infer_vector(tokens, *args, **kwargs)[1] + + def save(self, config_path, *args, **kwargs): + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.params, wf, ensure_ascii=False, indent=2) + + @classmethod + def load(cls, config_path, *args, **kwargs): + with open(config_path, encoding="utf-8") as f: + params: dict = json.load(f) + tokenizer = params.pop("tokenizer") + t2v = params.pop("t2v") + args = params.pop("args") + kwargs = params.pop("kwargs") + params.update(kwargs) + return cls(tokenizer, t2v, *args, **params) + + @classmethod + def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + raise NotImplementedError + + @property + def vector_size(self): + return self.t2v.vector_size + + +class D2V(I2V): + def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple: + tokens = self.tokenize(items, return_token=True) if tokenize is True else items + return self.t2v(tokens, *args, **kwargs), None + + @classmethod + def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + return cls("text", name, pretrained_t2v=True, model_dir=model_dir) + + +MODELS = { + "d2v_all_256": [D2V, "d2v_all_256"], + "d2v_sci_256": [D2V, "d2v_sci_256"], + "d2v_eng_256": [D2V, "d2v_eng_256"], + "d2v_lit_256": [D2V, "d2v_lit_256"], +} + + +def get_pretrained_i2v(name, model_dir=MODEL_DIR): + if name not in MODELS: + raise KeyError( + "Unknown model name %s, use one of the provided models: %s" % (name, ", ".join(MODELS.keys())) + ) + _class, *params = MODELS[name] + return _class.from_pretrained(*params, model_dir=model_dir) diff --git a/EduNLP/ModelZoo/__init__.py b/EduNLP/ModelZoo/__init__.py new file mode 100644 index 00000000..5dbf8ed1 --- /dev/null +++ b/EduNLP/ModelZoo/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +from .utils import * diff --git a/EduNLP/ModelZoo/rnn/__init__.py b/EduNLP/ModelZoo/rnn/__init__.py new file mode 100644 index 00000000..a524181d --- /dev/null +++ b/EduNLP/ModelZoo/rnn/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +from .rnn import LM diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py new file mode 100644 index 00000000..5b00aec8 --- /dev/null +++ b/EduNLP/ModelZoo/rnn/rnn.py @@ -0,0 +1,74 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +import torch +from torch import nn +from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence + + +class LM(nn.Module): + """ + Examples + -------- + >>> import torch + >>> seq_idx = torch.LongTensor([[1, 2, 3], [1, 2, 0], [3, 0, 0]]) + >>> seq_len = torch.LongTensor([3, 2, 1]) + >>> lm = LM("RNN", 4, 3, 2) + >>> output, hn = lm(seq_idx, seq_len) + >>> output.shape + torch.Size([3, 3, 2]) + >>> hn.shape + torch.Size([1, 3, 2]) + >>> lm = LM("RNN", 4, 3, 2, num_layers=2) + >>> output, hn = lm(seq_idx, seq_len) + >>> output.shape + torch.Size([3, 3, 2]) + >>> hn.shape + torch.Size([2, 3, 2]) + """ + + def __init__(self, rnn_type: str, vocab_size: int, embedding_dim: int, hidden_size: int, num_layers=1, + bidirectional=False, embedding=None, **kwargs): + super(LM, self).__init__() + rnn_type = rnn_type.upper() + self.embedding = torch.nn.Embedding(vocab_size, embedding_dim) if embedding is None else embedding + self.c = False + if rnn_type == "RNN": + self.rnn = torch.nn.RNN( + embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs + ) + elif rnn_type == "LSTM": + self.rnn = torch.nn.LSTM( + embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs + ) + self.c = True + elif rnn_type == "GRU": + self.rnn = torch.nn.GRU( + embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs + ) + elif rnn_type == "ELMO": + bidirectional = True + self.rnn = torch.nn.LSTM( + embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs + ) + self.c = True + else: + raise TypeError("Unknown rnn_type %s" % rnn_type) + + self.num_layers = num_layers + self.bidirectional = bidirectional + if bidirectional is True: + self.num_layers *= 2 + self.hidden_size = hidden_size + + def forward(self, seq_idx, seq_len): + seq = self.embedding(seq_idx) + pack = pack_padded_sequence(seq, seq_len, batch_first=True) + h0 = torch.randn(self.num_layers, seq.shape[0], self.hidden_size) + if self.c is True: + c0 = torch.randn(self.num_layers, seq.shape[0], self.hidden_size) + output, (hn, _) = self.rnn(pack, (h0, c0)) + else: + output, hn = self.rnn(pack, h0) + output, _ = pad_packed_sequence(output, batch_first=True) + return output, hn diff --git a/EduNLP/ModelZoo/utils/__init__.py b/EduNLP/ModelZoo/utils/__init__.py new file mode 100644 index 00000000..a41af25f --- /dev/null +++ b/EduNLP/ModelZoo/utils/__init__.py @@ -0,0 +1,5 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +from .padder import PadSequence, pad_sequence +from .device import set_device diff --git a/EduNLP/ModelZoo/utils/device.py b/EduNLP/ModelZoo/utils/device.py new file mode 100644 index 00000000..23b2646e --- /dev/null +++ b/EduNLP/ModelZoo/utils/device.py @@ -0,0 +1,44 @@ +# coding: utf-8 +# 2021/8/2 @ tongshiwei +import logging +import torch +from torch.nn import DataParallel + + +def set_device(_net, ctx, *args, **kwargs): # pragma: no cover + """code from longling v1.3.26""" + if ctx == "cpu": + if not isinstance(_net, DataParallel): + _net = DataParallel(_net) + return _net.cpu() + elif any(map(lambda x: x in ctx, ["cuda", "gpu"])): + if not torch.cuda.is_available(): + try: + torch.ones((1,), device=torch.device("cuda: 0")) + except AssertionError as e: + raise TypeError("no cuda detected, noly cpu is supported, the detailed error msg:%s" % str(e)) + if torch.cuda.device_count() >= 1: + if ":" in ctx: + ctx_name, device_ids = ctx.split(":") + assert ctx_name in ["cuda", "gpu"], "the equipment should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx + device_ids = [int(i) for i in device_ids.strip().split(",")] + try: + if not isinstance(_net, DataParallel): + return DataParallel(_net, device_ids).cuda + return _net.cuda(device_ids) + except AssertionError as e: + logging.error(device_ids) + raise e + elif ctx in ["cuda", "gpu"]: + if not isinstance(_net, DataParallel): + _net = DataParallel(_net) + return _net.cuda() + else: + raise TypeError("the equipment should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx) + else: + logging.error(torch.cuda.device_count()) + raise TypeError("0 gpu can be used, use cpu") + else: + if not isinstance(_net, DataParallel): + return DataParallel(_net, device_ids=ctx).cuda() + return _net.cuda(ctx) diff --git a/EduNLP/ModelZoo/utils/padder.py b/EduNLP/ModelZoo/utils/padder.py new file mode 100644 index 00000000..ed86cfef --- /dev/null +++ b/EduNLP/ModelZoo/utils/padder.py @@ -0,0 +1,76 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +__all__ = ["PadSequence", "pad_sequence"] + + +class PadSequence(object): + """Pad the sequence. + + Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set, + sequence that has length larger than `length` will be clipped. + + Parameters + ---------- + length : int + The maximum length to pad/clip the sequence + pad_val : number + The pad value. Default 0 + clip : bool + """ + + def __init__(self, length, pad_val=0, clip=True): + self._length = length + self._pad_val = pad_val + self._clip = clip + + def __call__(self, sample: list): + """ + + Parameters + ---------- + sample : list of number + + Returns + ------- + ret : list of number + """ + sample_length = len(sample) + if sample_length >= self._length: + if self._clip and sample_length > self._length: + return sample[:self._length] + else: + return sample + else: + return sample + [ + self._pad_val for _ in range(self._length - sample_length) + ] + + +def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True): + """ + + Parameters + ---------- + sequence + max_length + pad_val + clip + + Returns + ------- + + Examples + -------- + >>> seq = [[4, 3, 3], [2], [3, 3, 2]] + >>> pad_sequence(seq) + [[4, 3, 3], [2, 0, 0], [3, 3, 2]] + >>> pad_sequence(seq, pad_val=1) + [[4, 3, 3], [2, 1, 1], [3, 3, 2]] + >>> pad_sequence(seq, max_length=2) + [[4, 3], [2, 0], [3, 3]] + >>> pad_sequence(seq, max_length=2, clip=False) + [[4, 3, 3], [2, 0], [3, 3, 2]] + """ + padder = PadSequence(max([len(seq) for seq in sequence]) if max_length is None else max_length, pad_val, clip) + return [padder(seq) for seq in sequence] diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py index 2ebcfc9e..fde1bb43 100644 --- a/EduNLP/Pretrain/gensim_vec.py +++ b/EduNLP/Pretrain/gensim_vec.py @@ -7,6 +7,7 @@ from gensim.models.doc2vec import TaggedDocument from gensim.models.callbacks import CallbackAny2Vec from EduNLP.SIF.sif import sif4sci +from EduNLP.Vector import D2V, BowLoader from copy import deepcopy import itertools as it @@ -14,7 +15,7 @@ class GensimWordTokenizer(object): - def __init__(self, symbol="gm"): + def __init__(self, symbol="gm", general=False): """ Parameters @@ -22,15 +23,44 @@ def __init__(self, symbol="gm"): symbol: gm fgm + gmas + fgmas + general: + True when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + False when use 'ast' mothed to tokenize formulas instead of 'linear'. + + Returns + ---------- + + Examples + ---------- + >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]'] + >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False) + >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\ + ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$") + >>> print(token_item.tokens[:10]) + ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]'] """ self.symbol = symbol - self.tokenization_params = { - "formula_params": { - "method": "ast", - "return_type": "list", - "ord2token": True + if general is True: + self.tokenization_params = { + "formula_params": { + "method": "linear", + "symbolize_figure_formula": True + } + } + else: + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } } - } def batch_process(self, *items): pass @@ -96,7 +126,7 @@ def on_epoch_end(self, model): self.epoch += 1 -def train_vector(items, w2v_prefix, embedding_dim, method="sg", binary=None, train_params=None): +def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None): monitor = MonitorCallback(["word", "I", "less"]) _train_params = dict( min_count=0, @@ -129,10 +159,22 @@ def train_vector(items, w2v_prefix, embedding_dim, method="sg", binary=None, tra docs, **_train_params ) binary = binary if binary is not None else True + elif method == "bow": + model = gensim.corpora.Dictionary(items) + binary = binary if binary is not None else True + elif method == "tfidf": + dictionary_path = train_vector(items, w2v_prefix, method="bow") + dictionary = BowLoader(dictionary_path) + corpus = [dictionary.infer_vector(item) for item in items] + model = gensim.models.TfidfModel(corpus) + binary = binary if binary is not None else True else: raise ValueError("Unknown method: %s" % method) - filepath = w2v_prefix + "%s_%s" % (method, embedding_dim) + filepath = w2v_prefix + method + if embedding_dim is not None: + filepath = filepath + "_" + str(embedding_dim) + if binary is True: filepath += ".bin" logger.info("model is saved to %s" % filepath) diff --git a/EduNLP/SIF/tokenization/tokenization.py b/EduNLP/SIF/tokenization/tokenization.py index 3c6d43e1..3bb5365b 100644 --- a/EduNLP/SIF/tokenization/tokenization.py +++ b/EduNLP/SIF/tokenization/tokenization.py @@ -33,6 +33,10 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N "s": [] } self.text_params = text_params if text_params is not None else {} + if formula_params is not None and "symbolize_figure_formula" in formula_params: + self.symbolize_figure_formula = formula_params.pop("symbolize_figure_formula") + else: + self.symbolize_figure_formula = False self.formula_params = formula_params if formula_params is not None else {"method": "linear"} self.formula_tokenize_method = self.formula_params.get("method") self.figure_params = figure_params if figure_params is not None else {} @@ -166,6 +170,9 @@ def append_formula(self, segment, symbol=False, init=True): if symbol is True: self._formula_tokens.append(len(self._tokens)) self._tokens.append(segment) + elif self.symbolize_figure_formula and isinstance(segment, FigureFormulaSegment): + self._formula_tokens.append(len(self._tokens)) + self._tokens.append(Symbol(FORMULA_SYMBOL)) elif isinstance(segment, FigureFormulaSegment): self._formula_tokens.append(len(self._tokens)) self._tokens.append(segment) diff --git a/EduNLP/Tokenizer/__init__.py b/EduNLP/Tokenizer/__init__.py new file mode 100644 index 00000000..25b605c9 --- /dev/null +++ b/EduNLP/Tokenizer/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +from .tokenizer import * diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py new file mode 100644 index 00000000..cf9084d0 --- /dev/null +++ b/EduNLP/Tokenizer/tokenizer.py @@ -0,0 +1,74 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +from typing import Iterable +from ..SIF.segment import seg +from ..SIF.tokenization import tokenize + +__all__ = ["TOKENIZER", "Tokenizer", "TextTokenizer", "get_tokenizer"] + + +class Tokenizer(object): + def __call__(self, *args, **kwargs): + raise NotImplementedError + + +class TextTokenizer(Tokenizer): + r""" + + Examples + -------- + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = TextTokenizer() + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + """ + + def __init__(self, *args, **kwargs): + self.tokenization_params = { + "formula_params": { + "method": "linear", + } + } + + def __call__(self, items: Iterable, *args, **kwargs): + for item in items: + yield tokenize(seg(item, symbol="gmas"), **self.tokenization_params).tokens + + +TOKENIZER = { + "text": TextTokenizer +} + + +def get_tokenizer(name, *args, **kwargs): + r""" + + Parameters + ---------- + name: str + args + kwargs + + Returns + ------- + tokenizer: Tokenizer + + Examples + -------- + >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"] + >>> tokenizer = get_tokenizer("text") + >>> tokens = tokenizer(items) + >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE + ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', + '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', + '\\quad', 'A', '\\cap', 'B', '='] + """ + if name not in TOKENIZER: + raise KeyError( + "Unknown tokenizer %s, use one of the provided tokenizers: %s" % (name, ", ".join(TOKENIZER.keys())) + ) + return TOKENIZER[name](*args, **kwargs) diff --git a/EduNLP/Vector/__init__.py b/EduNLP/Vector/__init__.py index 89c15fec..efccc5ad 100644 --- a/EduNLP/Vector/__init__.py +++ b/EduNLP/Vector/__init__.py @@ -1,4 +1,8 @@ # coding: utf-8 # 2021/5/29 @ tongshiwei -from .gensim_vec import W2V, D2V +from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader +from .const import * +from .rnn import RNNModel +from .t2v import T2V, get_pretrained_t2v +from .embedding import Embedding diff --git a/EduNLP/Vector/const.py b/EduNLP/Vector/const.py new file mode 100644 index 00000000..f798fa1c --- /dev/null +++ b/EduNLP/Vector/const.py @@ -0,0 +1,5 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +UNK = "[UNK]" +PAD = "[PAD]" diff --git a/EduNLP/Vector/embedding.py b/EduNLP/Vector/embedding.py new file mode 100644 index 00000000..e3012a18 --- /dev/null +++ b/EduNLP/Vector/embedding.py @@ -0,0 +1,75 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +from typing import List +import torch +from .gensim_vec import W2V +from .const import PAD +from EduNLP.ModelZoo import pad_sequence, set_device + + +class Embedding(object): + def __init__(self, w2v: (W2V, tuple, list, dict, None), freeze=True, device=None, **kwargs): + if w2v is None: + self.w2v = None + elif isinstance(w2v, (tuple, list)): + self.w2v = W2V(*w2v) + elif isinstance(w2v, dict): + self.w2v = W2V(**w2v) + elif isinstance(w2v, W2V): + self.w2v = w2v + else: + raise TypeError("w2v argument must be one of W2V, tuple, list, dict or None") + + if self.w2v is not None: + self.vocab_size = len(self.w2v) + self.embedding_dim = self.w2v.vector_size + else: + self.vocab_size = kwargs["vocab_size"] + self.embedding_dim = kwargs["embedding_dim"] + + self.embedding = torch.nn.Embedding(self.vocab_size, self.embedding_dim) + + self.pad_val = 0 + if self.w2v is not None: + self.embedding.from_pretrained(torch.Tensor(self.w2v.vectors), freeze) + self.pad_val = self.w2v.constants[PAD] + self.key_to_index = self.w2v.key_to_index if w2v is not None else lambda x: x + + if device is not None: + self.set_device(device) + + def __call__(self, items: List[List[str]], indexing=True, padding=True, vectorization=True, *args, + **kwargs) -> tuple: + + items, item_len = self.indexing(items, padding=padding, indexing=indexing) + items = self.infer_token_vector(items, indexing=False)[0] if vectorization else items + return items, item_len + + def infer_token_vector(self, items: List[List[str]], indexing=True) -> tuple: + items, item_len = self.indexing(items, padding=True, indexing=indexing) + item_embedding = self.embedding(torch.LongTensor(items)) + return item_embedding, item_len + + def indexing(self, items: List[List[str]], padding=False, indexing=True) -> tuple: + """ + + Parameters + ---------- + items: list of list of str(word/token) + padding: bool + whether padding the returned list with default pad_val to make all item in items have the same length + indexing: bool + + Returns + ------- + word_id: list of list of int + """ + items_idx = [[self.key_to_index(word) for word in item] for item in items] if indexing else items + item_len = [len(_idx) for _idx in items_idx] + padded_items_idx = pad_sequence(items_idx, pad_val=self.pad_val) if padding is True else items_idx + return padded_items_idx, item_len + + def set_device(self, device): + self.embedding = set_device(self.embedding, device) + return self diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index f1ff802a..13c5600a 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -1,12 +1,26 @@ # coding: utf-8 # 2021/5/29 @ tongshiwei +import numpy as np from pathlib import PurePath -from gensim.models import KeyedVectors, Word2Vec, FastText, Doc2Vec +from gensim.models import KeyedVectors, Word2Vec, FastText, Doc2Vec, TfidfModel +from gensim import corpora +import re +from .const import UNK, PAD +from .meta import Vector -class W2V(object): - def __init__(self, filepath, method, binary=None): +class W2V(Vector): + def __init__(self, filepath, method=None, binary=None): + """ + + Parameters + ---------- + filepath: + path to the pretrained model file + method + binary + """ fp = PurePath(filepath) self.binary = binary if binary is not None else (True if fp.suffix == ".bin" else False) if self.binary is True: @@ -17,17 +31,115 @@ def __init__(self, filepath, method, binary=None): else: self.wv = KeyedVectors.load(filepath, mmap="r") + self.method = method + self.constants = {UNK: 0, PAD: 1} + + def __len__(self): + return len(self.constants) + len(self.wv.key_to_index) + + def key_to_index(self, word): + if word in self.constants: + return self.constants[word] + else: + if word in self.wv.key_to_index: + return self.wv.key_to_index[word] + len(self.constants) + else: + return self.constants[UNK] + + @property + def vectors(self): + return np.concatenate([np.zeros((len(self.constants), self.vector_size)), self.wv.vectors], axis=0) + + @property + def vector_size(self): + return self.wv.vector_size + def __call__(self, *words): for word in words: - yield self.wv[word] + yield self[word] def __getitem__(self, item): - return self.wv[item] + return self.wv[item] if item not in self.constants else np.zeros((self.vector_size,)) + + def infer_vector(self, items, agg="mean", *args, **kwargs) -> np.ndarray: + tokens = self.infer_tokens(items, *args, **kwargs) + return eval("np.%s" % agg)(tokens, axis=1) + + def infer_tokens(self, items, *args, **kwargs) -> list: + return [list(self(*item)) for item in items] + + +class BowLoader(object): + def __init__(self, filepath): + self.dictionary = corpora.Dictionary.load(filepath) + + def infer_vector(self, item, return_vec=False): + item = self.dictionary.doc2bow(item) + if not return_vec: + return item # return dic as default + vec = [0 for i in range(len(self.dictionary.keys()))] + for i, v in item: + vec[i] = v + return vec + + @property + def vector_size(self): + return len(self.dictionary.keys()) -class D2V(object): +class TfidfLoader(object): def __init__(self, filepath): - self.d2v = Doc2Vec.load(filepath) + self.tfidf_model = TfidfModel.load(filepath) + # 'tfidf' model shold be used based on 'bow' model + dictionary_path = re.sub(r"(.*)tfidf", r"\1bow", filepath) + self.dictionary = corpora.Dictionary.load(dictionary_path) + + def infer_vector(self, item, return_vec=False): + dic_item = self.dictionary.doc2bow(item) + tfidf_item = self.tfidf_model[dic_item] + # return dic as default + if not return_vec: + return tfidf_item # pragma: no cover + vec = [0 for i in range(len(self.dictionary.keys()))] + for i, v in tfidf_item: + vec[i] = v + return vec + + @property + def vector_size(self): + return len(self.dictionary.token2id) + + +class D2V(Vector): + def __init__(self, filepath, method="d2v"): + self._method = method + self._filepath = filepath + if self._method == "d2v": + self.d2v = Doc2Vec.load(filepath) + elif self._method == "bow": + self.d2v = BowLoader(filepath) + elif self._method == "tfidf": + self.d2v = TfidfLoader(filepath) + else: + raise ValueError("Unknown method: %s" % method) def __call__(self, item): - return self.d2v.infer_vector(item) + if self._method == "d2v": + return self.d2v.infer_vector(item) + else: + return self.d2v.infer_vector(item, return_vec=True) + + @property + def vector_size(self): + if self._method == "d2v": + return self.d2v.vector_size + elif self._method == "bow": + return self.d2v.vector_size + elif self._method == "tfidf": + return self.d2v.vector_size + + def infer_vector(self, items, *args, **kwargs) -> list: + return [self(item) for item in items] + + def infer_tokens(self, item, *args, **kwargs) -> ...: + raise NotImplementedError diff --git a/EduNLP/Vector/meta.py b/EduNLP/Vector/meta.py new file mode 100644 index 00000000..87b8db8e --- /dev/null +++ b/EduNLP/Vector/meta.py @@ -0,0 +1,13 @@ +# coding: utf-8 +# 2021/7/13 @ tongshiwei + +class Vector(object): + def infer_vector(self, items, *args, **kwargs) -> ...: + pass + + def infer_tokens(self, items, *args, **kwargs) -> ...: + pass + + @property + def vector_size(self): + raise NotImplementedError diff --git a/EduNLP/Vector/rnn/__init__.py b/EduNLP/Vector/rnn/__init__.py new file mode 100644 index 00000000..54cde6aa --- /dev/null +++ b/EduNLP/Vector/rnn/__init__.py @@ -0,0 +1,4 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +from .rnn import RNNModel diff --git a/EduNLP/Vector/rnn/rnn.py b/EduNLP/Vector/rnn/rnn.py new file mode 100644 index 00000000..02318193 --- /dev/null +++ b/EduNLP/Vector/rnn/rnn.py @@ -0,0 +1,88 @@ +# coding: utf-8 +# 2021/7/12 @ tongshiwei + +import torch +from ..gensim_vec import W2V +from ..embedding import Embedding +from ..meta import Vector +from EduNLP.ModelZoo import rnn, set_device + + +class RNNModel(Vector): + """ + Examples + -------- + >>> model = RNNModel("ELMO", None, 2, vocab_size=4, embedding_dim=3) + >>> seq_idx = [[1, 2, 3], [1, 2, 0], [3, 0, 0]] + >>> output, hn = model(seq_idx, indexing=False, padding=False) + >>> seq_idx = [[1, 2, 3], [1, 2], [3]] + >>> output, hn = model(seq_idx, indexing=False, padding=True) + >>> output.shape + torch.Size([3, 3, 4]) + >>> hn.shape + torch.Size([2, 3, 2]) + >>> tokens = model.infer_tokens(seq_idx, indexing=False) + >>> tokens.shape + torch.Size([3, 3, 4]) + >>> tokens = model.infer_tokens(seq_idx, agg="mean", indexing=False) + >>> tokens.shape + torch.Size([3, 4]) + >>> item = model.infer_vector(seq_idx, indexing=False) + >>> item.shape + torch.Size([3, 4]) + >>> item = model.infer_vector(seq_idx, agg="mean", indexing=False) + >>> item.shape + torch.Size([3, 2]) + >>> item = model.infer_vector(seq_idx, agg=None, indexing=False) + >>> item.shape + torch.Size([2, 3, 2]) + """ + + def __init__(self, rnn_type, w2v: (W2V, tuple, list, dict, None), hidden_size, freeze_pretrained=True, device=None, + **kwargs): + self.embedding = Embedding(w2v, freeze_pretrained, **kwargs) + for key in ["vocab_size", "embedding_dim"]: + if key in kwargs: + kwargs.pop(key) + self.rnn = rnn.LM( + rnn_type, + self.embedding.vocab_size, + self.embedding.embedding_dim, + hidden_size=hidden_size, + embedding=self.embedding.embedding, + **kwargs + ) + self.bidirectional = self.rnn.rnn.bidirectional + self.hidden_size = self.rnn.hidden_size + self.freeze_pretrained = freeze_pretrained + if device is not None: + self.set_device(device) + + def __call__(self, items, indexing=True, padding=True, **kwargs): + seq_idx, seq_len = self.embedding(items, indexing=indexing, padding=padding, vectorization=False) + + tokens, item = self.rnn(torch.LongTensor(seq_idx), torch.LongTensor(seq_len)) + + return tokens, item + + def infer_vector(self, items, agg: (int, str, None) = -1, indexing=True, padding=True, *args, + **kwargs) -> torch.Tensor: + vector = self(items, indexing=indexing, padding=padding, **kwargs)[1] + if agg is not None: + if agg == -1: + return torch.reshape(vector, (vector.shape[1], -1)) + return eval("torch.%s" % agg)(vector, dim=0) + return vector + + def infer_tokens(self, items, agg=None, *args, **kwargs) -> torch.Tensor: + tokens = self(items, **kwargs)[0] + if agg is not None: + return eval("torch.%s" % agg)(tokens, dim=1) + return tokens + + @property + def vector_size(self) -> int: + return self.hidden_size * (1 if self.bidirectional is False else 2) + + def set_device(self, device): + self.rnn = set_device(self.rnn, device) diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py new file mode 100644 index 00000000..103fd3da --- /dev/null +++ b/EduNLP/Vector/t2v.py @@ -0,0 +1,57 @@ +# coding: utf-8 +# 2021/7/13 @ tongshiwei + +import os +from longling import path_append +from EduData import get_data +from .rnn import RNNModel +from .gensim_vec import W2V, D2V +from .meta import Vector +from EduNLP.constant import MODEL_DIR + +MODELS = { + "w2v": W2V, + "d2v": D2V, + "rnn": RNNModel, + "lstm": RNNModel, + "gru": RNNModel, + "elmo": RNNModel +} + + +class T2V(object): + def __init__(self, model: str, *args, **kwargs): + model = model.lower() + self.model_type = model + if model in {"rnn", "lstm", "gru", "elmo"}: + self.i2v: Vector = MODELS[model](model, *args, **kwargs) + else: + self.i2v: Vector = MODELS[model](*args, **kwargs) + + def __call__(self, items, *args, **kwargs): + return self.i2v.infer_vector(items, *args, **kwargs) + + @property + def vector_size(self) -> int: + return self.i2v.vector_size + + +PRETRAINED_MODELS = { + "d2v_all_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_all_256.zip", "d2v"], + "d2v_sci_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip", "d2v"], + "d2v_eng_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_english_256.zip", "d2v"], + "d2v_lit_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_literal_256.zip", "d2v"], +} + + +def get_pretrained_t2v(name, model_dir=MODEL_DIR): + if name not in PRETRAINED_MODELS: + raise KeyError( + "Unknown pretrained model %s, use one of the provided pretrained models: %s" % ( + name, ", ".join(PRETRAINED_MODELS.keys())) + ) + url, model_name, *args = PRETRAINED_MODELS[name] + model_path = get_data(url, model_dir) + if model_name in ["d2v"]: + model_path = path_append(model_path, os.path.basename(model_path) + ".bin", to_str=True) + return T2V(model_name, model_path, *args) diff --git a/EduNLP/__init__.py b/EduNLP/__init__.py index c5cdbbf7..604e2a3a 100644 --- a/EduNLP/__init__.py +++ b/EduNLP/__init__.py @@ -1 +1,2 @@ from .utils import logger +from .I2V import get_pretrained_i2v diff --git a/EduNLP/constant.py b/EduNLP/constant.py new file mode 100644 index 00000000..5ee2ddb5 --- /dev/null +++ b/EduNLP/constant.py @@ -0,0 +1,8 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +import os +from os.path import expanduser, join + +ROOT = os.environ.get("EDUNLPPATH", join(expanduser("~"), ".EduNLP")) +MODEL_DIR = os.environ.get("EDUNLPMODELPATH", join(ROOT, "model")) diff --git a/EduNLP/main.py b/EduNLP/main.py new file mode 100644 index 00000000..46e24ce7 --- /dev/null +++ b/EduNLP/main.py @@ -0,0 +1,14 @@ +# coding: utf-8 +# 2021/8/2 @ tongshiwei + +import fire + +from EduNLP.I2V.i2v import MODELS + + +def list_i2v(): + print("\n".join(MODELS.keys())) + + +def cli(): # pragma: no cover + fire.Fire({"i2v": list_i2v}) diff --git a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb new file mode 100644 index 00000000..66a77fa6 --- /dev/null +++ b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 1. load and tokenize test_items" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "from platform import processor\r\n", + "from gensim import corpora,models\r\n", + "# from collections import defaultdict\r\n", + "import json\r\n", + "from tqdm import tqdm\r\n", + "from EduNLP.Pretrain import GensimWordTokenizer,train_vector\r\n", + "from EduNLP.Vector import D2V\r\n", + "from EduNLP.SIF.segment import seg\r\n", + "from EduNLP.SIF.tokenization import tokenize\r\n", + "import time\r\n", + "\r\n", + "output_file_head = \"test\" # subject = english | liberal | science |all\r\n", + "baseDir = \"E:/Workustc/lunadata/d2v\"\r\n", + "# baseDir = \"/home/qlh/data_pretrain\"\r\n", + "work_file_path = baseDir + \"/data/\" + output_file_head + \"_raw.json\"\r\n", + "\r\n", + "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n", + " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\r\n", + " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\r\n", + " {\"ques_content\": \"The EPS user interface management system\"},\r\n", + " {\"ques_content\": \"System and human system engineering testing of EPS\"},\r\n", + " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\r\n", + " {\"ques_content\": \"The generation of random binary unordered trees\"},\r\n", + " {\"ques_content\": \"The intersection graph of paths in trees\"},\r\n", + " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\r\n", + " {\"ques_content\": \"Graph minors A survey\"}\r\n", + " ]\r\n", + "\r\n", + "def load_items():\r\n", + " for line in test_items:\r\n", + " yield line\r\n", + " # with open(work_file_path, 'r', encoding=\"utf-8\") as f:\r\n", + " # for line in f:\r\n", + " # yield json.loads(line)\r\n", + "\r\n", + "def data2Token():\r\n", + " # 线性分词,而不使用ast\r\n", + " tokenization_params = {\r\n", + " \"formula_params\": {\r\n", + " \"method\": \"linear\",\r\n", + " }\r\n", + " }\r\n", + " \r\n", + " token_items = []\r\n", + " count = 1\r\n", + " for item in tqdm(load_items(), \"sifing\"):\r\n", + " count = count + 1\r\n", + " # -------------------------------------------- # \r\n", + " # \"\"\"除文本、公式外,其他转化为特殊标记\"\"\"\r\n", + " # seg_ret = seg(item[\"ques_content\"], symbol=\"gmas\")\r\n", + " # token_item = tokenize(seg_ret, **tokenization_params)\r\n", + " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\r\n", + " token_item = tokenizer(item[\"ques_content\"])\r\n", + "\r\n", + " # -------------------------------------------- # \r\n", + " if token_item:\r\n", + " # print(\"[i] = \", count)\r\n", + " # print(\"[tokens] = \", token_item)\r\n", + " token_items.append(token_item.tokens)\r\n", + " print(\"[data2Token] finish ========================> num = \",len(token_items))\r\n", + " return token_items\r\n", + "\r\n", + "token_items = data2Token()\r\n", + "token_items[0]" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n", + "sifing: 10it [00:00, 18.57it/s]" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[data2Token] finish ========================> num = 10\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['公式',\n", + " '[FORMULA]',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "len(token_items[0])" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "19" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# 2. train and test model by 'bow'" + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 3, + "source": [ + "from EduNLP.Pretrain import train_vector\r\n", + "#10 dimension with fasstext method\r\n", + "train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"bow\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'../../../data/d2v/gensim_luna_stem_tf_bow.bin'" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "from EduNLP.Vector import D2V\r\n", + "\r\n", + "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_bow.bin\", method = \"bow\")\r\n", + "print(d2v(token_items[1]))" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" + ] + } + ], + "metadata": { + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# 3. train and test model by 'tfidf'" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 5, + "source": [ + "from EduNLP.Pretrain import train_vector\r\n", + "#10 dimension with fasstext method\r\n", + "train_vector(token_items, \"../../../data/d2v/gensim_luna_stem_tf_\", method=\"tfidf\")" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin\n", + "EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'../../../data/d2v/gensim_luna_stem_tf_tfidf.bin'" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 6, + "source": [ + "from EduNLP.Vector import D2V\r\n", + "\r\n", + "d2v = D2V(\"../../../data/d2v/gensim_luna_stem_tf_tfidf.bin\", method = \"tfidf\")\r\n", + "vec_size = d2v.vector_size\r\n", + "print(\"vec_size = \", vec_size)\r\n", + "d2v(token_items[1])" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "vec_size = 63\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[(15, 0.37858374396389033),\n", + " (16, 0.37858374396389033),\n", + " (17, 0.37858374396389033),\n", + " (18, 0.2646186811599866),\n", + " (19, 0.37858374396389033),\n", + " (20, 0.2646186811599866),\n", + " (21, 0.37858374396389033),\n", + " (22, 0.37858374396389033)]" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ], + "metadata": {} + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/examples/pretrain/gensim/d2v_general.ipynb b/examples/pretrain/gensim/d2v_general.ipynb new file mode 100644 index 00000000..67ac5a8e --- /dev/null +++ b/examples/pretrain/gensim/d2v_general.ipynb @@ -0,0 +1,332 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 1. Get token example from item\n", + "> Notes: use geneal('linear') tokenizition method, which means do not parse formulas" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 1, + "source": [ + "# coding: utf-8\r\n", + "import json\r\n", + "from tqdm import tqdm\r\n", + "from EduNLP.SIF.segment import seg\r\n", + "from EduNLP.SIF.tokenization import tokenize\r\n", + "from EduNLP.Pretrain import GensimWordTokenizer\r\n", + "\r\n", + "def load_items():\r\n", + " test_items = [\r\n", + " {'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n", + " {'ques_content':'如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n", + " {'ques_content':'
Below is a discussion on a website.
is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", + " warnings.warn(msg)\n", + "sifing: 3it [00:00, 5.07it/s]\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['公式',\n", + " '[FORMULA]',\n", + " '公式',\n", + " '[FORMULA]',\n", + " '如图',\n", + " '[FIGURE]',\n", + " 'x',\n", + " ',',\n", + " 'y',\n", + " '约束条件',\n", + " '[SEP]',\n", + " 'z',\n", + " '=',\n", + " 'x',\n", + " '+',\n", + " '7',\n", + " 'y',\n", + " '最大值',\n", + " '[MARK]']" + ] + }, + "metadata": {}, + "execution_count": 1 + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "len(token_items)" + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "3" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# 2. Load Model and test item" + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 7, + "source": [ + "\r\n", + "from urllib.request import urlopen\r\n", + "import os,os.path\r\n", + "import zipfile\r\n", + "\r\n", + "\r\n", + "def down_file(subject):\r\n", + " url = \"http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_\" + subject + \"_256.zip\"\r\n", + " file_name = \"../../../data/d2v/zip/\" + url.split('/')[-1]\r\n", + " u = urlopen(url)\r\n", + "\r\n", + " f = open(file_name, 'wb') \r\n", + " file_info = u.getheaders()\r\n", + " print(\"[down file] file info : \", file_info)\r\n", + " file_size_dl = 0\r\n", + " block_sz = 8192\r\n", + " while True: \r\n", + " buffer = u.read(block_sz) \r\n", + " if not buffer: \r\n", + " break\r\n", + " file_size_dl += len(buffer) \r\n", + " f.write(buffer) \r\n", + " f.close()\r\n", + " print(\"[down file] finish !\")\r\n", + "\r\n", + "\r\n", + "def unzip_file(subject):\r\n", + " zipfilename = \"../../../data/d2v/zip/general_\" + subject + \"_256.zip\"\r\n", + " unziptodir = \"../../../data/d2v/models/\"\r\n", + " print(\"[unzip file] start ...\")\r\n", + " if not os.path.exists(unziptodir):\r\n", + " os.mkdir(unziptodir)\r\n", + " zfobj = zipfile.ZipFile(zipfilename)\r\n", + " for name in zfobj.namelist():\r\n", + " name = name.replace('\\\\','/')\r\n", + " if name.endswith('/'):\r\n", + " continue\r\n", + " ext_filename = os.path.join(unziptodir, name)\r\n", + " ext_filename = ext_filename.replace('\\\\','/')\r\n", + " print(\"save ======> \",ext_filename)\r\n", + " ext_path= os.path.dirname(ext_filename)\r\n", + " if not os.path.exists(ext_path) :\r\n", + " os.mkdir(ext_path)\r\n", + " outfile = open(ext_filename, 'wb')\r\n", + " outfile.write(zfobj.read(name))\r\n", + " outfile.close()\r\n", + " print(\"[unzip file] finish !\")\r\n", + "\r\n", + "def getData(subject = \"english\"):\r\n", + " \"\"\" subject = english | liberal | science |all \"\"\"\r\n", + " down_file(subject)\r\n", + " unzip_file(subject)\r\n", + "\r\n", + "\r\n", + "work_subject = \"science\"\r\n", + "getData(work_subject)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[down file] file info : [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '\"60e6fc8a-795386bb\"'), ('Accept-Ranges', 'bytes')]\n", + "[down file] finish !\n", + "[unzip file] start ...\n", + "save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin\n", + "save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.dv.vectors.npy\n", + "save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.syn1neg.npy\n", + "save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.wv.vectors.npy\n", + "[unzip file] finish !\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 4, + "source": [ + "print(token_items[0])" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 2, + "source": [ + "from EduNLP.Vector import D2V\r\n", + "work_subject = \"science\"\r\n", + "d2v = D2V(\"../../../data/d2v/models/general_\" + work_subject +\"_256/general_\" + work_subject + \"_256.bin\")\r\n", + "print(d2v.vector_size)\r\n", + "d2v(token_items[0])" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "256\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([ 6.68359101e-02, -6.85622962e-03, 1.71755534e-03, -9.45999995e-02,\n", + " 5.71297631e-02, -1.14749409e-01, -1.06426410e-01, -5.48244826e-02,\n", + " -1.01055816e-01, 6.82074800e-02, -3.01527120e-02, 1.88328531e-02,\n", + " -5.40650599e-02, 1.96987823e-01, 7.23450258e-02, -7.86591992e-02,\n", + " 2.52593309e-02, -8.93113762e-02, 5.15675824e-03, 1.25454620e-01,\n", + " 1.75611585e-01, 7.01171979e-02, -4.82840873e-02, 5.61073385e-02,\n", + " 4.38053571e-02, 8.21266770e-02, 2.25354582e-02, 2.86612101e-02,\n", + " 6.49044961e-02, 4.38563228e-02, -5.53747378e-02, 3.68891433e-02,\n", + " 4.41701710e-02, -1.57279179e-01, -1.71185300e-01, -9.53545198e-02,\n", + " -3.68149281e-02, 1.03217609e-01, -4.01013494e-02, 1.34829208e-02,\n", + " -3.90383117e-02, 4.31797989e-02, -1.31486431e-01, -6.81887381e-03,\n", + " -3.09619904e-02, 1.09645449e-01, 9.19818357e-02, 1.05142176e-01,\n", + " -8.25446919e-02, -1.10780641e-01, -7.99699128e-02, 4.87378612e-03,\n", + " 5.09812087e-02, -1.88464615e-02, 4.43719625e-02, -2.79577565e-03,\n", + " 5.48942536e-02, 7.99279436e-02, -1.14065006e-01, -6.10431209e-02,\n", + " 2.25610659e-02, -3.98695990e-02, -6.11394234e-02, -5.44755235e-02,\n", + " 7.43018761e-02, -4.14421707e-02, -1.59866199e-01, -6.57487512e-02,\n", + " -1.21370479e-01, 5.41980937e-02, 5.50763076e-03, 5.59395552e-03,\n", + " 1.20198451e-01, 6.72993287e-02, -8.41371343e-02, -3.98931094e-02,\n", + " -5.98041154e-02, -6.74210638e-02, -8.08542073e-02, 4.32682643e-03,\n", + " 3.98905091e-02, -5.25522307e-02, -8.63379464e-02, 5.52122667e-02,\n", + " -1.91897918e-02, 6.72513470e-02, 1.63677037e-02, -4.64263670e-02,\n", + " 4.26646275e-03, 2.88309101e-02, -3.19259726e-02, 6.76017851e-02,\n", + " 1.18875027e-01, 4.90449667e-02, -7.11180866e-02, 2.42015105e-02,\n", + " 6.09337091e-02, 9.15575251e-02, 7.54630268e-02, -5.30363468e-04,\n", + " 5.33818686e-03, 2.14987155e-02, 1.37690797e-01, -8.63378346e-02,\n", + " 5.83221130e-02, -3.59287485e-02, 7.56779611e-02, 2.51492225e-02,\n", + " 1.17275678e-02, 9.37244594e-02, 3.03551462e-02, -1.35064060e-02,\n", + " 6.28025606e-02, -1.67514980e-01, -1.24259945e-02, -1.95242167e-02,\n", + " 6.93811625e-02, 7.72726461e-02, 7.74716437e-02, -1.47965118e-01,\n", + " -4.22228361e-04, 1.83783751e-02, -1.19136199e-01, -3.13477665e-02,\n", + " 6.60038590e-02, 2.46255528e-02, 2.11933651e-03, -9.49578434e-02,\n", + " -2.49075815e-02, 1.01346388e-01, -5.71207069e-02, -4.76290993e-02,\n", + " 2.79998290e-03, -8.29489976e-02, 4.29078564e-02, 4.00602221e-02,\n", + " 1.03404291e-01, 7.92418346e-02, -3.14001106e-02, 2.04087533e-02,\n", + " -9.57951397e-02, -7.60837719e-02, -1.74582575e-03, -4.40510325e-02,\n", + " 6.49931505e-02, -1.44915171e-02, 3.33687216e-02, -2.45348830e-02,\n", + " -4.90438566e-03, 8.16997364e-02, 1.56976636e-02, -2.20130035e-03,\n", + " -3.88220809e-02, 4.17613201e-02, 1.23736160e-02, 2.39638099e-03,\n", + " 7.04660639e-02, -8.40025023e-03, 8.84754434e-02, 4.73559313e-02,\n", + " 1.60846859e-02, 6.38007149e-02, -8.88152346e-02, -5.36189489e-02,\n", + " -3.58884176e-03, -7.97238126e-02, -2.48845778e-02, 6.67371228e-02,\n", + " -1.27798110e-01, 5.20749278e-02, -1.03058614e-01, -9.93425995e-02,\n", + " 6.30614609e-02, 6.55593872e-02, 2.47250423e-02, 1.01459853e-01,\n", + " 8.41867253e-02, 1.90107450e-02, -5.06304689e-02, 9.08671319e-03,\n", + " -1.11649349e-01, 4.15530279e-02, 3.82142738e-02, 7.48702586e-02,\n", + " 1.00878365e-01, 7.18154162e-02, 2.41982359e-02, 4.45286110e-02,\n", + " 2.29161587e-02, -6.85874224e-02, -6.66044280e-02, 5.26503660e-02,\n", + " 1.44319907e-02, 7.72640528e-03, 4.93934080e-02, -4.20203842e-02,\n", + " -1.19266249e-02, -3.40296179e-02, -5.05692326e-02, -1.01971209e-01,\n", + " 5.03124930e-02, 1.07444279e-01, 2.78240931e-03, -6.46820664e-02,\n", + " 2.53117532e-02, 1.04838371e-01, -5.48670478e-02, -8.49981084e-02,\n", + " -1.75488254e-04, -7.08199888e-02, -8.43240973e-03, 9.51339304e-02,\n", + " -1.88117087e-01, 1.78130921e-02, 2.86972504e-02, -5.94706945e-02,\n", + " 4.38547023e-02, 4.58841883e-02, -3.49672660e-02, -6.55051991e-02,\n", + " -7.90929198e-02, 3.29272039e-02, 2.99417619e-02, 1.12901134e-02,\n", + " -6.14368394e-02, -2.01964248e-02, 3.12223360e-02, 8.69451910e-02,\n", + " -1.85837403e-01, -1.25434086e-01, 1.11888051e-02, -1.12750731e-01,\n", + " 4.47746105e-02, -6.38351589e-02, 2.88816690e-02, -2.45125685e-02,\n", + " 3.97114865e-02, 8.87534320e-02, 1.15282401e-01, -6.65650517e-02,\n", + " -9.49165039e-03, 4.97242734e-02, 1.17295712e-01, -1.91902611e-02,\n", + " -3.20644900e-02, 1.36362026e-02, -3.73102799e-02, 8.89487471e-03,\n", + " -2.56872289e-02, -7.46497372e-03, -1.25288516e-02, -1.08435608e-01,\n", + " -3.12000625e-02, -1.22699983e-01, 4.24938798e-02, -1.87821351e-02],\n", + " dtype=float32)" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ], + "metadata": {} + } + ], + "metadata": { + "interpreter": { + "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/setup.py b/setup.py index c3c96585..4678892b 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( name='EduNLP', - version='0.0.2', + version='0.0.3', extras_require={ 'test': test_deps, 'tutor': tutor_deps, @@ -34,7 +34,12 @@ 'numpy>=1.17.0', 'jieba', 'js2py', + 'torch', + 'EduData>=0.0.16' ], # And any other dependencies foo needs entry_points={ + "console_scripts": [ + "edunlp = EduNLP.main:cli", + ], }, ) diff --git a/tests/test_i2v/__init__.py b/tests/test_i2v/__init__.py new file mode 100644 index 00000000..1efdd808 --- /dev/null +++ b/tests/test_i2v/__init__.py @@ -0,0 +1,2 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei diff --git a/tests/test_i2v/test_pretrained.py b/tests/test_i2v/test_pretrained.py new file mode 100644 index 00000000..a07f469a --- /dev/null +++ b/tests/test_i2v/test_pretrained.py @@ -0,0 +1,22 @@ +# coding: utf-8 +# 2021/8/2 @ tongshiwei +import pytest +from EduNLP import get_pretrained_i2v +from EduNLP.Vector.t2v import PRETRAINED_MODELS +from EduNLP.I2V.i2v import MODELS +from EduNLP.I2V import D2V + + +def test_pretrained_i2v(tmp_path): + PRETRAINED_MODELS["test"] = ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/test_256.zip", "d2v"] + MODELS["test"] = [D2V, "test"] + + d = tmp_path / "model" + d.mkdir() + + get_pretrained_i2v("test", d) + + with pytest.raises(KeyError): + get_pretrained_i2v("error") + + get_pretrained_i2v("test", d) diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 00000000..b0876466 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,8 @@ +# coding: utf-8 +# 2021/8/2 @ tongshiwei + +from EduNLP.main import list_i2v + + +def test_list_i2v(): + list_i2v() diff --git a/tests/test_tokenizer/__init__.py b/tests/test_tokenizer/__init__.py new file mode 100644 index 00000000..1efdd808 --- /dev/null +++ b/tests/test_tokenizer/__init__.py @@ -0,0 +1,2 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei diff --git a/tests/test_tokenizer/test_tokenizer.py b/tests/test_tokenizer/test_tokenizer.py new file mode 100644 index 00000000..9f7e03f8 --- /dev/null +++ b/tests/test_tokenizer/test_tokenizer.py @@ -0,0 +1,10 @@ +# coding: utf-8 +# 2021/8/1 @ tongshiwei + +import pytest +from EduNLP.Tokenizer import get_tokenizer + + +def test_tokenizer(): + with pytest.raises(KeyError): + get_tokenizer("error") diff --git a/tests/test_vec/test_t2v.py b/tests/test_vec/test_t2v.py new file mode 100644 index 00000000..2de7ab3a --- /dev/null +++ b/tests/test_vec/test_t2v.py @@ -0,0 +1,10 @@ +# coding: utf-8 +# 2021/8/2 @ tongshiwei + +import pytest +from EduNLP.Vector import get_pretrained_t2v + + +def test_t2v(): + with pytest.raises(KeyError): + get_pretrained_t2v("error") diff --git a/tests/test_vec/test_vec.py b/tests/test_vec/test_vec.py index e7ec908c..9bb906e6 100644 --- a/tests/test_vec/test_vec.py +++ b/tests/test_vec/test_vec.py @@ -1,15 +1,46 @@ # coding: utf-8 # 2021/5/30 @ tongshiwei +import numpy as np import pytest from EduNLP.Pretrain import train_vector, GensimWordTokenizer -from EduNLP.Vector import W2V, D2V +from EduNLP.Vector import W2V, D2V, RNNModel, T2V, Embedding +from EduNLP.I2V import D2V as I_D2V @pytest.fixture(scope="module") def stem_data(data): + _data = [] + for e in data[:10]: + d = e["stem"] + _data.append(d) + assert _data + return _data + + +@pytest.fixture(scope="module") +def stem_tokens(stem_data): _data = [] tokenizer = GensimWordTokenizer() + for e in stem_data: + d = tokenizer(e) + if d is not None: + _data.append(d.tokens) + assert _data + return _data + + +@pytest.fixture(scope="module") +def stem_data_general(data): + test_items = [ + {'stem': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ + 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}, + {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \ + 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'} + ] + data = test_items + data + _data = [] + tokenizer = GensimWordTokenizer(symbol="gmas", general=True) for e in data[:10]: d = tokenizer(e["stem"]) if d is not None: @@ -19,42 +50,127 @@ def stem_data(data): @pytest.mark.parametrize("method", ["sg", "cbow", "fasttext"]) -def test_w2v(stem_data, tmpdir, method): +@pytest.mark.parametrize("binary", [True, False, None]) +def test_w2v(stem_tokens, tmpdir, method, binary): filepath_prefix = str(tmpdir.mkdir(method).join("stem_tf_")) filepath = train_vector( - stem_data, + stem_tokens, filepath_prefix, 10, method=method, + binary=binary, train_params=dict(min_count=0) ) - w2v = W2V(filepath, method=method) - w2v(stem_data[0]) + w2v = W2V(filepath, method=method, binary=binary) + assert w2v.vector_size == 10 + w2v(*stem_tokens[0]) + assert len(w2v.infer_vector([stem_tokens[0]])[0]) == w2v.vector_size + w2v.key_to_index(stem_tokens[0][0]) + assert len(w2v) > 0 assert len(w2v["[FIGURE]"]) == 10 assert len(list(w2v("[FIGURE]"))) == 1 + assert np.array_equal(w2v["[UNK]"], np.zeros((10,))) + assert np.array_equal(w2v["[PAD]"], np.zeros((10,))) + assert w2v.vectors.shape == (len(w2v.wv.vectors) + len(w2v.constants), w2v.vector_size) + assert w2v.key_to_index("[UNK]") == 0 + assert w2v.key_to_index("OOV") == 0 + + t2v = T2V("w2v", filepath=filepath, method=method, binary=binary) + assert len(t2v(stem_tokens[:1])[0]) == t2v.vector_size + + for _w2v in [[filepath, method, binary], dict(filepath=filepath, method=method, binary=binary)]: + embedding = Embedding(_w2v, device="cpu") + items, item_len = embedding(stem_tokens[:5]) + assert items.shape == (5, max(item_len), embedding.embedding_dim) + + +def test_embedding(): + with pytest.raises(TypeError): + Embedding("error") + + +def test_rnn(stem_tokens, tmpdir): + method = "sg" + filepath_prefix = str(tmpdir.mkdir(method).join("stem_tf_")) + filepath = train_vector( + stem_tokens, + filepath_prefix, + 10, + method=method, + train_params=dict(min_count=0) + ) + w2v = W2V(filepath, method=method) + + with pytest.raises(TypeError): + RNNModel("Error", w2v, 20) + + for rnn_type in ["ElMo", "Rnn", "lstm", "GRU"]: + rnn = RNNModel(rnn_type, w2v, 20, device="cpu") + tokens = rnn.infer_tokens(stem_tokens[:1]) + item = rnn.infer_vector(stem_tokens[:1]) + assert tokens.shape == (1, len(stem_tokens[0]), 20 * (2 if rnn.bidirectional else 1)) + assert item.shape == (1, rnn.vector_size) -def test_d2v(stem_data, tmpdir): + t2v = T2V(rnn_type, w2v, 20) + assert len(t2v(stem_tokens[:1])[0]) == t2v.vector_size + + +def test_d2v(stem_tokens, tmpdir, stem_data): method = "d2v" filepath_prefix = str(tmpdir.mkdir(method).join("stem_tf_")) filepath = train_vector( - stem_data, + stem_tokens, filepath_prefix, 10, method=method, train_params=dict(min_count=0) ) d2v = D2V(filepath) - assert len(d2v(stem_data[0])) == 10 + assert len(d2v(stem_tokens[0])) == 10 + assert d2v.vector_size == 10 + + t2v = T2V("d2v", filepath) + assert len(t2v(stem_tokens[:1])[0]) == t2v.vector_size + i2v = I_D2V("text", "d2v", filepath) + i_vec, t_vec = i2v(stem_data[:1]) + assert len(i_vec[0]) == i2v.vector_size + assert t_vec is None -def test_exception(stem_data, tmpdir): + cfg_path = str(tmpdir / method / "i2v_config.json") + i2v.save(config_path=cfg_path) + i2v = I_D2V.load(cfg_path) + + i_vec = i2v.infer_item_vector(stem_data[:1]) + assert len(i_vec[0]) == i2v.vector_size + + t_vec = i2v.infer_token_vector(stem_data[:1]) + assert t_vec is None + + +@pytest.mark.parametrize("method", ["bow", "tfidf"]) +def test_d2v_bow_tfidf(stem_data_general, tmpdir, method): + filepath_prefix = str(tmpdir.mkdir(method).join("stem_tf_")) + filepath = train_vector( + stem_data_general, + filepath_prefix, + method=method + ) + d2v = D2V(filepath, method=method) + d2v(stem_data_general[0]) + assert d2v.vector_size > 0 + + +def test_exception(stem_tokens, tmpdir): filepath_prefix = str(tmpdir.mkdir("error").join("stem_tf_")) with pytest.raises(ValueError): train_vector( - stem_data, + stem_tokens, filepath_prefix, - 10, + embedding_dim=10, method="error", train_params=dict(min_count=0) ) + with pytest.raises(ValueError): + D2V("error_path", method="error")