diff --git a/CHANGE.txt b/CHANGE.txt
index 386107db..1af2a717 100644
--- a/CHANGE.txt
+++ b/CHANGE.txt
@@ -1,8 +1,10 @@
v0.0.3:
1. update formula ast: supporting more symbols and functions defined in katex
- 2. add item to vector tools, including word2vec and doc2vec using gensim
+ 2. add tokens to vector tools, including word2vec and doc2vec using gensim
3. sci4sif support tokenization grouped by segments
4. add special tokens: \SIFTag and \SIFSep
+ 5. add item to vector tools
+ 6. add interface for getting pretrained models, where the supported model names can be accessed by `edunlp i2v` in the command console
v0.0.2:
1. fix potential ModuleNotFoundError
diff --git a/EduNLP/I2V/__init__.py b/EduNLP/I2V/__init__.py
new file mode 100644
index 00000000..7735252d
--- /dev/null
+++ b/EduNLP/I2V/__init__.py
@@ -0,0 +1,5 @@
+# coding: utf-8
+# 2021/8/1 @ tongshiwei
+
+from .i2v import I2V, get_pretrained_i2v
+from .i2v import D2V
diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py
new file mode 100644
index 00000000..63295565
--- /dev/null
+++ b/EduNLP/I2V/i2v.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+# 2021/8/1 @ tongshiwei
+
+import json
+from EduNLP.constant import MODEL_DIR
+from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model
+from ..Tokenizer import Tokenizer, get_tokenizer
+from EduNLP import logger
+
+__all__ = ["I2V", "D2V", "get_pretrained_i2v"]
+
+
+class I2V(object):
+ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, **kwargs):
+ """
+
+ Parameters
+ ----------
+ tokenizer: str
+ the tokenizer name
+ t2v: str
+ the name of token2vector model
+ args:
+ the parameters passed to t2v
+ tokenizer_kwargs: dict
+ the parameters passed to tokenizer
+ pretrained_t2v: bool
+ kwargs:
+ the parameters passed to t2v
+ """
+ self.tokenizer: Tokenizer = get_tokenizer(tokenizer, **tokenizer_kwargs if tokenizer_kwargs is not None else {})
+ if pretrained_t2v:
+ logger.info("Use pretrained t2v model %s" % t2v)
+ self.t2v = get_t2v_pretrained_model(t2v, kwargs.get("model_dir", MODEL_DIR))
+ else:
+ self.t2v = T2V(t2v, *args, **kwargs)
+ self.params = {
+ "tokenizer": tokenizer,
+ "tokenizer_kwargs": tokenizer_kwargs,
+ "t2v": t2v,
+ "args": args,
+ "kwargs": kwargs,
+ "pretrained_t2v": pretrained_t2v
+ }
+
+ def __call__(self, items, *args, **kwargs):
+ return self.infer_vector(items, *args, **kwargs)
+
+ def tokenize(self, items, indexing=True, padding=False, *args, **kwargs) -> list:
+ return self.tokenizer(items, *args, **kwargs)
+
+ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple:
+ raise NotImplementedError
+
+ def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
+ return self.infer_vector(tokens, *args, **kwargs)[0]
+
+ def infer_token_vector(self, tokens, *args, **kwargs) -> ...:
+ return self.infer_vector(tokens, *args, **kwargs)[1]
+
+ def save(self, config_path, *args, **kwargs):
+ with open(config_path, "w", encoding="utf-8") as wf:
+ json.dump(self.params, wf, ensure_ascii=False, indent=2)
+
+ @classmethod
+ def load(cls, config_path, *args, **kwargs):
+ with open(config_path, encoding="utf-8") as f:
+ params: dict = json.load(f)
+ tokenizer = params.pop("tokenizer")
+ t2v = params.pop("t2v")
+ args = params.pop("args")
+ kwargs = params.pop("kwargs")
+ params.update(kwargs)
+ return cls(tokenizer, t2v, *args, **params)
+
+ @classmethod
+ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+ raise NotImplementedError
+
+ @property
+ def vector_size(self):
+ return self.t2v.vector_size
+
+
+class D2V(I2V):
+ def infer_vector(self, items, tokenize=True, indexing=False, padding=False, *args, **kwargs) -> tuple:
+ tokens = self.tokenize(items, return_token=True) if tokenize is True else items
+ return self.t2v(tokens, *args, **kwargs), None
+
+ @classmethod
+ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+ return cls("text", name, pretrained_t2v=True, model_dir=model_dir)
+
+
+MODELS = {
+ "d2v_all_256": [D2V, "d2v_all_256"],
+ "d2v_sci_256": [D2V, "d2v_sci_256"],
+ "d2v_eng_256": [D2V, "d2v_eng_256"],
+ "d2v_lit_256": [D2V, "d2v_lit_256"],
+}
+
+
+def get_pretrained_i2v(name, model_dir=MODEL_DIR):
+ if name not in MODELS:
+ raise KeyError(
+ "Unknown model name %s, use one of the provided models: %s" % (name, ", ".join(MODELS.keys()))
+ )
+ _class, *params = MODELS[name]
+ return _class.from_pretrained(*params, model_dir=model_dir)
diff --git a/EduNLP/ModelZoo/__init__.py b/EduNLP/ModelZoo/__init__.py
new file mode 100644
index 00000000..5dbf8ed1
--- /dev/null
+++ b/EduNLP/ModelZoo/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+from .utils import *
diff --git a/EduNLP/ModelZoo/rnn/__init__.py b/EduNLP/ModelZoo/rnn/__init__.py
new file mode 100644
index 00000000..a524181d
--- /dev/null
+++ b/EduNLP/ModelZoo/rnn/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+from .rnn import LM
diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py
new file mode 100644
index 00000000..5b00aec8
--- /dev/null
+++ b/EduNLP/ModelZoo/rnn/rnn.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+import torch
+from torch import nn
+from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
+
+
+class LM(nn.Module):
+ """
+ Examples
+ --------
+ >>> import torch
+ >>> seq_idx = torch.LongTensor([[1, 2, 3], [1, 2, 0], [3, 0, 0]])
+ >>> seq_len = torch.LongTensor([3, 2, 1])
+ >>> lm = LM("RNN", 4, 3, 2)
+ >>> output, hn = lm(seq_idx, seq_len)
+ >>> output.shape
+ torch.Size([3, 3, 2])
+ >>> hn.shape
+ torch.Size([1, 3, 2])
+ >>> lm = LM("RNN", 4, 3, 2, num_layers=2)
+ >>> output, hn = lm(seq_idx, seq_len)
+ >>> output.shape
+ torch.Size([3, 3, 2])
+ >>> hn.shape
+ torch.Size([2, 3, 2])
+ """
+
+ def __init__(self, rnn_type: str, vocab_size: int, embedding_dim: int, hidden_size: int, num_layers=1,
+ bidirectional=False, embedding=None, **kwargs):
+ super(LM, self).__init__()
+ rnn_type = rnn_type.upper()
+ self.embedding = torch.nn.Embedding(vocab_size, embedding_dim) if embedding is None else embedding
+ self.c = False
+ if rnn_type == "RNN":
+ self.rnn = torch.nn.RNN(
+ embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs
+ )
+ elif rnn_type == "LSTM":
+ self.rnn = torch.nn.LSTM(
+ embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs
+ )
+ self.c = True
+ elif rnn_type == "GRU":
+ self.rnn = torch.nn.GRU(
+ embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs
+ )
+ elif rnn_type == "ELMO":
+ bidirectional = True
+ self.rnn = torch.nn.LSTM(
+ embedding_dim, hidden_size, num_layers, bidirectional=bidirectional, **kwargs
+ )
+ self.c = True
+ else:
+ raise TypeError("Unknown rnn_type %s" % rnn_type)
+
+ self.num_layers = num_layers
+ self.bidirectional = bidirectional
+ if bidirectional is True:
+ self.num_layers *= 2
+ self.hidden_size = hidden_size
+
+ def forward(self, seq_idx, seq_len):
+ seq = self.embedding(seq_idx)
+ pack = pack_padded_sequence(seq, seq_len, batch_first=True)
+ h0 = torch.randn(self.num_layers, seq.shape[0], self.hidden_size)
+ if self.c is True:
+ c0 = torch.randn(self.num_layers, seq.shape[0], self.hidden_size)
+ output, (hn, _) = self.rnn(pack, (h0, c0))
+ else:
+ output, hn = self.rnn(pack, h0)
+ output, _ = pad_packed_sequence(output, batch_first=True)
+ return output, hn
diff --git a/EduNLP/ModelZoo/utils/__init__.py b/EduNLP/ModelZoo/utils/__init__.py
new file mode 100644
index 00000000..a41af25f
--- /dev/null
+++ b/EduNLP/ModelZoo/utils/__init__.py
@@ -0,0 +1,5 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+from .padder import PadSequence, pad_sequence
+from .device import set_device
diff --git a/EduNLP/ModelZoo/utils/device.py b/EduNLP/ModelZoo/utils/device.py
new file mode 100644
index 00000000..23b2646e
--- /dev/null
+++ b/EduNLP/ModelZoo/utils/device.py
@@ -0,0 +1,44 @@
+# coding: utf-8
+# 2021/8/2 @ tongshiwei
+import logging
+import torch
+from torch.nn import DataParallel
+
+
+def set_device(_net, ctx, *args, **kwargs): # pragma: no cover
+ """code from longling v1.3.26"""
+ if ctx == "cpu":
+ if not isinstance(_net, DataParallel):
+ _net = DataParallel(_net)
+ return _net.cpu()
+ elif any(map(lambda x: x in ctx, ["cuda", "gpu"])):
+ if not torch.cuda.is_available():
+ try:
+ torch.ones((1,), device=torch.device("cuda: 0"))
+ except AssertionError as e:
+ raise TypeError("no cuda detected, noly cpu is supported, the detailed error msg:%s" % str(e))
+ if torch.cuda.device_count() >= 1:
+ if ":" in ctx:
+ ctx_name, device_ids = ctx.split(":")
+ assert ctx_name in ["cuda", "gpu"], "the equipment should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx
+ device_ids = [int(i) for i in device_ids.strip().split(",")]
+ try:
+ if not isinstance(_net, DataParallel):
+ return DataParallel(_net, device_ids).cuda
+ return _net.cuda(device_ids)
+ except AssertionError as e:
+ logging.error(device_ids)
+ raise e
+ elif ctx in ["cuda", "gpu"]:
+ if not isinstance(_net, DataParallel):
+ _net = DataParallel(_net)
+ return _net.cuda()
+ else:
+ raise TypeError("the equipment should be 'cpu', 'cuda' or 'gpu', now is %s" % ctx)
+ else:
+ logging.error(torch.cuda.device_count())
+ raise TypeError("0 gpu can be used, use cpu")
+ else:
+ if not isinstance(_net, DataParallel):
+ return DataParallel(_net, device_ids=ctx).cuda()
+ return _net.cuda(ctx)
diff --git a/EduNLP/ModelZoo/utils/padder.py b/EduNLP/ModelZoo/utils/padder.py
new file mode 100644
index 00000000..ed86cfef
--- /dev/null
+++ b/EduNLP/ModelZoo/utils/padder.py
@@ -0,0 +1,76 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+__all__ = ["PadSequence", "pad_sequence"]
+
+
+class PadSequence(object):
+ """Pad the sequence.
+
+ Pad the sequence to the given `length` by inserting `pad_val`. If `clip` is set,
+    a sequence whose length is larger than `length` will be clipped.
+
+ Parameters
+ ----------
+ length : int
+ The maximum length to pad/clip the sequence
+ pad_val : number
+ The pad value. Default 0
+ clip : bool
+ """
+
+ def __init__(self, length, pad_val=0, clip=True):
+ self._length = length
+ self._pad_val = pad_val
+ self._clip = clip
+
+ def __call__(self, sample: list):
+ """
+
+ Parameters
+ ----------
+ sample : list of number
+
+ Returns
+ -------
+ ret : list of number
+ """
+ sample_length = len(sample)
+ if sample_length >= self._length:
+ if self._clip and sample_length > self._length:
+ return sample[:self._length]
+ else:
+ return sample
+ else:
+ return sample + [
+ self._pad_val for _ in range(self._length - sample_length)
+ ]
+
+
+def pad_sequence(sequence: list, max_length=None, pad_val=0, clip=True):
+ """
+
+ Parameters
+ ----------
+ sequence
+ max_length
+ pad_val
+ clip
+
+ Returns
+ -------
+
+ Examples
+ --------
+ >>> seq = [[4, 3, 3], [2], [3, 3, 2]]
+ >>> pad_sequence(seq)
+ [[4, 3, 3], [2, 0, 0], [3, 3, 2]]
+ >>> pad_sequence(seq, pad_val=1)
+ [[4, 3, 3], [2, 1, 1], [3, 3, 2]]
+ >>> pad_sequence(seq, max_length=2)
+ [[4, 3], [2, 0], [3, 3]]
+ >>> pad_sequence(seq, max_length=2, clip=False)
+ [[4, 3, 3], [2, 0], [3, 3, 2]]
+ """
+ padder = PadSequence(max([len(seq) for seq in sequence]) if max_length is None else max_length, pad_val, clip)
+ return [padder(seq) for seq in sequence]
diff --git a/EduNLP/Pretrain/gensim_vec.py b/EduNLP/Pretrain/gensim_vec.py
index 2ebcfc9e..fde1bb43 100644
--- a/EduNLP/Pretrain/gensim_vec.py
+++ b/EduNLP/Pretrain/gensim_vec.py
@@ -7,6 +7,7 @@
from gensim.models.doc2vec import TaggedDocument
from gensim.models.callbacks import CallbackAny2Vec
from EduNLP.SIF.sif import sif4sci
+from EduNLP.Vector import D2V, BowLoader
from copy import deepcopy
import itertools as it
@@ -14,7 +15,7 @@
class GensimWordTokenizer(object):
- def __init__(self, symbol="gm"):
+ def __init__(self, symbol="gm", general=False):
"""
Parameters
@@ -22,15 +23,44 @@ def __init__(self, symbol="gm"):
symbol:
gm
fgm
+ gmas
+ fgmas
+ general:
+        True when the item isn't in standard format and formulas (except formulas in figures) should be tokenized linearly.
+        False when the 'ast' method should be used to tokenize formulas instead of 'linear'.
+
+ Returns
+ ----------
+
+ Examples
+ ----------
+ >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
+ >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+ ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+ >>> print(token_item.tokens[:10])
+ ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
+ >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
+ >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+ ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+ >>> print(token_item.tokens[:10])
+ ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
"""
self.symbol = symbol
- self.tokenization_params = {
- "formula_params": {
- "method": "ast",
- "return_type": "list",
- "ord2token": True
+ if general is True:
+ self.tokenization_params = {
+ "formula_params": {
+ "method": "linear",
+ "symbolize_figure_formula": True
+ }
+ }
+ else:
+ self.tokenization_params = {
+ "formula_params": {
+ "method": "ast",
+ "return_type": "list",
+ "ord2token": True
+ }
}
- }
def batch_process(self, *items):
pass
@@ -96,7 +126,7 @@ def on_epoch_end(self, model):
self.epoch += 1
-def train_vector(items, w2v_prefix, embedding_dim, method="sg", binary=None, train_params=None):
+def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None):
monitor = MonitorCallback(["word", "I", "less"])
_train_params = dict(
min_count=0,
@@ -129,10 +159,22 @@ def train_vector(items, w2v_prefix, embedding_dim, method="sg", binary=None, tra
docs, **_train_params
)
binary = binary if binary is not None else True
+ elif method == "bow":
+ model = gensim.corpora.Dictionary(items)
+ binary = binary if binary is not None else True
+ elif method == "tfidf":
+ dictionary_path = train_vector(items, w2v_prefix, method="bow")
+ dictionary = BowLoader(dictionary_path)
+ corpus = [dictionary.infer_vector(item) for item in items]
+ model = gensim.models.TfidfModel(corpus)
+ binary = binary if binary is not None else True
else:
raise ValueError("Unknown method: %s" % method)
- filepath = w2v_prefix + "%s_%s" % (method, embedding_dim)
+ filepath = w2v_prefix + method
+ if embedding_dim is not None:
+ filepath = filepath + "_" + str(embedding_dim)
+
if binary is True:
filepath += ".bin"
logger.info("model is saved to %s" % filepath)
diff --git a/EduNLP/SIF/tokenization/tokenization.py b/EduNLP/SIF/tokenization/tokenization.py
index 3c6d43e1..3bb5365b 100644
--- a/EduNLP/SIF/tokenization/tokenization.py
+++ b/EduNLP/SIF/tokenization/tokenization.py
@@ -33,6 +33,10 @@ def __init__(self, segment_list: SegmentList, text_params=None, formula_params=N
"s": []
}
self.text_params = text_params if text_params is not None else {}
+ if formula_params is not None and "symbolize_figure_formula" in formula_params:
+ self.symbolize_figure_formula = formula_params.pop("symbolize_figure_formula")
+ else:
+ self.symbolize_figure_formula = False
self.formula_params = formula_params if formula_params is not None else {"method": "linear"}
self.formula_tokenize_method = self.formula_params.get("method")
self.figure_params = figure_params if figure_params is not None else {}
@@ -166,6 +170,9 @@ def append_formula(self, segment, symbol=False, init=True):
if symbol is True:
self._formula_tokens.append(len(self._tokens))
self._tokens.append(segment)
+ elif self.symbolize_figure_formula and isinstance(segment, FigureFormulaSegment):
+ self._formula_tokens.append(len(self._tokens))
+ self._tokens.append(Symbol(FORMULA_SYMBOL))
elif isinstance(segment, FigureFormulaSegment):
self._formula_tokens.append(len(self._tokens))
self._tokens.append(segment)
diff --git a/EduNLP/Tokenizer/__init__.py b/EduNLP/Tokenizer/__init__.py
new file mode 100644
index 00000000..25b605c9
--- /dev/null
+++ b/EduNLP/Tokenizer/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2021/8/1 @ tongshiwei
+
+from .tokenizer import *
diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py
new file mode 100644
index 00000000..cf9084d0
--- /dev/null
+++ b/EduNLP/Tokenizer/tokenizer.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+# 2021/8/1 @ tongshiwei
+
+from typing import Iterable
+from ..SIF.segment import seg
+from ..SIF.tokenization import tokenize
+
+__all__ = ["TOKENIZER", "Tokenizer", "TextTokenizer", "get_tokenizer"]
+
+
+class Tokenizer(object):
+ def __call__(self, *args, **kwargs):
+ raise NotImplementedError
+
+
+class TextTokenizer(Tokenizer):
+ r"""
+
+ Examples
+ --------
+ >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+ >>> tokenizer = TextTokenizer()
+ >>> tokens = tokenizer(items)
+ >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
+ ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+ '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+ '\\quad', 'A', '\\cap', 'B', '=']
+ """
+
+ def __init__(self, *args, **kwargs):
+ self.tokenization_params = {
+ "formula_params": {
+ "method": "linear",
+ }
+ }
+
+ def __call__(self, items: Iterable, *args, **kwargs):
+ for item in items:
+ yield tokenize(seg(item, symbol="gmas"), **self.tokenization_params).tokens
+
+
+TOKENIZER = {
+ "text": TextTokenizer
+}
+
+
+def get_tokenizer(name, *args, **kwargs):
+ r"""
+
+ Parameters
+ ----------
+ name: str
+ args
+ kwargs
+
+ Returns
+ -------
+ tokenizer: Tokenizer
+
+ Examples
+ --------
+ >>> items = ["已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$"]
+ >>> tokenizer = get_tokenizer("text")
+ >>> tokens = tokenizer(items)
+ >>> next(tokens) # doctest: +NORMALIZE_WHITESPACE
+ ['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<',
+ '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',',
+ '\\quad', 'A', '\\cap', 'B', '=']
+ """
+ if name not in TOKENIZER:
+ raise KeyError(
+ "Unknown tokenizer %s, use one of the provided tokenizers: %s" % (name, ", ".join(TOKENIZER.keys()))
+ )
+ return TOKENIZER[name](*args, **kwargs)
diff --git a/EduNLP/Vector/__init__.py b/EduNLP/Vector/__init__.py
index 89c15fec..efccc5ad 100644
--- a/EduNLP/Vector/__init__.py
+++ b/EduNLP/Vector/__init__.py
@@ -1,4 +1,8 @@
# coding: utf-8
# 2021/5/29 @ tongshiwei
-from .gensim_vec import W2V, D2V
+from .gensim_vec import W2V, D2V, BowLoader, TfidfLoader
+from .const import *
+from .rnn import RNNModel
+from .t2v import T2V, get_pretrained_t2v
+from .embedding import Embedding
diff --git a/EduNLP/Vector/const.py b/EduNLP/Vector/const.py
new file mode 100644
index 00000000..f798fa1c
--- /dev/null
+++ b/EduNLP/Vector/const.py
@@ -0,0 +1,5 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+UNK = "[UNK]"
+PAD = "[PAD]"
diff --git a/EduNLP/Vector/embedding.py b/EduNLP/Vector/embedding.py
new file mode 100644
index 00000000..e3012a18
--- /dev/null
+++ b/EduNLP/Vector/embedding.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+from typing import List
+import torch
+from .gensim_vec import W2V
+from .const import PAD
+from EduNLP.ModelZoo import pad_sequence, set_device
+
+
+class Embedding(object):
+ def __init__(self, w2v: (W2V, tuple, list, dict, None), freeze=True, device=None, **kwargs):
+ if w2v is None:
+ self.w2v = None
+ elif isinstance(w2v, (tuple, list)):
+ self.w2v = W2V(*w2v)
+ elif isinstance(w2v, dict):
+ self.w2v = W2V(**w2v)
+ elif isinstance(w2v, W2V):
+ self.w2v = w2v
+ else:
+ raise TypeError("w2v argument must be one of W2V, tuple, list, dict or None")
+
+ if self.w2v is not None:
+ self.vocab_size = len(self.w2v)
+ self.embedding_dim = self.w2v.vector_size
+ else:
+ self.vocab_size = kwargs["vocab_size"]
+ self.embedding_dim = kwargs["embedding_dim"]
+
+ self.embedding = torch.nn.Embedding(self.vocab_size, self.embedding_dim)
+
+ self.pad_val = 0
+ if self.w2v is not None:
+ self.embedding.from_pretrained(torch.Tensor(self.w2v.vectors), freeze)
+ self.pad_val = self.w2v.constants[PAD]
+ self.key_to_index = self.w2v.key_to_index if w2v is not None else lambda x: x
+
+ if device is not None:
+ self.set_device(device)
+
+ def __call__(self, items: List[List[str]], indexing=True, padding=True, vectorization=True, *args,
+ **kwargs) -> tuple:
+
+ items, item_len = self.indexing(items, padding=padding, indexing=indexing)
+ items = self.infer_token_vector(items, indexing=False)[0] if vectorization else items
+ return items, item_len
+
+ def infer_token_vector(self, items: List[List[str]], indexing=True) -> tuple:
+ items, item_len = self.indexing(items, padding=True, indexing=indexing)
+ item_embedding = self.embedding(torch.LongTensor(items))
+ return item_embedding, item_len
+
+ def indexing(self, items: List[List[str]], padding=False, indexing=True) -> tuple:
+ """
+
+ Parameters
+ ----------
+ items: list of list of str(word/token)
+ padding: bool
+ whether padding the returned list with default pad_val to make all item in items have the same length
+ indexing: bool
+
+ Returns
+ -------
+ word_id: list of list of int
+ """
+ items_idx = [[self.key_to_index(word) for word in item] for item in items] if indexing else items
+ item_len = [len(_idx) for _idx in items_idx]
+ padded_items_idx = pad_sequence(items_idx, pad_val=self.pad_val) if padding is True else items_idx
+ return padded_items_idx, item_len
+
+ def set_device(self, device):
+ self.embedding = set_device(self.embedding, device)
+ return self
diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py
index f1ff802a..13c5600a 100644
--- a/EduNLP/Vector/gensim_vec.py
+++ b/EduNLP/Vector/gensim_vec.py
@@ -1,12 +1,26 @@
# coding: utf-8
# 2021/5/29 @ tongshiwei
+import numpy as np
from pathlib import PurePath
-from gensim.models import KeyedVectors, Word2Vec, FastText, Doc2Vec
+from gensim.models import KeyedVectors, Word2Vec, FastText, Doc2Vec, TfidfModel
+from gensim import corpora
+import re
+from .const import UNK, PAD
+from .meta import Vector
-class W2V(object):
- def __init__(self, filepath, method, binary=None):
+class W2V(Vector):
+ def __init__(self, filepath, method=None, binary=None):
+ """
+
+ Parameters
+ ----------
+ filepath:
+ path to the pretrained model file
+ method
+ binary
+ """
fp = PurePath(filepath)
self.binary = binary if binary is not None else (True if fp.suffix == ".bin" else False)
if self.binary is True:
@@ -17,17 +31,115 @@ def __init__(self, filepath, method, binary=None):
else:
self.wv = KeyedVectors.load(filepath, mmap="r")
+ self.method = method
+ self.constants = {UNK: 0, PAD: 1}
+
+ def __len__(self):
+ return len(self.constants) + len(self.wv.key_to_index)
+
+ def key_to_index(self, word):
+ if word in self.constants:
+ return self.constants[word]
+ else:
+ if word in self.wv.key_to_index:
+ return self.wv.key_to_index[word] + len(self.constants)
+ else:
+ return self.constants[UNK]
+
+ @property
+ def vectors(self):
+ return np.concatenate([np.zeros((len(self.constants), self.vector_size)), self.wv.vectors], axis=0)
+
+ @property
+ def vector_size(self):
+ return self.wv.vector_size
+
def __call__(self, *words):
for word in words:
- yield self.wv[word]
+ yield self[word]
def __getitem__(self, item):
- return self.wv[item]
+ return self.wv[item] if item not in self.constants else np.zeros((self.vector_size,))
+
+ def infer_vector(self, items, agg="mean", *args, **kwargs) -> np.ndarray:
+ tokens = self.infer_tokens(items, *args, **kwargs)
+ return eval("np.%s" % agg)(tokens, axis=1)
+
+ def infer_tokens(self, items, *args, **kwargs) -> list:
+ return [list(self(*item)) for item in items]
+
+
+class BowLoader(object):
+ def __init__(self, filepath):
+ self.dictionary = corpora.Dictionary.load(filepath)
+
+ def infer_vector(self, item, return_vec=False):
+ item = self.dictionary.doc2bow(item)
+ if not return_vec:
+            return item # return the sparse bow pairs as default
+ vec = [0 for i in range(len(self.dictionary.keys()))]
+ for i, v in item:
+ vec[i] = v
+ return vec
+
+ @property
+ def vector_size(self):
+ return len(self.dictionary.keys())
-class D2V(object):
+class TfidfLoader(object):
def __init__(self, filepath):
- self.d2v = Doc2Vec.load(filepath)
+ self.tfidf_model = TfidfModel.load(filepath)
+        # the 'tfidf' model should be used based on the 'bow' model
+ dictionary_path = re.sub(r"(.*)tfidf", r"\1bow", filepath)
+ self.dictionary = corpora.Dictionary.load(dictionary_path)
+
+ def infer_vector(self, item, return_vec=False):
+ dic_item = self.dictionary.doc2bow(item)
+ tfidf_item = self.tfidf_model[dic_item]
+        # return the sparse tfidf pairs as default
+ if not return_vec:
+ return tfidf_item # pragma: no cover
+ vec = [0 for i in range(len(self.dictionary.keys()))]
+ for i, v in tfidf_item:
+ vec[i] = v
+ return vec
+
+ @property
+ def vector_size(self):
+ return len(self.dictionary.token2id)
+
+
+class D2V(Vector):
+ def __init__(self, filepath, method="d2v"):
+ self._method = method
+ self._filepath = filepath
+ if self._method == "d2v":
+ self.d2v = Doc2Vec.load(filepath)
+ elif self._method == "bow":
+ self.d2v = BowLoader(filepath)
+ elif self._method == "tfidf":
+ self.d2v = TfidfLoader(filepath)
+ else:
+ raise ValueError("Unknown method: %s" % method)
def __call__(self, item):
- return self.d2v.infer_vector(item)
+ if self._method == "d2v":
+ return self.d2v.infer_vector(item)
+ else:
+ return self.d2v.infer_vector(item, return_vec=True)
+
+ @property
+ def vector_size(self):
+ if self._method == "d2v":
+ return self.d2v.vector_size
+ elif self._method == "bow":
+ return self.d2v.vector_size
+ elif self._method == "tfidf":
+ return self.d2v.vector_size
+
+ def infer_vector(self, items, *args, **kwargs) -> list:
+ return [self(item) for item in items]
+
+ def infer_tokens(self, item, *args, **kwargs) -> ...:
+ raise NotImplementedError
diff --git a/EduNLP/Vector/meta.py b/EduNLP/Vector/meta.py
new file mode 100644
index 00000000..87b8db8e
--- /dev/null
+++ b/EduNLP/Vector/meta.py
@@ -0,0 +1,13 @@
+# coding: utf-8
+# 2021/7/13 @ tongshiwei
+
+class Vector(object):
+ def infer_vector(self, items, *args, **kwargs) -> ...:
+ pass
+
+ def infer_tokens(self, items, *args, **kwargs) -> ...:
+ pass
+
+ @property
+ def vector_size(self):
+ raise NotImplementedError
diff --git a/EduNLP/Vector/rnn/__init__.py b/EduNLP/Vector/rnn/__init__.py
new file mode 100644
index 00000000..54cde6aa
--- /dev/null
+++ b/EduNLP/Vector/rnn/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+from .rnn import RNNModel
diff --git a/EduNLP/Vector/rnn/rnn.py b/EduNLP/Vector/rnn/rnn.py
new file mode 100644
index 00000000..02318193
--- /dev/null
+++ b/EduNLP/Vector/rnn/rnn.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+# 2021/7/12 @ tongshiwei
+
+import torch
+from ..gensim_vec import W2V
+from ..embedding import Embedding
+from ..meta import Vector
+from EduNLP.ModelZoo import rnn, set_device
+
+
+class RNNModel(Vector):
+ """
+ Examples
+ --------
+ >>> model = RNNModel("ELMO", None, 2, vocab_size=4, embedding_dim=3)
+ >>> seq_idx = [[1, 2, 3], [1, 2, 0], [3, 0, 0]]
+ >>> output, hn = model(seq_idx, indexing=False, padding=False)
+ >>> seq_idx = [[1, 2, 3], [1, 2], [3]]
+ >>> output, hn = model(seq_idx, indexing=False, padding=True)
+ >>> output.shape
+ torch.Size([3, 3, 4])
+ >>> hn.shape
+ torch.Size([2, 3, 2])
+ >>> tokens = model.infer_tokens(seq_idx, indexing=False)
+ >>> tokens.shape
+ torch.Size([3, 3, 4])
+ >>> tokens = model.infer_tokens(seq_idx, agg="mean", indexing=False)
+ >>> tokens.shape
+ torch.Size([3, 4])
+ >>> item = model.infer_vector(seq_idx, indexing=False)
+ >>> item.shape
+ torch.Size([3, 4])
+ >>> item = model.infer_vector(seq_idx, agg="mean", indexing=False)
+ >>> item.shape
+ torch.Size([3, 2])
+ >>> item = model.infer_vector(seq_idx, agg=None, indexing=False)
+ >>> item.shape
+ torch.Size([2, 3, 2])
+ """
+
+ def __init__(self, rnn_type, w2v: (W2V, tuple, list, dict, None), hidden_size, freeze_pretrained=True, device=None,
+ **kwargs):
+ self.embedding = Embedding(w2v, freeze_pretrained, **kwargs)
+ for key in ["vocab_size", "embedding_dim"]:
+ if key in kwargs:
+ kwargs.pop(key)
+ self.rnn = rnn.LM(
+ rnn_type,
+ self.embedding.vocab_size,
+ self.embedding.embedding_dim,
+ hidden_size=hidden_size,
+ embedding=self.embedding.embedding,
+ **kwargs
+ )
+ self.bidirectional = self.rnn.rnn.bidirectional
+ self.hidden_size = self.rnn.hidden_size
+ self.freeze_pretrained = freeze_pretrained
+ if device is not None:
+ self.set_device(device)
+
+ def __call__(self, items, indexing=True, padding=True, **kwargs):
+ seq_idx, seq_len = self.embedding(items, indexing=indexing, padding=padding, vectorization=False)
+
+ tokens, item = self.rnn(torch.LongTensor(seq_idx), torch.LongTensor(seq_len))
+
+ return tokens, item
+
+ def infer_vector(self, items, agg: (int, str, None) = -1, indexing=True, padding=True, *args,
+ **kwargs) -> torch.Tensor:
+ vector = self(items, indexing=indexing, padding=padding, **kwargs)[1]
+ if agg is not None:
+ if agg == -1:
+ return torch.reshape(vector, (vector.shape[1], -1))
+ return eval("torch.%s" % agg)(vector, dim=0)
+ return vector
+
+ def infer_tokens(self, items, agg=None, *args, **kwargs) -> torch.Tensor:
+ tokens = self(items, **kwargs)[0]
+ if agg is not None:
+ return eval("torch.%s" % agg)(tokens, dim=1)
+ return tokens
+
+ @property
+ def vector_size(self) -> int:
+ return self.hidden_size * (1 if self.bidirectional is False else 2)
+
+ def set_device(self, device):
+ self.rnn = set_device(self.rnn, device)
diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py
new file mode 100644
index 00000000..103fd3da
--- /dev/null
+++ b/EduNLP/Vector/t2v.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+# 2021/7/13 @ tongshiwei
+
+import os
+from longling import path_append
+from EduData import get_data
+from .rnn import RNNModel
+from .gensim_vec import W2V, D2V
+from .meta import Vector
+from EduNLP.constant import MODEL_DIR
+
+MODELS = {
+ "w2v": W2V,
+ "d2v": D2V,
+ "rnn": RNNModel,
+ "lstm": RNNModel,
+ "gru": RNNModel,
+ "elmo": RNNModel
+}
+
+
+class T2V(object):
+ def __init__(self, model: str, *args, **kwargs):
+ model = model.lower()
+ self.model_type = model
+ if model in {"rnn", "lstm", "gru", "elmo"}:
+ self.i2v: Vector = MODELS[model](model, *args, **kwargs)
+ else:
+ self.i2v: Vector = MODELS[model](*args, **kwargs)
+
+ def __call__(self, items, *args, **kwargs):
+ return self.i2v.infer_vector(items, *args, **kwargs)
+
+ @property
+ def vector_size(self) -> int:
+ return self.i2v.vector_size
+
+
+PRETRAINED_MODELS = {
+ "d2v_all_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_all_256.zip", "d2v"],
+ "d2v_sci_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_science_256.zip", "d2v"],
+ "d2v_eng_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_english_256.zip", "d2v"],
+ "d2v_lit_256": ["http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_literal_256.zip", "d2v"],
+}
+
+
+def get_pretrained_t2v(name, model_dir=MODEL_DIR):
+ if name not in PRETRAINED_MODELS:
+ raise KeyError(
+ "Unknown pretrained model %s, use one of the provided pretrained models: %s" % (
+ name, ", ".join(PRETRAINED_MODELS.keys()))
+ )
+ url, model_name, *args = PRETRAINED_MODELS[name]
+ model_path = get_data(url, model_dir)
+ if model_name in ["d2v"]:
+ model_path = path_append(model_path, os.path.basename(model_path) + ".bin", to_str=True)
+ return T2V(model_name, model_path, *args)
diff --git a/EduNLP/__init__.py b/EduNLP/__init__.py
index c5cdbbf7..604e2a3a 100644
--- a/EduNLP/__init__.py
+++ b/EduNLP/__init__.py
@@ -1 +1,2 @@
from .utils import logger
+from .I2V import get_pretrained_i2v
diff --git a/EduNLP/constant.py b/EduNLP/constant.py
new file mode 100644
index 00000000..5ee2ddb5
--- /dev/null
+++ b/EduNLP/constant.py
@@ -0,0 +1,8 @@
+# coding: utf-8
+# 2021/8/1 @ tongshiwei
+
+import os
+from os.path import expanduser, join
+
+ROOT = os.environ.get("EDUNLPPATH", join(expanduser("~"), ".EduNLP"))  # EduNLP data root; overridable via EDUNLPPATH env var
+MODEL_DIR = os.environ.get("EDUNLPMODELPATH", join(ROOT, "model"))  # pretrained-model cache dir; overridable via EDUNLPMODELPATH
diff --git a/EduNLP/main.py b/EduNLP/main.py
new file mode 100644
index 00000000..46e24ce7
--- /dev/null
+++ b/EduNLP/main.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+# 2021/8/2 @ tongshiwei
+
+import fire
+
+from EduNLP.I2V.i2v import MODELS
+
+
+def list_i2v():  # print the supported pretrained i2v model names, one per line
+    print("\n".join(MODELS.keys()))
+
+
+def cli():  # pragma: no cover
+    fire.Fire({"i2v": list_i2v})  # console entry point: `edunlp i2v` runs list_i2v
diff --git a/examples/pretrain/gensim/d2v_bow_tfidf.ipynb b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb
new file mode 100644
index 00000000..66a77fa6
--- /dev/null
+++ b/examples/pretrain/gensim/d2v_bow_tfidf.ipynb
@@ -0,0 +1,324 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 1. load and tokenize test_items"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "source": [
+ "from platform import processor\r\n",
+ "from gensim import corpora,models\r\n",
+ "# from collections import defaultdict\r\n",
+ "import json\r\n",
+ "from tqdm import tqdm\r\n",
+ "from EduNLP.Pretrain import GensimWordTokenizer,train_vector\r\n",
+ "from EduNLP.Vector import D2V\r\n",
+ "from EduNLP.SIF.segment import seg\r\n",
+ "from EduNLP.SIF.tokenization import tokenize\r\n",
+ "import time\r\n",
+ "\r\n",
+ "output_file_head = \"test\" # subject = english | liberal | science |all\r\n",
+ "baseDir = \"E:/Workustc/lunadata/d2v\"\r\n",
+ "# baseDir = \"/home/qlh/data_pretrain\"\r\n",
+ "work_file_path = baseDir + \"/data/\" + output_file_head + \"_raw.json\"\r\n",
+ "\r\n",
+ "test_items = [{'ques_content':'有公式$\\\\FormFigureID{wrong1?}$和公式$\\\\FormFigureBase64{wrong2?}$,如图$\\\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\\\SIFSep$,则$z=x+7 y$的最大值为$\\\\SIFBlank$'},\r\n",
+ " {\"ques_content\":\"Human machine interface for lab abc computer applications\"},\r\n",
+ " {\"ques_content\": \"A survey of user opinion of computer system response time\"},\r\n",
+ " {\"ques_content\": \"The EPS user interface management system\"},\r\n",
+ " {\"ques_content\": \"System and human system engineering testing of EPS\"},\r\n",
+ " {\"ques_content\": \"Relation of user perceived response time to error measurement\"},\r\n",
+ " {\"ques_content\": \"The generation of random binary unordered trees\"},\r\n",
+ " {\"ques_content\": \"The intersection graph of paths in trees\"},\r\n",
+ " {\"ques_content\": \"Graph minors IV Widths of trees and well quasi ordering\"},\r\n",
+ " {\"ques_content\": \"Graph minors A survey\"}\r\n",
+ " ]\r\n",
+ "\r\n",
+ "def load_items():\r\n",
+ " for line in test_items:\r\n",
+ " yield line\r\n",
+ " # with open(work_file_path, 'r', encoding=\"utf-8\") as f:\r\n",
+ " # for line in f:\r\n",
+ " # yield json.loads(line)\r\n",
+ "\r\n",
+ "def data2Token():\r\n",
+    " # Linear tokenization, without using the AST\r\n",
+ " tokenization_params = {\r\n",
+ " \"formula_params\": {\r\n",
+ " \"method\": \"linear\",\r\n",
+ " }\r\n",
+ " }\r\n",
+ " \r\n",
+ " token_items = []\r\n",
+ " count = 1\r\n",
+ " for item in tqdm(load_items(), \"sifing\"):\r\n",
+ " count = count + 1\r\n",
+ " # -------------------------------------------- # \r\n",
+    " # \"\"\"Convert everything other than text and formulas into special tokens\"\"\"\r\n",
+ " # seg_ret = seg(item[\"ques_content\"], symbol=\"gmas\")\r\n",
+ " # token_item = tokenize(seg_ret, **tokenization_params)\r\n",
+ " tokenizer = GensimWordTokenizer(symbol=\"gmas\", general=True)\r\n",
+ " token_item = tokenizer(item[\"ques_content\"])\r\n",
+ "\r\n",
+ " # -------------------------------------------- # \r\n",
+ " if token_item:\r\n",
+ " # print(\"[i] = \", count)\r\n",
+ " # print(\"[tokens] = \", token_item)\r\n",
+ " token_items.append(token_item.tokens)\r\n",
+ " print(\"[data2Token] finish ========================> num = \",len(token_items))\r\n",
+ " return token_items\r\n",
+ "\r\n",
+ "token_items = data2Token()\r\n",
+ "token_items[0]"
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package