
Commit

update vec
KenelmQLH committed Mar 2, 2024
1 parent 11dee5e commit df6ae97
Showing 14 changed files with 414 additions and 56 deletions.
2 changes: 1 addition & 1 deletion EduNLP/I2V/__init__.py
@@ -2,4 +2,4 @@
# 2021/8/1 @ tongshiwei

from .i2v import I2V, get_pretrained_i2v
from .i2v import D2V, W2V, Elmo, Bert, DisenQ, QuesNet
from .i2v import D2V, W2V, Elmo, Bert, HfAuto, DisenQ, QuesNet
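
With the new export in place, the HfAuto wrapper can be imported alongside the existing ones. A quick import check (only the names added above are assumed):

from EduNLP.I2V import I2V, D2V, W2V, Elmo, Bert, HfAuto, get_pretrained_i2v  # HfAuto is the new export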
73 changes: 71 additions & 2 deletions EduNLP/I2V/i2v.py
@@ -11,10 +11,10 @@
from longling import path_append
from EduData import get_data
from ..Tokenizer import Tokenizer, get_tokenizer
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, AutoTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
from EduNLP import logger

__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "DisenQ", "QuesNet", "get_pretrained_i2v"]
__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "HfAuto", "DisenQ", "QuesNet", "get_pretrained_i2v"]


class I2V(object):
@@ -69,6 +69,9 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
if tokenizer == 'bert':
self.tokenizer = BertTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'hf_auto':
self.tokenizer = AutoTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'quesnet':
self.tokenizer = QuesNetTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -426,6 +429,71 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwarg
tokenizer_kwargs=tokenizer_kwargs)


class HfAuto(I2V):
"""
The model aims to transfer items and tokens to vectors with a Hugging Face AutoModel.
Bases
-------
I2V
Parameters
-----------
tokenizer: str
the tokenizer name
t2v: str
the name of token2vector model
args:
the parameters passed to t2v
tokenizer_kwargs: dict
the parameters passed to tokenizer
pretrained_t2v: bool
True: use pretrained t2v model
False: use your own t2v model
kwargs:
the parameters passed to t2v
Returns
-------
i2v model: HfAuto
"""

def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
*args, key=lambda x: x, return_tensors='pt', **kwargs) -> tuple:
"""
Convert an item or a list of items to vectors. The model must be loaded before calling this function.
Parameters
-----------
items : str or dict or list
the item of question, or question list
return_tensors: str
tensor type used in tokenizer
args:
the parameters passed to t2v
kwargs:
the parameters passed to t2v
Returns
--------
vector:list
"""
is_batch = isinstance(items, list)
items = items if is_batch else [items]
inputs = self.tokenize(items, key=key, return_tensors=return_tensors)
return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
model_path = model_path.replace(i, "")
logger.info("model_path: %s" % model_path)
tokenizer_kwargs = {"tokenizer_config_dir": model_path}
return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device,
tokenizer_kwargs=tokenizer_kwargs)


class DisenQ(I2V):
"""
The model aims to transfer items and tokens to vectors with DisenQ.
@@ -542,6 +610,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwarg
"w2v": W2V,
"d2v": D2V,
"bert": Bert,
"hf_auto": HfAuto,
"disenq": DisenQ,
"quesnet": QuesNet,
"elmo": Elmo
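
The i2v.py changes register the new wrapper under the "hf_auto" key and give it a from_pretrained constructor. A minimal usage sketch, assuming a model available in EduNLP's pretrained-model registry (the name "hf_auto_test" and the directory below are hypothetical placeholders):

# Sketch only: "hf_auto_test" and model_dir are hypothetical; any name
# resolvable by get_pretrained_model_info would be substituted here.
from EduNLP.I2V import HfAuto

i2v = HfAuto.from_pretrained("hf_auto_test", model_dir="path/to/model_dir")

items = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
item_vectors, token_vectors = i2v.infer_vector(items)  # (item, token) tuple, as returned above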
22 changes: 11 additions & 11 deletions EduNLP/ModelZoo/hf_model/hf_model.py
@@ -19,11 +19,11 @@ def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
if init:
logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
self.bert = AutoModel.from_pretrained(pretrained_model_dir)
self.model = AutoModel.from_pretrained(pretrained_model_dir)
else:
logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
self.bert = AutoModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.model = AutoModel(bert_config)
self.hidden_size = self.model.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.classifier = nn.Linear(self.hidden_size, 1)
@@ -39,7 +39,7 @@ def forward(self,
attention_mask=None,
token_type_ids=None,
labels=None):
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
item_embeds = outputs.last_hidden_state[:, 0, :]
item_embeds = self.dropout(item_embeds)

@@ -69,7 +69,7 @@ def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.bert.config.save_pretrained(config_dir)
self.model.config.save_pretrained(config_dir)


class HfModelForKnowledgePrediction(BaseModel):
@@ -88,11 +88,11 @@ def __init__(self,
bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
if init:
logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
self.bert = AutoModel.from_pretrained(pretrained_model_dir)
self.model = AutoModel.from_pretrained(pretrained_model_dir)
else:
logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
self.bert = AutoModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.model = AutoModel(bert_config)
self.hidden_size = self.model.config.hidden_size
self.head_dropout = head_dropout
self.dropout = nn.Dropout(head_dropout)
self.sigmoid = nn.Sigmoid()
@@ -101,7 +101,7 @@ def __init__(self,
self.ham_classifier = HAM(
num_classes_list=num_classes_list,
num_total_classes=num_total_classes,
sequence_model_hidden_size=self.bert.config.hidden_size,
sequence_model_hidden_size=self.model.config.hidden_size,
attention_unit_size=attention_unit_size,
fc_hidden_size=fc_hidden_size,
beta=beta,
@@ -120,7 +120,7 @@ def forward(self,
attention_mask=None,
token_type_ids=None,
labels=None):
outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
item_embeds = outputs.last_hidden_state[:, 0, :]
item_embeds = self.dropout(item_embeds)
tokens_embeds = outputs.last_hidden_state
@@ -162,4 +162,4 @@ def save_config(self, config_dir):
config_path = os.path.join(config_dir, "model_config.json")
with open(config_path, "w", encoding="utf-8") as wf:
json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
self.bert.config.save_pretrained(config_dir)
self.model.config.save_pretrained(config_dir)
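
Both forward passes above take the first ([CLS]) position of last_hidden_state as the item embedding before applying dropout and the prediction head. A standalone sketch of that pooling step with transformers' own AutoModel/AutoTokenizer (not the EduNLP class renamed below); "bert-base-chinese" mirrors the pretrained_model default used elsewhere in this commit:

import torch
from transformers import AutoModel, AutoTokenizer

# Any Hugging Face checkpoint with a compatible tokenizer would behave the same way.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")

inputs = tokenizer(["若$x,y$满足约束条件,则$z=x+7y$的最大值为多少"], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

item_embeds = outputs.last_hidden_state[:, 0, :]  # [CLS] pooling, as in the diff
print(item_embeds.shape)  # (batch_size, hidden_size)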
44 changes: 22 additions & 22 deletions EduNLP/Pretrain/auto_vec.py
@@ -13,11 +13,11 @@
from .hugginface_utils import TokenizerForHuggingface

__all__ = [
"EduAutoTokenizer",
"EduAutoDataset",
"finetune_edu_auto_model",
"finetune_edu_auto_model_for_property_prediction",
"finetune_edu_auto_model_for_knowledge_prediction",
"AutoTokenizer",
"AutoDataset",
"pretrain_hf_auto_model",
"finetune_hf_auto_model_for_property_prediction",
"finetune_hf_auto_model_for_knowledge_prediction",
]

DEFAULT_TRAIN_PARAMS = {
@@ -42,11 +42,11 @@
}


class EduAutoTokenizer(TokenizerForHuggingface):
class AutoTokenizer(TokenizerForHuggingface):
"""
Examples
----------
>>> tokenizer = EduAutoTokenizer(add_special_tokens=True)
>>> tokenizer = AutoTokenizer(add_special_tokens=True)
>>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
>>> token_item = tokenizer(item)
@@ -63,17 +63,17 @@ class EduAutoTokenizer(TokenizerForHuggingface):
>>> print(len(tokenizer.tokenize(items)))
2
>>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = EduAutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = AutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
"""

pass


class EduAutoDataset(EduDataset):
class AutoDataset(EduDataset):
pass


def finetune_edu_auto_model(
def pretrain_hf_auto_model(
items: Union[List[dict], List[str]],
output_dir: str,
pretrained_model="bert-base-chinese",
@@ -105,7 +105,7 @@ def finetune_edu_auto_model(
----------
>>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
>>> finetune_edu_auto_model(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
>>> pretrain_hf_auto_model(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
{'train_runtime': ..., ..., 'epoch': 1.0}
"""
tokenizer_params = tokenizer_params if tokenizer_params else {}
@@ -114,22 +114,22 @@ def finetune_edu_auto_model(
train_params = train_params if train_params is not None else {}
# tokenizer configuration
if os.path.exists(pretrained_model):
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
else:
work_tokenizer_params = {
"add_specials": True,
"tokenize_method": "pure_text",
}
work_tokenizer_params.update(tokenizer_params)
tokenizer = EduAutoTokenizer(pretrained_model, **work_tokenizer_params)
tokenizer = AutoTokenizer(pretrained_model, **work_tokenizer_params)
# TODO: tokenizer.set_vocab()
# model configuration
model = AutoModelForMaskedLM.from_pretrained(pretrained_model, **model_params)
# resize embedding for additional special tokens
model.resize_token_embeddings(len(tokenizer.bert_tokenizer))

# dataset configuration
dataset = EduAutoDataset(
dataset = AutoDataset(
tokenizer, items=items, stem_key=data_params.get("stem_key", None)
)
mlm_probability = train_params.pop("mlm_probability", 0.15)
@@ -153,7 +153,7 @@ def finetune_edu_auto_model(
tokenizer.save_pretrained(output_dir)


def finetune_edu_auto_model_for_property_prediction(
def finetune_hf_auto_model_for_property_prediction(
train_items,
output_dir,
pretrained_model="bert-base-chinese",
@@ -187,16 +187,16 @@ def finetune_edu_auto_model_for_property_prediction(
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = EduAutoDataset(
train_dataset = AutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"),
)
if eval_items is not None:
eval_dataset = EduAutoDataset(
eval_dataset = AutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
@@ -228,7 +228,7 @@ def finetune_edu_auto_model_for_property_prediction(
tokenizer.save_pretrained(output_dir)


def finetune_edu_auto_model_for_knowledge_prediction(
def finetune_hf_auto_model_for_knowledge_prediction(
train_items,
output_dir,
pretrained_model="bert-base-chinese",
@@ -262,16 +262,16 @@ def finetune_edu_auto_model_for_knowledge_prediction(
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = EduAutoDataset(
train_dataset = AutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"),
)
if eval_items is not None:
eval_dataset = EduAutoDataset(
eval_dataset = AutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
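
A minimal pretraining call using the renamed entry point, mirroring the doctest above (the output directory is illustrative):

# Sketch: output_dir is an illustrative path; the items follow the doctest in the diff.
from EduNLP.Pretrain.auto_vec import pretrain_hf_auto_model

stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
         "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
pretrain_hf_auto_model(stems, "examples/test_model/hf_auto",
                       pretrained_model="bert-base-chinese")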
6 changes: 3 additions & 3 deletions EduNLP/Pretrain/bert_vec.py
@@ -12,7 +12,7 @@
__all__ = [
"BertTokenizer",
"BertDataset",
"finetune_bert",
"pretrain_bert",
"finetune_bert_for_property_prediction",
"finetune_bert_for_knowledge_prediction",
]
@@ -70,7 +70,7 @@ class BertDataset(EduDataset):
pass


def finetune_bert(
def pretrain_bert(
items: Union[List[dict], List[str]],
output_dir: str,
pretrained_model="bert-base-chinese",
@@ -102,7 +102,7 @@ def finetune_bert(
----------
>>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
>>> finetune_bert(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
>>> pretrain_bert(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
{'train_runtime': ..., ..., 'epoch': 1.0}
"""
tokenizer_params = tokenizer_params if tokenizer_params else {}
8 changes: 4 additions & 4 deletions EduNLP/Pretrain/elmo_vec.py
@@ -8,8 +8,8 @@
from .pretrian_utils import PretrainedEduTokenizer, EduDataset
from ..utils import logger

__all__ = ["ElmoTokenizer", "ElmoDataset", "pretrain_elmo", "pretrain_elmo_for_property_prediction",
"pretrain_elmo_for_knowledge_prediction"]
__all__ = ["ElmoTokenizer", "ElmoDataset", "pretrain_elmo", "finetune_elmo_for_property_prediction",
"finetune_elmo_for_knowledge_prediction"]

DEFAULT_TRAIN_PARAMS = {
# default
@@ -158,7 +158,7 @@ def pretrain_elmo(train_items: Union[List[dict], List[str]] = None, output_dir:
return output_dir


def pretrain_elmo_for_property_prediction(
def finetune_elmo_for_property_prediction(
train_items: list, output_dir: str, pretrained_dir=None, eval_items=None,
tokenizer_params=None, data_params=None, train_params=None, model_params=None
):
@@ -244,7 +244,7 @@ def pretrain_elmo_for_property_prediction(
return output_dir


def pretrain_elmo_for_knowledge_prediction(
def finetune_elmo_for_knowledge_prediction(
train_items: list, output_dir: str, pretrained_dir=None, eval_items=None,
tokenizer_params=None, data_params=None, train_params=None, model_params=None
):
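
The elmo_vec.py rename makes the split explicit: pretrain_elmo trains on raw corpora, while the finetune_* helpers adapt a pretrained checkpoint to downstream labels. A two-stage sketch; the paths are illustrative, and the "ques_content"/"difficulty" keys mirror the defaults used by the other *_for_property_prediction helpers in this commit and are assumed to apply here as well:

from EduNLP.Pretrain.elmo_vec import pretrain_elmo, finetune_elmo_for_property_prediction

# Stage 1: pretrain on raw stems (illustrative output directory).
stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
pretrain_elmo(stems, "examples/test_model/elmo")

# Stage 2: fine-tune the pretrained checkpoint on labelled items.
train_items = [{"ques_content": stems[0], "difficulty": 0.6}]
finetune_elmo_for_property_prediction(train_items, "examples/test_model/elmo_pp",
                                      pretrained_dir="examples/test_model/elmo")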
1 change: 1 addition & 0 deletions EduNLP/Vector/__init__.py
@@ -7,6 +7,7 @@
from .t2v import T2V, get_pretrained_t2v, get_pretrained_model_info, get_all_pretrained_models
from .embedding import Embedding
from .bert_vec import BertModel
from .auto_vec import AutoModel
from .quesnet import QuesNetModel
from .disenqnet import DisenQModel
from .elmo_vec import ElmoModel
