Xunzi-Qwen1.5-4B-upos
KoichiYasuoka committed Jul 4, 2024
1 parent 2fdbcd7 commit ccc14e6
Showing 1 changed file with 113 additions and 0 deletions.
113 changes: 113 additions & 0 deletions demo/2024-07-05/lzh-qwen-upos.ipynb
@@ -0,0 +1,113 @@
{
"nbformat":4,
"nbformat_minor":0,
"metadata":{
"colab":{ "name":"古典中国語生成AIモデルの系列ラベリングによるUPOS品詞付与" },
"kernelspec":{ "name":"python3" },
"accelerator": "GPU"
},
"cells":[
{
"cell_type":"markdown",
"metadata":{ "colab_type":"text" },
"source":[
"Xunzi-Qwen1.5-4Bによる系列ラベリング"
]
},
{
"cell_type":"code",
"metadata":{ "colab_type":"code" },
"source": [
"!pip install transformers accelerate\n",
"!test -d Xunzi-Qwen1.5-4B || env GIT_LFS_SKIP_SMUDGE=1 git clone --depth=1 https://www.modelscope.cn/Xunzillm4cc/Xunzi-Qwen1.5-4B.git\n",
"from transformers import AutoTokenizer,AutoModelForTokenClassification,TokenClassificationPipeline\n",
"from transformers.utils import cached_file\n",
"tkz=AutoTokenizer.from_pretrained(\"Xunzi-Qwen1.5-4B\")\n",
"mdl=AutoModelForTokenClassification.from_pretrained(\"KoichiYasuoka/Xunzi-Qwen1.5-4B-upos\",trust_remote_code=True,device_map=\"auto\")\n",
"class TCP(TokenClassificationPipeline):\n",
" def check_model_type(self,supported_models):\n",
" pass\n",
"nlp=TCP(model=mdl,tokenizer=tkz)\n",
"txt=\"四十而不惑\"\n",
"doc=nlp(txt)\n",
"for t in doc:\n",
" print(txt[t[\"start\"]:t[\"end\"]],t[\"entity\"])"
]
},
{
"cell_type":"markdown",
"metadata":{ "colab_type":"text" },
"source":[
"トークナイザを単文字化"
]
},
{
"cell_type":"code",
"metadata":{ "colab_type":"code" },
"source": [
"tkz=AutoTokenizer.from_pretrained(\"KoichiYasuoka/Xunzi-Qwen1.5-4B-upos\")\n",
"nlp=TCP(model=mdl,tokenizer=tkz)\n",
"txt=\"四十而不惑\"\n",
"doc=nlp(txt)\n",
"for t in doc:\n",
" print(txt[t[\"start\"]:t[\"end\"]],t[\"entity\"])"
]
},
{
"cell_type":"markdown",
"metadata":{ "colab_type":"text" },
"source":[
"出力部に逆方向のBellman-Fordを追加"
]
},
{
"cell_type":"code",
"metadata":{ "colab_type":"code" },
"source": [
"class BFP(TokenClassificationPipeline):\n",
" def __init__(self,**kwargs):\n",
" import numpy\n",
" super().__init__(**kwargs)\n",
" x=self.model.config.label2id\n",
" y=[k for k in x if not k.startswith(\"I-\")]\n",
" self.transition=numpy.full((len(x),len(x)),numpy.nan)\n",
" for k,v in x.items():\n",
" for j in [\"I-\"+k[2:]] if k.startswith(\"B-\") else [k]+y if k.startswith(\"I-\") else y:\n",
" self.transition[v,x[j]]=0\n",
" def check_model_type(self,supported_models):\n",
" pass\n",
" def postprocess(self,model_outputs,**kwargs):\n",
" import numpy\n",
" if \"logits\" not in model_outputs:\n",
" return self.postprocess(model_outputs[0],**kwargs)\n",
" m=model_outputs[\"logits\"][0].numpy()\n",
" e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))\n",
" z=e/e.sum(axis=-1,keepdims=True)\n",
" for i in range(m.shape[0]-1,0,-1):\n",
" m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)\n",
" k=[numpy.nanargmax(m[0])]\n",
" for i in range(1,m.shape[0]):\n",
" k.append(numpy.nanargmax(m[i]+self.transition[k[-1]]))\n",
" w=[{\"entity\":self.model.config.id2label[j],\"start\":s,\"end\":e,\"score\":z[i,j]} for i,((s,e),j) in enumerate(zip(model_outputs[\"offset_mapping\"][0].tolist(),k)) if s<e]\n",
" if \"aggregation_strategy\" in kwargs and kwargs[\"aggregation_strategy\"]!=\"none\":\n",
" for i,t in reversed(list(enumerate(w))):\n",
" p=t.pop(\"entity\")\n",
" if p.startswith(\"I-\"):\n",
" w[i-1][\"score\"]=min(w[i-1][\"score\"],t[\"score\"])\n",
" w[i-1][\"end\"]=w.pop(i)[\"end\"]\n",
" elif p.startswith(\"B-\"):\n",
" t[\"entity_group\"]=p[2:]\n",
" else:\n",
" t[\"entity_group\"]=p\n",
" for t in w:\n",
" t[\"text\"]=model_outputs[\"sentence\"][t[\"start\"]:t[\"end\"]]\n",
" return w\n",
"nlp=BFP(model=mdl,tokenizer=tkz)\n",
"txt=\"四十而不惑\"\n",
"doc=nlp(txt)\n",
"for t in doc:\n",
" print(txt[t[\"start\"]:t[\"end\"]],t[\"entity\"])"
]
}
]
}
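
For reference, here is a minimal standalone sketch of the decoding step that the BFP class above implements: BIO transitions that are not allowed are marked NaN (allowed ones 0), the logit matrix is relaxed from the last token backwards (the "backward Bellman-Ford" of the last markdown cell), and the best label path is then read off front to back. The helper names (build_transition, decode_bf) and the toy label set are illustrative assumptions, not part of the committed notebook.

import numpy

def build_transition(label2id):
    # label2id maps tags such as "B-NOUN", "I-NOUN", "VERB" to indices
    y = [k for k in label2id if not k.startswith("I-")]
    t = numpy.full((len(label2id), len(label2id)), numpy.nan)
    for k, v in label2id.items():
        # after "B-X" only "I-X" may follow; after "I-X" the same "I-X" or
        # any non-"I-" tag; after a plain tag any non-"I-" tag
        nxt = ["I-" + k[2:]] if k.startswith("B-") else [k] + y if k.startswith("I-") else y
        for j in nxt:
            t[v, label2id[j]] = 0
    return t

def decode_bf(logits, transition):
    # logits: (n_tokens, n_labels) float array; returns one label id per token
    m = logits.copy()
    for i in range(m.shape[0] - 1, 0, -1):   # backward relaxation
        m[i - 1] += numpy.nanmax(m[i] + transition, axis=1)
    path = [numpy.nanargmax(m[0])]
    for i in range(1, m.shape[0]):           # forward read-out of the best path
        path.append(numpy.nanargmax(m[i] + transition[path[-1]]))
    return path

# toy example: three labels, four tokens
label2id = {"B-NOUN": 0, "I-NOUN": 1, "VERB": 2}
logits = numpy.array([[2.0, 0.0, 1.0],
                      [0.0, 0.5, 2.0],
                      [1.5, 1.0, 0.0],
                      [0.0, 2.0, 0.5]])
ids = decode_bf(logits, build_transition(label2id))
id2label = {v: k for k, v in label2id.items()}
print([id2label[int(i)] for i in ids])   # -> ['VERB', 'VERB', 'B-NOUN', 'I-NOUN']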
