From ccc14e6fb535ed91e318faa9c2b116d2fa2b1088 Mon Sep 17 00:00:00 2001 From: Koichi Yasuoka Date: Thu, 4 Jul 2024 21:57:56 +0900 Subject: [PATCH] Xunzi-Qwen1.5-4B-upos --- demo/2024-07-05/lzh-qwen-upos.ipynb | 113 ++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 demo/2024-07-05/lzh-qwen-upos.ipynb diff --git a/demo/2024-07-05/lzh-qwen-upos.ipynb b/demo/2024-07-05/lzh-qwen-upos.ipynb new file mode 100644 index 0000000..2cdd3ed --- /dev/null +++ b/demo/2024-07-05/lzh-qwen-upos.ipynb @@ -0,0 +1,113 @@ +{ + "nbformat":4, + "nbformat_minor":0, + "metadata":{ + "colab":{ "name":"古典中国語生成AIモデルの系列ラベリングによるUPOS品詞付与" }, + "kernelspec":{ "name":"python3" }, + "accelerator": "GPU" + }, + "cells":[ + { + "cell_type":"markdown", + "metadata":{ "colab_type":"text" }, + "source":[ + "Xunzi-Qwen1.5-4Bによる系列ラベリング" + ] + }, + { + "cell_type":"code", + "metadata":{ "colab_type":"code" }, + "source": [ + "!pip install transformers accelerate\n", + "!test -d Xunzi-Qwen1.5-4B || env GIT_LFS_SKIP_SMUDGE=1 git clone --depth=1 https://www.modelscope.cn/Xunzillm4cc/Xunzi-Qwen1.5-4B.git\n", + "from transformers import AutoTokenizer,AutoModelForTokenClassification,TokenClassificationPipeline\n", + "from transformers.utils import cached_file\n", + "tkz=AutoTokenizer.from_pretrained(\"Xunzi-Qwen1.5-4B\")\n", + "mdl=AutoModelForTokenClassification.from_pretrained(\"KoichiYasuoka/Xunzi-Qwen1.5-4B-upos\",trust_remote_code=True,device_map=\"auto\")\n", + "class TCP(TokenClassificationPipeline):\n", + " def check_model_type(self,supported_models):\n", + " pass\n", + "nlp=TCP(model=mdl,tokenizer=tkz)\n", + "txt=\"四十而不惑\"\n", + "doc=nlp(txt)\n", + "for t in doc:\n", + " print(txt[t[\"start\"]:t[\"end\"]],t[\"entity\"])" + ] + }, + { + "cell_type":"markdown", + "metadata":{ "colab_type":"text" }, + "source":[ + "トークナイザを単文字化" + ] + }, + { + "cell_type":"code", + "metadata":{ "colab_type":"code" }, + "source": [ + "tkz=AutoTokenizer.from_pretrained(\"KoichiYasuoka/Xunzi-Qwen1.5-4B-upos\")\n", + "nlp=TCP(model=mdl,tokenizer=tkz)\n", + "txt=\"四十而不惑\"\n", + "doc=nlp(txt)\n", + "for t in doc:\n", + " print(txt[t[\"start\"]:t[\"end\"]],t[\"entity\"])" + ] + }, + { + "cell_type":"markdown", + "metadata":{ "colab_type":"text" }, + "source":[ + "出力部に逆方向のBellman-Fordを追加" + ] + }, + { + "cell_type":"code", + "metadata":{ "colab_type":"code" }, + "source": [ + "class BFP(TokenClassificationPipeline):\n", + " def __init__(self,**kwargs):\n", + " import numpy\n", + " super().__init__(**kwargs)\n", + " x=self.model.config.label2id\n", + " y=[k for k in x if not k.startswith(\"I-\")]\n", + " self.transition=numpy.full((len(x),len(x)),numpy.nan)\n", + " for k,v in x.items():\n", + " for j in [\"I-\"+k[2:]] if k.startswith(\"B-\") else [k]+y if k.startswith(\"I-\") else y:\n", + " self.transition[v,x[j]]=0\n", + " def check_model_type(self,supported_models):\n", + " pass\n", + " def postprocess(self,model_outputs,**kwargs):\n", + " import numpy\n", + " if \"logits\" not in model_outputs:\n", + " return self.postprocess(model_outputs[0],**kwargs)\n", + " m=model_outputs[\"logits\"][0].numpy()\n", + " e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))\n", + " z=e/e.sum(axis=-1,keepdims=True)\n", + " for i in range(m.shape[0]-1,0,-1):\n", + " m[i-1]+=numpy.nanmax(m[i]+self.transition,axis=1)\n", + " k=[numpy.nanargmax(m[0])]\n", + " for i in range(1,m.shape[0]):\n", + " k.append(numpy.nanargmax(m[i]+self.transition[k[-1]]))\n", + " w=[{\"entity\":self.model.config.id2label[j],\"start\":s,\"end\":e,\"score\":z[i,j]} for i,((s,e),j) in enumerate(zip(model_outputs[\"offset_mapping\"][0].tolist(),k)) if s