From 977defceaf5c8283bdc54eae1811ad8181b5266e Mon Sep 17 00:00:00 2001
From: QinLonghu <1097824882@qq.com>
Date: Sun, 24 Oct 2021 20:39:56 +0800
Subject: [PATCH 01/58] [DOC] rectify some attributes and classes

---
 EduNLP/Formula/Formula.py               | 2 +-
 EduNLP/SIF/tokenization/tokenization.py | 4 ----
 docs/source/api/sif.rst                 | 3 +--
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/EduNLP/Formula/Formula.py b/EduNLP/Formula/Formula.py
index 6eb80049..ad25325a 100644
--- a/EduNLP/Formula/Formula.py
+++ b/EduNLP/Formula/Formula.py
@@ -42,7 +42,7 @@ class Formula(object):
     >>> f.elements
     [{'id': 0, 'type': 'mathord', 'text': 'x', 'role': None, 'var': 0}]

-    Attributes 
+    Attributes
     ------------
     ast
         show all ast details
diff --git a/EduNLP/SIF/tokenization/tokenization.py b/EduNLP/SIF/tokenization/tokenization.py
index 650492c4..ae587706 100644
--- a/EduNLP/SIF/tokenization/tokenization.py
+++ b/EduNLP/SIF/tokenization/tokenization.py
@@ -37,10 +37,6 @@ class TokenList(object):
         show figure tokens
     ques_mark_tokens
         show question mark tokens
-    tag_tokens
-        show tag tokens
-    describe
-        show number of each elements
     """
     def __init__(self, segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None):
         self._tokens = []
diff --git a/docs/source/api/sif.rst b/docs/source/api/sif.rst
index 7467b7cb..8cf2da46 100644
--- a/docs/source/api/sif.rst
+++ b/docs/source/api/sif.rst
@@ -10,9 +10,8 @@ SIF
 Parser
 --------

-.. automodule:: EduNLP.SIF.parser.parser.Parser
+.. automodule:: EduNLP.SIF.parser.parser
     :members:
-    :imported-members:

 Segment
 ----------

From d4656e4c55ea0752db7a5cecac4e1a21e2f37761 Mon Sep 17 00:00:00 2001
From: QinLonghu <1097824882@qq.com>
Date: Sun, 24 Oct 2021 20:50:12 +0800
Subject: [PATCH 02/58] [DOC] rectify parser comments

---
 EduNLP/SIF/parser/parser.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py
index b9b1e269..d05f123e 100644
--- a/EduNLP/SIF/parser/parser.py
+++ b/EduNLP/SIF/parser/parser.py
@@ -3,18 +3,6 @@


 class Parser:
-    """
-    initial data and special variable
-
-    Attributes
-    ----------
-    get_token
-        Get different elements in the item.
-    txt_list
-        show txt list
-    description_list
-        use Parser to process and describe the txt
-    """
     def __init__(self, data, check_formula=True):
         self.lookahead = 0
         self.head = 0

From da47222b9424f5aa610bb75692e5c2e0fa8fd8b8 Mon Sep 17 00:00:00 2001
From: QinLonghu <1097824882@qq.com>
Date: Sun, 24 Oct 2021 20:59:36 +0800
Subject: [PATCH 03/58] [DOC] test

---
 EduNLP/SIF/parser/parser.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/EduNLP/SIF/parser/parser.py b/EduNLP/SIF/parser/parser.py
index d05f123e..33fa4e7d 100644
--- a/EduNLP/SIF/parser/parser.py
+++ b/EduNLP/SIF/parser/parser.py
@@ -3,6 +3,9 @@


 class Parser:
+    """
+    Parse the item to standard format.
+    """
     def __init__(self, data, check_formula=True):
         self.lookahead = 0
         self.head = 0

From 3047977e820726babe27c103eafae8aaa40438e8 Mon Sep 17 00:00:00 2001
From: QinLonghu <1097824882@qq.com>
Date: Sun, 24 Oct 2021 21:12:17 +0800
Subject: [PATCH 04/58] [DOC] test remote

---
 docs/source/api/sif.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/api/sif.rst b/docs/source/api/sif.rst
index 8cf2da46..02c5c427 100644
--- a/docs/source/api/sif.rst
+++ b/docs/source/api/sif.rst
@@ -10,7 +10,7 @@ SIF
 Parser
 --------

-.. automodule:: EduNLP.SIF.parser.parser
+.. 
automodule:: EduNLP.SIF.parser.parser.Parser :members: From 1b9e835dd0e295238acd80aff4b14f4423371ea7 Mon Sep 17 00:00:00 2001 From: KenelmQLH <46173279+KenelmQLH@users.noreply.github.com> Date: Sun, 24 Oct 2021 21:38:33 +0800 Subject: [PATCH 05/58] Update sif.rst --- docs/source/api/sif.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/api/sif.rst b/docs/source/api/sif.rst index 02c5c427..bd49430b 100644 --- a/docs/source/api/sif.rst +++ b/docs/source/api/sif.rst @@ -12,7 +12,7 @@ Parser -------- .. automodule:: EduNLP.SIF.parser.parser.Parser :members: - + :imported-members: Segment ---------- @@ -49,4 +49,4 @@ formula .. automodule:: EduNLP.SIF.tokenization.formula.linear_token :members: - :imported-members: \ No newline at end of file + :imported-members: From 11a34903339017439aa34a38ac4868978353c847 Mon Sep 17 00:00:00 2001 From: KenelmQLH <46173279+KenelmQLH@users.noreply.github.com> Date: Sun, 24 Oct 2021 23:15:02 +0800 Subject: [PATCH 06/58] Update sif.rst --- docs/source/api/sif.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api/sif.rst b/docs/source/api/sif.rst index bd49430b..265725a7 100644 --- a/docs/source/api/sif.rst +++ b/docs/source/api/sif.rst @@ -10,7 +10,7 @@ SIF Parser -------- -.. automodule:: EduNLP.SIF.parser.parser.Parser +.. automodule:: EduNLP.SIF.parser :members: :imported-members: From 68e388313b288fd0dd6f9db6d958be45692faaf9 Mon Sep 17 00:00:00 2001 From: QinLonghu <1097824882@qq.com> Date: Sat, 30 Oct 2021 10:45:09 +0800 Subject: [PATCH 07/58] test collection api --- EduNLP/SIF/segment/segment.py | 2 - examples/sif/sif.ipynb | 690 +++++++++++++++++----------------- 2 files changed, 345 insertions(+), 347 deletions(-) diff --git a/EduNLP/SIF/segment/segment.py b/EduNLP/SIF/segment/segment.py index 39e2f869..ab8941f0 100644 --- a/EduNLP/SIF/segment/segment.py +++ b/EduNLP/SIF/segment/segment.py @@ -112,8 +112,6 @@ class SegmentList(object): show question mark segments tag_segments show tag segments - describe - show number of each elements """ def __init__(self, item, figures: dict = None): self._segments = [] diff --git a/examples/sif/sif.ipynb b/examples/sif/sif.ipynb index 3758f3f8..2040dd2e 100644 --- a/examples/sif/sif.ipynb +++ b/examples/sif/sif.ipynb @@ -2,280 +2,277 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, "source": [ "# SIF4Sci 使用示例\n", "\n", "## 概述\n", "\n", "SIF4Sci 是一个提供试题切分和标注的模块。它可定制化的将文本切分为令牌(token)序列,为后续试题的向量化做准备。" - ], - "metadata": { - "collapsed": true, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "本文将以下面这道题目(来源自 LUNA 题库)为例,展示 SIF4Sci 的使用方法。 \n", "\n", "![Figure](../../asset/_static/item.png)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 符合 [SIF 格式](https://edunlp.readthedocs.io/en/docs_dev/tutorial/zh/sif.html) 的题目录入格式为:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, - "source": [ - "item = {\r\n", - " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\r\n", - " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\r\n", - "}\r\n", - "item[\"stem\"]" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - 
"output_type": "execute_result", "data": { "text/plain": [ "'如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\\\SIFChoice$$\\\\FigureID{1}$'" ] }, + "execution_count": 1, "metadata": {}, - "execution_count": 1 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "item = {\n", + " \"stem\": r\"如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形$ABC$的斜边$BC$, 直角边$AB$, $AC$.$\\bigtriangleup ABC$的三边所围成的区域记为$I$,黑色部分记为$II$, 其余部分记为$III$.在整个图形中随机取一点,此点取自$I,II,III$的概率分别记为$p_1,p_2,p_3$,则$\\SIFChoice$$\\FigureID{1}$\",\n", + " \"options\": [\"$p_1=p_2$\", \"$p_1=p_3$\", \"$p_2=p_3$\", \"$p_1=p_2+p_3$\"]\n", + "}\n", + "item[\"stem\"]" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 加载图片:`$\\\\FigureID{1}$`" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, - "source": [ - "from PIL import Image\r\n", - "img = Image.open(\"../../asset/_static/item_figure.png\")\r\n", - "figures = {\"1\": img}\r\n", - "img" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDcVqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7Hc3Nza
ivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZVXYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjweTV2F+9
tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUzi6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLRsdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmBqNBqtW
rfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8oXkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", "text/plain": [ "" ] }, + "execution_count": 2, "metadata": {}, - "execution_count": 2 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "from PIL import Image\n", + "img = Image.open(\"../../asset/_static/item_figure.png\")\n", + "figures = {\"1\": img}\n", + "img" + ] }, { "cell_type": "markdown", - "source": [ - "## 导入模块" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "## 导入模块" + ] }, { "cell_type": "code", "execution_count": 3, - "source": [ - "from EduNLP.SIF import sif4sci, is_sif, to_sif" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. 
`pip install python-Levenshtein`) to suppress this warning.\n", " warnings.warn(msg)\n" ] } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "from EduNLP.SIF import sif4sci, is_sif, to_sif" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 验证题目格式" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 4, - "source": [ - "is_sif(item['stem'])" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "True" ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 4 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "is_sif(item['stem'])" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 若发现题目因为公式没有包含在 `$$` 中而不符合 SIF 格式,则可以使用 `to_sif` 模块转成标准格式。示例如下:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 5, - "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", - "is_sif(text)" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "False" ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 5 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "is_sif(text)" + ] }, { "cell_type": "code", "execution_count": 6, - "source": [ - "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\r\n", - "to_sif(text)\r\n" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "'某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'" ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'\n", + "to_sif(text)\n" + ] }, { "cell_type": "markdown", - "source": [ - "## 题目切分及令牌化\n", - "\n", - "现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的预训练,例如:切分和令牌化。" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "## 题目切分及令牌化\n", + "\n", + "现在我们得到了符合标准格式的题目文本,接下来可以对题目做进一步的预训练,例如:切分和令牌化。" + ] }, { "cell_type": "markdown", - "source": [ - "### 题目切分" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "### 题目切分" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 基本切分\n", "分离文本、公式、图片和特殊符号。" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 7, - "source": [ - "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\r\n", - "segments" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形', 'ABC', '的斜边', 'BC', ', 直角边', 'AB', ', ', 'AC', '.', '\\\\bigtriangleup ABC', '的三边所围成的区域记为', 'I', ',黑色部分记为', 'II', ', 其余部分记为', 'III', '.在整个图形中随机取一点,此点取自', 'I,II,III', '的概率分别记为', 'p_1,p_2,p_3', ',则', '\\\\SIFChoice', \\FigureID{1}]" ] }, + "execution_count": 7, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments = sif4sci(item[\"stem\"], figures=figures, tokenization=False)\n", + "segments" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 文本部分" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 8, - "source": [ - "segments.text_segments" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图来自古希腊数学家希波克拉底所研究的几何图形.此图由三个半圆构成,三个半圆的直径分别为直角三角形',\n", @@ -291,28 +288,28 @@ " ',则']" ] }, + "execution_count": 8, 
"metadata": {}, - "execution_count": 8 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments.text_segments" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 公式部分" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 9, - "source": [ - "segments.formula_segments\r\n" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['ABC',\n", @@ -327,182 +324,187 @@ " 'p_1,p_2,p_3']" ] }, + "execution_count": 9, "metadata": {}, - "execution_count": 9 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments.formula_segments\n" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 图片部分" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 10, - "source": [ - "segments.figure_segments" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[\\FigureID{1}]" ] }, + "execution_count": 10, "metadata": {}, - "execution_count": 10 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments.figure_segments" + ] }, { "cell_type": "code", "execution_count": 11, - "source": [ - "segments.figure_segments[0].figure" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAOYAAACICAYAAADzlJeRAAAe3klEQVR4nO2df1RT9/3/nyE/SWJyBdSUOgs72mKP1XhK24j2CGon4FagbtTMtpZt6rDzR7rjKt+2C+rnHGxdBbr2xB9zgc4j1NMJbpXg2hlsp2DnjrSHVaqcgquWbFqNLUgQkvf3DwgiSSA/7uUm8f04530O3Ny87zu5ed7X+8fr/XoJCCEEFAolrIjhuwEUCsUTKkwKJQyhwqRQwhAR3w2IBJqamuBwOHy+rtFokJKSMo4tokQ7VJgAGhoa0NzcDLvdjn//+99oa2uDw+FAa2srAIBhmFHf73K58O233wIAYmNj8cADDwAAZs+ejblz50Kr1UKn00Emk3H6OSjRg+Bum5Vtbm5Gc3MzrFYrPvnkE7S2toJhGPT09KC3t5f160mlUsTGxqKrqwsMw2D69OlYunQpdDodMjMzWb8eJTqIemHa7XbU19fjT3/6E/72t79BoVDA6XSiq6uL13YxDIOuri48/vjjyM/PR2ZmJpKSknhtEyV8iEph2mw21NbW4tChQzh16hSkUulQVzMckcvliImJgUwmQ3Z2NgwGA7RaLd/NovBIVAmzuroar732Gs6fPw8AuHnzJs8tCg61Wg2FQoFXX30VK1asGHOMS4k+Il6YNpsNf/jDH7Bz504ACGvLGChyuRxOpxOLFy/G+vXr6Zj0LiJihdnU1IQ33ngDdXV1cLlcoy5nRAMMw0AkEmHnzp14/vnn+W4OhWMiTpjNzc0oLCxEa2sr7HY7380Zd5RKJWQyGRVolBMxwrTZbFi/fj0++OAD3Lhxg+/m8A4VaHQT9sK02+347W9/i8rKyqgaP7KFUqmESqXCn//8Z+h0Or6bQ2GJsPaV3bdvH5KTk7Fv3z4qSh90dXXh66+/RlZWFjZs2HBXdu+jEhKGdHZ2Ep1OR9RqNQFAi59FJpORSZMmEYvFwuLdqCNlZRdYrI/iD2FnMfft24dZs2ahqamJjiUDxOFw4MqVK9Dr9XjqqadYsJ5tKJ+fjUNsNI4SGHw/Gdxcv36d5OXlUSvJUhGLxWTSpEmksbEx6HtyoSyNACBr6li80RS/CAuL2dDQgPvvvx/vv/8+tZIs0dfXhytXruBHP/oRSkpKAq+grRw78QrK0thvG2VseJ+VLS0txbZt2+ikBYeoVCrMmzcP1dXVfrr3WbB2/nlsPnk/dgr+Dw9eOImN07lrX0NDA4Db+17/+c9/4uuvv/Z67uzZs5GcnAwASEpKuqNEFXya65/97GeEYRjeu313QxGLxSQ5OZmcO3duzPtSt2YNqSOEkAtlJA2Df7NAT08PsVqtxGg0Ep1OR2JjYwkAwjAMYRiGiESigD6TUqkkDMMQpVJJABCtVkuys7OJ0WgMqQsfDvAizOvXr5NHH3106MbQMn4lLi6OHDlyxOe9cY8rb5fQhHnu3Dny8ssvk+TkZCISicbtQcwwDImNjSUZGRnEZDKR9vb2ED7F+DPuwmxvbycpKSlEIpHw/iO9W8uUKVPIO++843lzLpSRNcOXRurWEKSVkUAXSxobG0lhYSFJSEggKpUqYEvIdpHL5USpVJLExERiMpnI9evXQ/kJjwvjKsyzZ8+SxMRE3n+YtIBMnDiRvP3227dvzoUykjZChBfK0vwWZk9PDzGbzSQxMTGshydyuZwoFAqSnZ3N8novu4ybMM+ePUs0Gg3vN4aW2yUuLm5InAPH0siAwbxAytKGn+u7O9vZ2Uk2bdpEJkyYMDTWi5TCMAxJSEggZrN5nFTgP+MizPb2dmopw7TExcWRjRs3BnxPOzs7ycqVK4lcLue9qxpqUSqVYSdQzoXZ3t5OkpKSeP/yafFdGIbxW5w9PT3EaDSS+Ph4IhaLeW87m8Ut0JqaGk414Q+cCpOKMnIKwzBk165do97PqqoqotFoiEKh4L29XBa1Wk10Oh2vM7mcCfP69eskJSWF9y+ZFv/LlClTvC6luDcVqFQq3ts4noVhGGI0GklPTw9XMvEJJ54/DocDjzzyCFpaWtiuOuqRy+WQSCQecW6Tk5OhVqvvODcnJwcA8NFHH+G///0vrl27ht7eXvT19QUdnnPKlCloaGgYiixfX1+PZ555Bt98802QnyiyUSgUiI+Px5EjR8Y1ciEnwnzqqafw/vvvo6+vj+2qowaGYYYiuLtFl5OTM5RuQavVshodr6GhAXa7
Hc3Nzaivr4fNZsNXX30FAIiJiUF/f//QuSqVCrt27cJf/vIXfPTRR9RdEkB8fDxKSkqwevXqcbke68Kkvq+eiEQiKJVKdHV1ITU1FUuXLkV6enpY5Dxxi7WjowN//etfceLECXz33XdwuVx3iJUyEFZ00aJFOHjwIPfpLtjsFzc2NpKEhATexwZ8F7frmUgkIjqdjpSUlESU7+bhw4eJRqMharWayOVyIpVKSUxMDO/fazgUsVhMZsyY4ZfPcSiwJszOzk4ybdo03r84vopMJiNKpZJotdqIE+JwTCYTSU9PJ+3t7aSqqoqsWLGCnDt3jphMJpKZmUkkEknULZMEU6ZMmULOnj3L2X1gRZg9PT0kNTWV9y+Lj6JWq0liYiLZvn17xDlKD6enp4esWLGC/PKXvyQ9PT3k448/Jh9//DHZtGkTKS0tveM8s9k8tDtEKBTyfg/4KhqNhrMHMCvCNBqNUb+2NVKMKpWKPPfcc5w+NceLc+fOEa1WS6qqqoYE6S49PT0kMzOTWK1Wj/e1t7eT7du3k8TERCKXy3m/L3yUhISEUXfrBEvIwmxvbw9rp2U2i1KpJNnZ2WHhGcIWVVVVRKfTkXPnznmI0l06OzuJVqslnZ2dPus5e/Ysee6554hSqSRSqZT3ezWexdf6byiEPCs7b948NDU1hVJFWCMSiSCXy5Gbm4utW7dGzU55h8OBoqIi2Gw2mEymMdecRSIRioqKYLFYRp2RdDgceOutt1BSUoLu7m5Oco6GI1OmTMHevXvx5JNPslJfSMLct28fNm/eHJVxemQyGWQyGX7yk59g27Zt0Gg0fDeJNWw2G/R6PXJycpCamur3+86cOYOLFy+itLR0zHMdDgcqKirw6quvoru7Gz09PaE0OSKIj4/Hhx9+yIojQtDCtNlsmDt3Lmw2W8iNCCcUCgViYmKwbt06bNmyJepS4NXX18NgMMBsNge1Trl//34sXLgwoLQMFRUV2Lx5M3p6etDd3R3wNSOJpKQkWK3WkHtWQUfJW7duXVS5aYnFYjAMg+LiYvzvf//Djh07ok6UxcXFKC8vR2NjY9DOAyaTCZWVlQENX55//nlcuXIFBw4cQHJyMmJjY4O6diTQ0dGBH/zgByFnnwvKYjY3NyM9PT1qurAMw2DZsmV46623ok6MwIB3T15eHhYuXIglS5aEXN/UqVORl5cHq9Ua1Pf1u9/9Djt27IDdbofT6Qy5PeGGRCKBVqvF6dOng64jKGEuWrQIVqs16IuGCxMmTMC9994Ls9kctQl5mpqaUFBQAJPJBJFIxFq9/f392Lp1a9C/A5vNhnXr1uH48eNR84AfjkqlwpYtW1BUVBTU+wMWZjRYS7FYDIVCgW3btmH9+vV8N4czysrKcOTIEVRVVaGtrY31+v/xj3/gxo0bwQWUHqShoQErV67E9evXo26CKJTJoIDHmC+++GJEi5JhGOTn56O9vT1qRWm326HX63Hx4kVYLBZORAkACxYsQGtrK2pra4OuIz09HZcvX8ZvfvMbxMfHs9e4MOCbb75BTk5OUOPNgCxmJFtLsViMyZMn47333ovabisAtLa2oqCgABs3bsTUqVM5v96sWbOQlZUFs9kc8k4Zm82GvLw8fPbZZ7h58yZLLeQXsViM/Px8HDhwIKD3BWQxI9VaMgyDRYsWoaWlJapFWVFRAb1eD7PZPC6iBICWlhaYzWbo9fqQZyI1Gg0aGxuxefPmqJmE6+vrw9GjR1FfXx/Q+/y2mE1NTcjKyoq4fZZxcXF45ZVXYDAY+G4KZzgcDhgMBjgcDphMJpw5c2bc23D16lVUVlaipqaGlfqamprw5JNP4sqVK6zUxzfJycn48ssv/T7fb4tpMpkiSpRisRjTpk3DyZMno1qUHR0dyMjIwJw5c/Dzn/+cF1ECQEJCAlJSUrBjxw5W6tPpdDh//jx0Oh3kcjkrdfLJ1atX8fvf/97v8/0SpsPhwKFDkZO+dOLEifjhD3+IL774gvcIAVxSW1uLvLw8mEwmzJo1i+/mYNmyZTh27NhQ9q5QYRgmarq23333HYqLi/3u7vslzOrqalbXwLgkISEB5eXlOHz4MPfhH3ikqKgIlZWVsFqtQQfe4oKamhoYDAZ0dHSwVmdxcTEsFgsSExNZq5MPent7/e5R+DXGnDlzJlpbW0NuGNdoNBpUVVUhPT2d76ZwhtsBfenSpViwYAHfzfGKvztRAqWjowNZWVkR8Vv0hUqlwhdffDHmpogxLWZra6vPJKLhRGJiIiwWS1SL0j0BZzQaw1aUwIBX0KpVq1BYWMhqvUlJSWhsbERqamrE9OBG4nQ6/Vo6GVOYe/bsCfs1pXvvvRcnT54c17if482OHTtgMBhgsVgi4kc5ffp0yGQy7N69m9V6GYbBxx9/jLS0tIh0hu/u7kZJScmYY80xhXngwIGwDWMokUiQkpKClpaWqNnAPBK3A/qNGzewc+dOzrx4uKC0tBTvvvsu6xvpZTIZTpw4gSeeeMIjCHYk4HQ6x1zXHHWM2dHRgYceeiisJhfcyOVyzJkzB3V1dRE/Y+eL5uZmFBQUwGg0IiEhge/mBMXUqVOh1+tRU1PDyWbzTZs2oaKiIuIcX8Za1xzVYtbX18PlcrHeqFCRSCSYM2cOjh8/HrWi3L17NwoLC1FTUxOxogSAS5cuoaSkBAUFBSF7BnmjrKwMRqMREydOZL1uLrl27Rqam5t9vj6qMA8dOhSW48v7778fx48fj8rlEIfDgYKCAnz66aewWq24dOkS300KGZFIhKVLlwa9BWosDAYDnnvuuYjq1nZ3d6OystLn6z67sg6HAxMnTuTkKRcKKSkpaGxsjEpL6XZAX7t2LaZPn853c1jn7bffRk5ODlasWMFJ/Xq9HrW1tWH3m/VFQkKCT5dDnxazqakp7CzStGnTgt41H+7U1tYObWiORlECgNlsxmuvvcbZOmRVVRUeffTRiJi1BgaWlXxNjPkU5ocffhhWkz6JiYk4ceJEVEWrc2MwGFBZWQmLxRJW3znbnDlzBjU1NdDr9Zz5XR87dgyPPPIIxGIxJ/WzSVdXF959912vr/nsyoZT+BCNRgOLxRJ165Tu/YdPP/10QGEkIx22d6KMxG63Y968eRHhIeSrO+vTYv7rX//itEH+kpCQgJqamqgTZUNDAzIyMlBaWnpXiRIYuKdz5sxBcXExJ/UzDAOr1Ypp06ZxUj+bOBwOr37FXoVpt9vDIoK2SqXCiy++GHWbm4uLi7F169aQwkhGOkuWLMHp06cD3kDsLxqNBseOHUNcXBwn9bOJt904XoXZ3NzMu7uTWCyGVqvlbIqdD+x2O7KysgAA27dvHzMtQbRTVVWFoqIiVneiDMe9P1SlUnFSPxt0dXXhk08+8TjuU5h8RyybPHkyjhw5wmsb2KSpqQkZGRnYuHEjK7Fdo4GWlhZUVVUhLy+PsyWO1atXY/HixWE9U+ut1+BVmGfPnuW1K5uQkID33nsvapZFdu/ejaKiItTU1ECpVPLdnLDi6tWreOmll1BQUMDZNQ4ePBjWezm/+uorjwe
TV2F+9tln49Igb0TTuNLhcECv1+PTTz+FxWKJCi8eLpg6dSo0Gg3Kyso4qV8mk4X1eFOpVHq453kVJl8/ILFYjMWLF0fFuLK1tRXz5s1DTk4OVq5cyVssnkihpKSE1bAkIwnn8abL5fIYZ/ucleWD+Ph4HDx4kJdrs0l1dTX0ej2qqqrGLYxkpHPmzBmYzWYYDAbOMsi5x5tCoZCT+oOlt7fX4zPfIUzi6se3Xd9BoIoF1ApgHD3yGIaB2WwOOzfAQHA4HCgsLMSRI0fQ2NiIq1ev8t2kiKKtrQ0mk4mVGLW++OMf/xh2Xdre3l6cP3/+jmN3CFMQI8K1K/+DTCwGbnYD4+QLLBaL8dhjjyEzM3N8LsgB7ng0c+bMwQsvvEC7rkHS39+PnJwczoYzDMPgjTfeCLsu7UgvJY+ubMfFi4iJCTptZlCoVCpUVFSM6zXZpL6+Hnl5eSgpKQmLMJKRTmpqKux2O2e/iWeffRbf//73Oak7WDo7O+/432Nxx27/dvAvMaCWDPzpugV81wfIZIBUCPQOWlOxGJAPntM7zMIOPw4ncGOY6R3x2gSXGEaj0dM5nThxy+kCAQBBDCRCIQQjj0MAsUiEGBD09/fDCcHA+MHZDycAxIggFbhunx8jgjRGENAXNhbFxcU4ffo0rFbrXe8wwCYmkwlZWVlISUnhZIbebDZj8eLFuHbtGut1B8PIYY+HabTbrw9GLegDbt4C4BwQJQA4nED/rUEBigfGoDe6gRu3AKkMEA8elwsHusI3uoF+IaBwe/p7vjbpnslesm4R9DsJYoRiSEUiCOGCk4w8LoY4hqDPRQAIIBLGQAACp9OJGJEYUmEMBK5+9LoA8dD/TrAVj8FutyMjIwMA8PLLL1NRsox7MqiwsJCTyUitVouVK1eG7S6U0fusfX1AL24LSyYEnH23/46RDEwSqSUAhIBw8LjLCQyehlu3AJHw9nuGvaaWSlFd9a6nWIgTzhghRAJgQHTigb+JE06BAMJBoxcjGCk2AYRC0bAPJYAwZtDSDuIKOE2vJ01NTZg3bx6MRiP14uGQS5cuobS0FHl5eZzU//rrr2PChAmc1B0qvoXpFpbDOSAsMQCh884Jof5bgxZzsLhfixllOnrwNZFIhPzlyzFn9hzv5xF3d3XkceL9+DhRVlYGg8EAq9Ua1m5e0QKXYUlkMhn2798flt5YQ8J0ufrQ6yIDyxXEOeyUQaspV9y2lsCgYCXDllQGu7YOJwDh7eNCIdDvvP2ewddUKhXeLC8FiRF6eToIICDu7isAkAFLJxBCCIL+IbNHfLyffdxhJC9evBhxYSQjHTYS5PoiNzc3LCfshn7TAggAVz/iJ09GTP+ICRKHE8AIa4k+oNcJSBWD3Vnh4OuDY1P3cbET6O67/Z6bt6BQxWHz/9sCgUQGydBkDEF/fx/6BwUoEQrgdPaht78Pvf1OYKhbGwO4+geOuwSD7yfod7pAQOB09sM15v+B0dzcjKysLKxatQrLly8P8N0UNuAyLInJZAo7q+kRwaChoQF5eXkjBtxiQNbH2rpmXFwcLl++HBHOBBUVFdizZw/MZjN1GOCZhIQE6PV6NDY2sv7b4Ttix8hIBv71AmVCwDn2af6gUChQVFQU9qJ0e/GcOHECVquVijIMuHr1KoxGI/R6Pet179q1i1erOdJ100OYdwhGMdgdheP2ZFCISKVS/OpXv2KnMo4Il2SwFE/YTpDrRqvVYtKkSazWGQgjDZWHMDUaze1wF90jZltDJBKsZbglg6V4wnaCXDe7du3iLXv1mBYzKSmJsxCK4W4twzUZLMUTLhLk5ubmYsqUKazVFwgjrbXXMSYXDr7hbC1tNhvmzZsHtVqNX//619SLJwJoaWmByWRiPScKH1ZTLpdj9uzZdxzzKsz4+HjWLy4UCvHMM8+wXm+oNDQ0ICsrC6WlpWGdDJbiCRcJcnNzczF58mTW6vMHiUTiEZ7VqzC5MOcPP/xw2EVR37FjB7Zu3QqLxXLXhpGMdLhIkPvSSy+Nq1dXT08PUlJS7jjmVZhs77pXq9XYsGEDq3WGwvBksNu3b6dePBEO2wlyc3NzIZFIxj6RJaRSqUfgOa/CfOCBB1h9YgiFwrDZBN3c3IyMjAysWrUKy5Yt47s5FBZgOyyJRqPBY489xkLL/OPBBx/0OOZVmAsWLGBtsVUkEmH58uVhMekTLclgKZ6wnSB3w4YNkEqlLLRsdEQiERYvXuxx3KswdTodawGfFQoF1q1bx0pdwRKNyWApnrC5EyUzM3NcjIlSqfS6ddCrMBmGYW1m9p577uE1IZA7jOTChQtpGMm7gNTUVNhsNlRXV4dUj0wmQ05ODkut8o3D4fAaocGnrywbYpLL5di4cWPI9QRLdXU1CgoKUFVVFbXJYCmesLUTpbCwkHOrqdVqvV7DpzCfeOKJkPvYTqeTs7Teo+FwOGAwGHDkyBFYLBbqgH6XwVaCXJ1Ox2mEA6lUiqefftrraz6FqdPpQs74NXPmzHHPP2Kz2ZCVlYX77rsPL7zwAvXiuUu5dOkSjEZjyDlR1qxZw1KLPJFKpT5XK0btyjqdwe/1EolE474c4U4GW1JSctclg6V4wkaC3CVLlnAWsCs+Pt7DscCNT2HKZDI8/vjjQV/U12wTV9BksBRvhJogV6fTcZJSYazVilE3Sj/77LNBO7Q7nc5xydhFk8FSxiKUBLkymQxz585lv1EAfvGLX/h8bVRhZmZmBt2dTU1N5XxGyx1GkiaDpYxGqAly8/PzIRCwGyh84cKFo86/jCpMhmHw0EMPBXxRqVSKJ598MuD3BUJZWRmKiopgtVrDLpASJfwIJUFueno6q+NMhmG8BDm/kzFj/qxatSrg/WmjzTaFit1uh16vx8WLF2GxWKgDOsVvgk2Qy7aDzMSJE8fUx5jCzM3NDfjCEonE52xTKLS2tiIrKws5OTlYvnw59eKhBEywCXLZGmeq1Wrs2rVrzPPGFGYwnvaPPvpoQOf7gzsZrNlspslgKUET7E6UH//4x6xcPy4uzi9j51f4yg0bNkCtVvt1YZFIFNIyy0hoMlgK2wSTIHfJkiUhp6f011oCXgI+++Lee+/F119/7dfFKyoqguoCj6SjowMFBQV4+umnacQ6CuucOXMGFy9eRGlp6ZjnOhwOyOVy+CkXryQnJ+PLL7/061y/HwEGgwEKhWLM8wghrIwvaTJYCtcEkiBXJpOFtPynVqtx+PBhv8/3W5ijLYYO5+bNmyELs6ioCOXl5bBardSLh8IpJpMJlZWVfoUlCTZmlUgkQn5+fkCzu34Lk2EY/PSnPx0z5Mj3vvc9vy8+EpvNhoyMDKjVapoMljIuBJIgN9iZWZVKhTfffDOg9wQ0ml23bt2Y3dmZM2cG1AA3TU1NyMrKgtFopGEkKeOKvwlytVptwB5AarUa+/fvD7gbHJAwtVot8vPzfVrNYGdk3clgLRYLTQZL4QV/wpI88sgjAdUpFouxaNGioCZCA57/3bZtm09PIIVCEdD4kiaDpYQTYyXITUlJCchi3nPPPTh48GBQbQlYmB
qNBqtWrfJqmgOZkaXJYCnhyGhhSZKSkvxeLomLi8OxY8eCnskNasX09ddf92o1XS6XXw2pqKhAYWEhzGYzDSNJCStaWlpgNpt9Oh/4EwharVZjx44dIa1OBCVMmUyGoqIij4kgh8Mx6pSyO4wkTQZLCWdGS5A7Vhws97hy9erVIbXBb8+fkTgcDsyePRsXLly447iv6lpbW1FQUIC1a9fSiHWUiODo0aNQq9XYsmXL0LH4+Hhcu3bN6/kSiQQPP/wwjh8/HvJe5KCd/2QyGd555507Nnv6inZQW1uLgoICmEwmKkpKxOAtQe5oET1mz57NiiiBEIQJDMRDefbZZ4ca4q3/bTAYUFlZCYvFQpPBUiKOkQlyfW2YTklJwQcffMBa1I7Q3OUxMBHk9vYZvh3LnQz2vvvuo8lgKRHLyAS53nZZpaSkoLGxkdVQrSELUyaT4dChQ3eY+OHJYGkYSUqkMzxB7vAJT3dAALZFCbAgTGDAIygtLQ0zZsygyWApUYk7Qa5UKoVQKERsbCy0Wi0nogQCEKZlrQACwYgyvxxuX50lS5bAarWitbUVf//736kXDyXqKC0txX/+8x/I5XLk5OTg9OnTnGUaCGi5pK18PmZ8/grIniwAFqwVZGPvmjqQPVk4cOAAtmzZgsuXL7PeSK1Wy8oXkJ6ePvR3RUVFUHFGKRSRSIS+vj5OrxGAMNtQPn8GPn+FYM9AfGVY1gqQjbpBoVIo0U9HRwc6OjrueMgDACxrIcjeO/TvmjqCPViLtdiDYOTh/xiz7SgOnVqD3MGLtJXPR/beNJRtpqKk3D0kJSWNEGUbyucLIMgG6ggBGSy5tQIIslvw4IzgruO3MNuOHsIp7EX24PhyFSpByElspP4ClLuWgV7kJpThAtmD4SYqa08d1mAW7g9SH34Ksw1HD53CmrrBJ8KFMmDTDMwvpxM8lLuXtvJV2HQqDWWVG+Gpvxl4sCwXwfYn/RtjtpVj/ozP8cqwp4JlrQDZe9egbsSTgkK5Oxi0lrO4mWPxy2K2HT2EU2uGq78N51sApD2IILvQFEqEcwGfnwLSgh1EjoEfcTwGu7Gv3JalZe0MbDoFrKnzZsIplLuAtvPg0sl0TIspEAyIcG/2bceC7L1pKLtAgpoGplCigun3YxaAU59f8Pqypfy2800wBL0fk0K56xlcu0wru4CTQ8sTnuv9wUCFSaGEQls55s/YhFNDB9JQdiH0ZUQqTAolDGFldwmFQmEXKkwKJQz5/6GFNae7ZI7lAAAAAElFTkSuQmCC", "text/plain": [ "" ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments.figure_segments[0].figure" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 特殊符号" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 12, - "source": [ - "segments.ques_mark_segments" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['\\\\SIFChoice']" ] }, + "execution_count": 12, "metadata": {}, - "execution_count": 12 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "segments.ques_mark_segments" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 标记化切分 \n", "如果您不注重题目文本和公式的具体内容,仅仅是对题目的整体(或部分)构成感兴趣,那么可以通过修改 `symbol` 参数来将不同的成分转化成特定标记,方便您的研究。" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ " - symbol:\n", " - \"t\": text\n", " - \"f\": formula\n", " - \"g\": figure\n", " - \"m\": question mark" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 13, - "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 13, "metadata": {}, - "execution_count": 13 + "output_type": "execute_result" } ], + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=False, symbol=\"tfgm\")" + ] + }, + { + "cell_type": "markdown", "metadata": { "collapsed": false, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### 令牌化\n", "\n", "为了方便后续向量化表征试题,本模块提供题目文本的令牌化解析(Tokenization),即将题目转换成令牌序列。 \n", "\n", "根据构成题目的元素类型,解析功能分为 **“文本解析”** 和 **“公式解析”** 两部分。更具体的过程解析参见 [令牌化](../Tokenizer/tokenizer.ipynb)。" - ], - "metadata": { - "collapsed": false, - "pycharm": 
{ - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": 14, - "source": [ - "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" - ], - "outputs": [], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "tokens = sif4sci(item[\"stem\"], figures=figures, tokenization=True)" + ] }, { "cell_type": "markdown", - "source": [ - "- 文本解析结果" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "- 文本解析结果" + ] }, { "cell_type": "code", "execution_count": 15, - "source": [ - "tokens.text_tokens" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图',\n", @@ -540,38 +542,38 @@ " '记']" ] }, + "execution_count": 15, "metadata": {}, - "execution_count": 15 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "tokens.text_tokens" + ] }, { "cell_type": "markdown", - "source": [ - "#### 公式解析结果" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "#### 公式解析结果" + ] }, { "cell_type": "code", "execution_count": 16, - "source": [ - "tokens.formula_tokens" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['ABC',\n", @@ -601,49 +603,40 @@ " '3']" ] }, + "execution_count": 16, "metadata": {}, - "execution_count": 16 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "tokens.formula_tokens" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 自定义参数,得到定制化解析结果" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "(1)如果您想按 latex 语法标记拆分公式的各个部分,并得到顺序序列结果,输出方法(`method`)可以选择:`linear`" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 17, - "source": [ - "sif4sci(\r\n", - " item[\"stem\"],\r\n", - " figures=figures,\r\n", - " tokenization=True,\r\n", - " tokenization_params={\r\n", - " \"formula_params\": {\r\n", - " \"method\": \"linear\",\r\n", - " }\r\n", - " }\r\n", - ").formula_tokens" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['ABC',\n", @@ -673,44 +666,44 @@ " '3']" ] }, + "execution_count": 17, "metadata": {}, - "execution_count": 17 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\": {\n", + " \"method\": \"linear\",\n", + " }\n", + " }\n", + ").formula_tokens" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "(2) 如果您想得到公式解析出的语法分析树序列,输出方法可以选择:`ast`\n", "> 抽象语法分析树,简称语法树(Syntax tree),是源代码语法结构的一种抽象表示。它以树状的形式表现编程语言的语法结构,树上的每个节点都表示源代码中的一种结构。 \n", "> 因此,ast 可以看做是公式的语法结构表征。" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 18, - "source": [ - "sif4sci(\r\n", - " item[\"stem\"],\r\n", - " figures=figures,\r\n", - " tokenization=True,\r\n", - " tokenization_params={\r\n", - " \"formula_params\":{\r\n", - " \"method\": \"ast\",\r\n", - " }\r\n", - " }\r\n", - ").formula_tokens\r\n" - ], + "metadata": { + 
"collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[,\n", @@ -725,46 +718,37 @@ " ]" ] }, + "execution_count": 18, "metadata": {}, - "execution_count": 18 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " }\n", + " }\n", + ").formula_tokens\n" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "- 语法树展示:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 19, - "source": [ - "f = sif4sci(\r\n", - " item[\"stem\"],\r\n", - " figures=figures,\r\n", - " tokenization=True,\r\n", - " tokenization_params={\r\n", - " \"formula_params\":{\r\n", - " \"method\": \"ast\",\r\n", - " \"return_type\": \"ast\",\r\n", - " \"ord2token\": True,\r\n", - " \"var_numbering\": True,\r\n", - " }\r\n", - " }\r\n", - ").formula_tokens\r\n", - "f\r\n" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "[,\n", @@ -779,53 +763,60 @@ " ]" ] }, + "execution_count": 19, "metadata": {}, - "execution_count": 19 + "output_type": "execute_result" } ], - "metadata": {} + "source": [ + "f = sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"return_type\": \"ast\",\n", + " \"ord2token\": True,\n", + " \"var_numbering\": True,\n", + " }\n", + " }\n", + ").formula_tokens\n", + "f\n" + ] }, { "cell_type": "code", "execution_count": 20, - "source": [ - "# for i in range(0, len(f)):\r\n", - "# ForestPlotter().export(\r\n", - "# f[i], root_list=[node for node in f[i]],\r\n", - "# )\r\n", - "# plt.show()\r\n" - ], + "metadata": {}, "outputs": [], - "metadata": {} + "source": [ + "# for i in range(0, len(f)):\n", + "# ForestPlotter().export(\n", + "# f[i], root_list=[node for node in f[i]],\n", + "# )\n", + "# plt.show()\n" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "(3)如果您只是关心公式的结构和类型,并不关心变量具体是什么,比如二元二次方程 `x^2 + y = 1` ,它从公式结构和类型上来说,和 `w^2 + z = 1` 没有区别。 \n", "此时,您可以设置如下参数:`ord2token = True`,将公式变量名转换成 token" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 21, - "source": [ - "sif4sci(\r\n", - " item[\"stem\"],\r\n", - " figures=figures,\r\n", - " tokenization=True,\r\n", - " tokenization_params={\r\n", - " \"formula_params\":{\r\n", - " \"method\": \"ast\",\r\n", - " \"return_type\": \"list\",\r\n", - " \"ord2token\": True,\r\n", - " }\r\n", - " }\r\n", - ").formula_tokens" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['mathord',\n", @@ -868,45 +859,44 @@ " '\\\\supsub']" ] }, + "execution_count": 21, "metadata": {}, - "execution_count": 21 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"return_type\": \"list\",\n", + " \"ord2token\": True,\n", + " }\n", + " }\n", + ").formula_tokens" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "(4) 
如果您除了 (3) 中提供的功能之外,还需要区分不同的变量。此时可以另外设置参数:`var_numbering=True`" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 22, - "source": [ - "sif4sci(\r\n", - " item[\"stem\"],\r\n", - " figures=figures,\r\n", - " tokenization=True,\r\n", - " tokenization_params={\r\n", - " \"formula_params\":{\r\n", - " \"method\": \"ast\",\r\n", - " \"ord2token\": True,\r\n", - " \"return_type\": \"list\",\r\n", - " \"var_numbering\": True\r\n", - " }\r\n", - " }\r\n", - ").formula_tokens" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['mathord_0',\n", @@ -949,77 +939,87 @@ " '\\\\supsub']" ] }, + "execution_count": 22, "metadata": {}, - "execution_count": 22 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "sif4sci(\n", + " item[\"stem\"],\n", + " figures=figures,\n", + " tokenization=True,\n", + " tokenization_params={\n", + " \"formula_params\":{\n", + " \"method\": \"ast\",\n", + " \"ord2token\": True,\n", + " \"return_type\": \"list\",\n", + " \"var_numbering\": True\n", + " }\n", + " }\n", + ").formula_tokens" + ] }, { "cell_type": "markdown", - "source": [ - "## 综合训练\n", - "\n", - "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "## 综合训练\n", + "\n", + "综合上述方法,将题目转换成令牌序列,为后续向量化做准备。" + ] }, { "cell_type": "code", "execution_count": 23, - "source": [ - "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\r\n", - " symbol=\"fgm\")" - ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['如图', '古希腊', '数学家', '希波', '克拉底', '研究', '几何图形', '此图', '三个', '半圆', '三个', '半圆', '直径', '直角三角形', '[FORMULA]', '斜边', '[FORMULA]', '直角', '[FORMULA]', '[FORMULA]', '[FORMULA]', '三边', '围成', '区域', '记', '[FORMULA]', '黑色', '记', '[FORMULA]', '其余部分', '记', '[FORMULA]', '图形', '中', '随机', '取', '一点', '此点', '取自', '[FORMULA]', '概率', '记', '[FORMULA]', '[MARK]', '[FIGURE]']" ] }, + "execution_count": 23, "metadata": {}, - "execution_count": 23 + "output_type": "execute_result" } ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + "source": [ + "sif4sci(item[\"stem\"], figures=figures, tokenization=True,\n", + " symbol=\"fgm\")" + ] } ], "metadata": { + "interpreter": { + "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.13 64-bit ('data': conda)" + "display_name": "Python 3.6.13 64-bit ('data': conda)", + "name": "python3" }, "language_info": { - "name": "python", - "version": "3.6.13", - "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, - "pygments_lexer": "ipython3", + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", "nbconvert_exporter": "python", - "file_extension": ".py" - }, - "interpreter": { - "hash": "776957673adb719a00031a24ed5efd2fa5ce8a13405e5193f8d278edd3805d55" + "pygments_lexer": "ipython3", + "version": "3.6.13" } }, "nbformat": 4, From 25ce4045d7470dde2bb83ea0db02f6a50c1210a4 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 19 Dec 2022 16:02:19 +0800 Subject: [PATCH 08/58] [fix] support gpu for Vector --- EduNLP/Vector/bert_vec.py | 6 ++++-- EduNLP/Vector/disenqnet/disenqnet.py | 6 +++--- EduNLP/Vector/elmo_vec.py | 5 
+++-- EduNLP/Vector/quesnet/quesnet.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index 33fc0061..eb58d48a 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -29,8 +29,10 @@ class BertModel(Vector): torch.Size([2, 768]) """ - def __init__(self, pretrained_model): - self.model = AutoModel.from_pretrained(pretrained_model) + def __init__(self, pretrained_dir, device="cpu"): + self.device = torch.device(device) + self.model = AutoModel.from_pretrained(pretrained_dir).to(self.device) + self.model.eval() def __call__(self, items: dict): # batch_size, sent_len, embedding_size diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index edff4412..28657cc9 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -12,9 +12,9 @@ def __init__(self, pretrained_dir, device="cpu"): device: str cpu or cuda, default is cpu """ - self.device = device - self.model = DisenQNet.from_pretrained(pretrained_dir) - self.model.to(self.device) + self.device = torch.device(device) + self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device) + self.model.eval() def __call__(self, items: dict, **kwargs): outputs = self.model(**items) diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index 84146534..7ab95e2a 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -15,14 +15,15 @@ class ElmoModel(Vector): - def __init__(self, pretrained_dir: str): + def __init__(self, pretrained_dir: str, device="cpu"): """ Parameters ---------- pretrained_model_path: str """ super(ElmoModel, self).__init__() - self.model = ElmoLM.from_pretrained(pretrained_dir) + self.device = torch.device(device) + self.model = ElmoLM.from_pretrained(pretrained_dir).to(self.device) self.model.eval() def __call__(self, *args, **kwargs): diff --git a/EduNLP/Vector/quesnet/quesnet.py b/EduNLP/Vector/quesnet/quesnet.py index cd845461..06477215 100644 --- a/EduNLP/Vector/quesnet/quesnet.py +++ b/EduNLP/Vector/quesnet/quesnet.py @@ -17,7 +17,7 @@ def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): image dir """ self.device = torch.device(device) - self.model = QuesNet.from_pretrained(pretrained_dir, img_dir=img_dir).to(device) + self.model = QuesNet.from_pretrained(pretrained_dir, img_dir=img_dir).to(self.device) self.model.eval() def __call__(self, items: dict, **kwargs): From 1e1cf2fd8af4c0c90be391cbff4f5f71020c2c65 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 19 Dec 2022 10:06:10 +0000 Subject: [PATCH 09/58] [feat] support gpu for I2V --- EduNLP/I2V/i2v.py | 28 +++++++++++++++------------- EduNLP/Vector/bert_vec.py | 3 +++ EduNLP/Vector/disenqnet/disenqnet.py | 3 +++ EduNLP/Vector/elmo_vec.py | 3 +++ EduNLP/Vector/gensim_vec.py | 4 ++-- EduNLP/Vector/t2v.py | 4 ++-- 6 files changed, 28 insertions(+), 17 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 720b0eff..c3fb453d 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -1,6 +1,7 @@ # coding: utf-8 # 2021/8/1 @ tongshiwei +import torch import json import os.path from typing import List, Tuple @@ -59,12 +60,12 @@ class I2V(object): """ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, - pretrained_t2v=False, model_dir=MODEL_DIR, **kwargs): + pretrained_t2v=False, model_dir=MODEL_DIR, device='cpu', **kwargs): if pretrained_t2v: logger.info("Use pretrained t2v model %s" % t2v) - self.t2v = 
get_t2v_pretrained_model(t2v, model_dir) + self.t2v = get_t2v_pretrained_model(t2v, model_dir, device) else: - self.t2v = T2V(t2v, *args, **kwargs) + self.t2v = T2V(t2v, device=device, *args, **kwargs) if tokenizer == 'bert': self.tokenizer = BertTokenizer.from_pretrained( **tokenizer_kwargs if tokenizer_kwargs is not None else {}) @@ -88,6 +89,7 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, "kwargs": kwargs, "pretrained_t2v": pretrained_t2v } + self.device = torch.device(device) def __call__(self, items, *args, **kwargs): """transfer item to vector""" @@ -327,13 +329,13 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = {"tokenizer_config_dir": model_path} - return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, + return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -390,13 +392,13 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = {"tokenizer_config_dir": model_path} - return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, + return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -452,7 +454,7 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return i_vec, t_vec @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', **kwargs): model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") @@ -461,7 +463,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs): tokenizer_kwargs = { "tokenizer_config_dir": model_path, } - return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, + return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs, **kwargs) @@ -499,14 +501,14 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], return self.t2v.infer_vector(encodes), self.t2v.infer_tokens(encodes) @classmethod - def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs): model_path = 
path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True) for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]: model_path = model_path.replace(i, "") logger.info("model_path: %s" % model_path) tokenizer_kwargs = { "tokenizer_config_dir": model_path} - return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, + return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, device=device, tokenizer_kwargs=tokenizer_kwargs) @@ -520,7 +522,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): } -def get_pretrained_i2v(name, model_dir=MODEL_DIR): +def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'): """ It is a good idea if you want to switch item to vector earily. @@ -560,4 +562,4 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR): ) _, t2v = get_pretrained_model_info(name) _class, *params = MODEL_MAP[t2v], name - return _class.from_pretrained(*params, model_dir=model_dir) + return _class.from_pretrained(*params, model_dir=model_dir, device=device) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index eb58d48a..86d893bd 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -35,6 +35,9 @@ def __init__(self, pretrained_dir, device="cpu"): self.model.eval() def __call__(self, items: dict): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(self.device) # batch_size, sent_len, embedding_size tokens = self.model(**items).last_hidden_state return tokens diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index 28657cc9..4c6f2602 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -17,6 +17,9 @@ def __init__(self, pretrained_dir, device="cpu"): self.model.eval() def __call__(self, items: dict, **kwargs): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(self.device) outputs = self.model(**items) return outputs.embeded, outputs.k_hidden, outputs.i_hidden diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index 7ab95e2a..fcc71db8 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -33,6 +33,9 @@ def infer_vector(self, items: Tuple[dict, List[dict]], *args, **kwargs) -> torch # TODO: handle batch and unbatch format for inputs and outputs # is_batch = isinstance(items, list) # items = items if is_batch else [items] + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(self.device) outputs = self.model(**items) item_embeds = torch.cat( (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1], diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index ec616e3d..35aa9996 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -23,7 +23,7 @@ class W2V(Vector): other(Word2Vec) binary: bool """ - def __init__(self, filepath, method=None, binary=None): + def __init__(self, filepath, method=None, binary=None, **kwargs): fp = PurePath(filepath) self.binary = binary if binary is not None else (True if fp.suffix == ".bin" else False) if self.binary is True: @@ -152,7 +152,7 @@ class D2V(Vector): --------- d2v model:D2V """ - def __init__(self, filepath, method="d2v"): + def __init__(self, filepath, method="d2v", **kwargs): self._method = method self._filepath = filepath if self._method == "d2v": diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index 1635eb4b..5a386666 
100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -96,7 +96,7 @@ def get_all_pretrained_models(): return r['name'] -def get_pretrained_t2v(name, model_dir=MODEL_DIR, **kwargs): +def get_pretrained_t2v(name, model_dir=MODEL_DIR, device='cpu', **kwargs): """ It is a good idea if you want to switch token list to vector earily. @@ -138,4 +138,4 @@ def get_pretrained_t2v(name, model_dir=MODEL_DIR, **kwargs): if model_name in ["d2v", "w2v"]: postfix = ".bin" if model_name == "d2v" else ".kv" model_path = path_append(model_path, os.path.basename(model_path) + postfix, to_str=True) - return T2V(model_name, model_path, *args, **kwargs) + return T2V(model_name, model_path, device=device, *args, **kwargs) From 8da20093058fa179aa09f263fec1ea8be3b618f9 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 19 Dec 2022 10:24:57 +0000 Subject: [PATCH 10/58] [fix] use ubuntu-20.04 for py3.6 --- .github/workflows/python-test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 6f481853..c204a3cc 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -6,10 +6,14 @@ on: [push, pull_request] jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] + include: + - os: "ubuntu-latest" + - os: "ubuntu-20.04" + python-version: "3.6" steps: - uses: actions/checkout@v2 From c9be8fa20feb4dc64fa543dafc2cd23f479fa930 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 19 Dec 2022 10:29:16 +0000 Subject: [PATCH 11/58] [fix] remove useless import --- tests/test_pretrain/test_pretrained_quesnet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index f1ca3323..20745de7 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -1,7 +1,6 @@ from lib2to3.pgen2 import token import os -from bson import encode os.environ["CUDA_VISIBLE_DEVICES"] = "" import pytest From 1350ff6f221a0ff0183e3c2397def6c1067a5dcf Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Fri, 24 Mar 2023 08:22:23 +0000 Subject: [PATCH 12/58] update device for t2v --- EduNLP/I2V/i2v.py | 6 +++- EduNLP/Vector/bert_vec.py | 7 +++-- EduNLP/Vector/disenqnet/disenqnet.py | 9 +++--- EduNLP/Vector/elmo_vec.py | 29 +++++++------------ EduNLP/Vector/meta.py | 7 +++++ EduNLP/Vector/quesnet/quesnet.py | 3 +- tests/test_pretrain/test_hugginface_utils.py | 2 +- tests/test_pretrain/test_pretrained_bert.py | 2 +- .../test_pretrained_disenqnet.py | 2 +- tests/test_pretrain/test_pretrained_elmo.py | 4 +-- .../test_pretrain/test_pretrained_quesnet.py | 2 +- 11 files changed, 40 insertions(+), 33 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 720b0eff..1925ddb9 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -62,7 +62,7 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None, pretrained_t2v=False, model_dir=MODEL_DIR, **kwargs): if pretrained_t2v: logger.info("Use pretrained t2v model %s" % t2v) - self.t2v = get_t2v_pretrained_model(t2v, model_dir) + self.t2v = get_t2v_pretrained_model(t2v, model_dir, **kwargs) else: self.t2v = T2V(t2v, *args, **kwargs) if tokenizer == 'bert': @@ -386,6 +386,8 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], -------- vector:list """ + is_batch = isinstance(items, 
list) + items = items if is_batch else [items] inputs = self.tokenize(items, key=key, return_tensors=return_tensors) return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs) @@ -495,6 +497,8 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict], token embeddings question embedding """ + is_batch = isinstance(items, list) + items = items if is_batch else [items] encodes = self.tokenize(items, key=key, meta=meta, *args, **kwargs) return self.t2v.infer_vector(encodes), self.t2v.infer_tokens(encodes) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index 33fc0061..3b4165e4 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -29,11 +29,14 @@ class BertModel(Vector): torch.Size([2, 768]) """ - def __init__(self, pretrained_model): - self.model = AutoModel.from_pretrained(pretrained_model) + def __init__(self, pretrained_model, device="cpu"): + self.device = device + self.model = AutoModel.from_pretrained(pretrained_model).to(device) + self.model.eval() def __call__(self, items: dict): # batch_size, sent_len, embedding_size + self.cuda_tensor(items) tokens = self.model(**items).last_hidden_state return tokens diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index edff4412..adb51741 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -1,8 +1,8 @@ import torch from EduNLP.ModelZoo.disenqnet.disenqnet import DisenQNet +from EduNLP.Vector.meta import Vector - -class DisenQModel(object): +class DisenQModel(Vector): def __init__(self, pretrained_dir, device="cpu"): """ Parameters @@ -13,10 +13,11 @@ def __init__(self, pretrained_dir, device="cpu"): cpu or cuda, default is cpu """ self.device = device - self.model = DisenQNet.from_pretrained(pretrained_dir) - self.model.to(self.device) + self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device) + self.model.eval() def __call__(self, items: dict, **kwargs): + self.cuda_tensor(items) outputs = self.model(**items) return outputs.embeded, outputs.k_hidden, outputs.i_hidden diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index 84146534..99adee1d 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -15,24 +15,24 @@ class ElmoModel(Vector): - def __init__(self, pretrained_dir: str): + def __init__(self, pretrained_dir: str, device="cpu"): """ Parameters ---------- pretrained_model_path: str """ super(ElmoModel, self).__init__() - self.model = ElmoLM.from_pretrained(pretrained_dir) + self.device = device + self.model = ElmoLM.from_pretrained(pretrained_dir).to(device) self.model.eval() - def __call__(self, *args, **kwargs): - return self.infer_vector(*args, **kwargs) - - def infer_vector(self, items: Tuple[dict, List[dict]], *args, **kwargs) -> torch.Tensor: - # TODO: handle batch and unbatch format for inputs and outputs - # is_batch = isinstance(items, list) - # items = items if is_batch else [items] + def __call__(self, items: dict, *args, **kwargs): + self.cuda_tensor(items) outputs = self.model(**items) + return outputs + + def infer_vector(self, items: dict, *args, **kwargs) -> torch.Tensor: + outputs = self(items) item_embeds = torch.cat( (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1], outputs.backward_output[torch.arange(len(items["seq_len"])), 0]), @@ -40,19 +40,10 @@ def infer_vector(self, items: Tuple[dict, List[dict]], *args, **kwargs) -> torch return item_embeds 
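        # infer_vector (above) builds one embedding per item by concatenating
        # the last forward hidden state (position seq_len - 1) with the first
        # backward hidden state (position 0), so each item vector is twice the
        # width of a single direction, matching the vector_size property.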
def infer_tokens(self, items, *args, **kwargs) -> torch.Tensor: - # is_batch = isinstance(items, list) - outputs = self.model(**items) + outputs = self(items) forward_hiddens = outputs.forward_output backward_hiddens = outputs.backward_output return torch.cat((forward_hiddens, backward_hiddens), dim=-1) - # if is_batch: - # ret = [] - # for fh, bh, lg in zip(forward_hiddens, backward_hiddens, items.seq_len): - # _bh = torch.cat((torch.flip(bh[:lg], [0]), bh[lg:]), dim=0) - # ret.append(torch.cat((fh, _bh), dim=-1)) - # return torch.stack(tuple(ret)) - # else: - # return torch.cat((forward_hiddens[0], torch.flip(backward_hiddens, [1])[0]), dim=-1) @property def vector_size(self): diff --git a/EduNLP/Vector/meta.py b/EduNLP/Vector/meta.py index b87e90fe..31d6fc58 100644 --- a/EduNLP/Vector/meta.py +++ b/EduNLP/Vector/meta.py @@ -1,5 +1,7 @@ # coding: utf-8 # 2021/7/13 @ tongshiwei +import torch + class Vector(object): def infer_vector(self, items, *args, **kwargs) -> ...: @@ -18,3 +20,8 @@ def is_frozen(self): # pragma: no cover def freeze(self, *args, **kwargs): # pragma: no cover pass + + def cuda_tensor(self, items: dict): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(self.device) \ No newline at end of file diff --git a/EduNLP/Vector/quesnet/quesnet.py b/EduNLP/Vector/quesnet/quesnet.py index cd845461..f27cdd6f 100644 --- a/EduNLP/Vector/quesnet/quesnet.py +++ b/EduNLP/Vector/quesnet/quesnet.py @@ -2,9 +2,10 @@ from typing import Union from EduNLP.ModelZoo.quesnet import QuesNet from EduNLP.Pretrain import Question, QuesNetTokenizer +from EduNLP.Vector.meta import Vector -class QuesNetModel(object): +class QuesNetModel(Vector): def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): """ Parameters diff --git a/tests/test_pretrain/test_hugginface_utils.py b/tests/test_pretrain/test_hugginface_utils.py index cb0de6a5..df685bbc 100644 --- a/tests/test_pretrain/test_hugginface_utils.py +++ b/tests/test_pretrain/test_hugginface_utils.py @@ -1,7 +1,7 @@ from EduNLP.Pretrain.hugginface_utils import TokenizerForHuggingface from transformers import AutoTokenizer import os - +os.environ["WANDB_DISABLED"] = "true" # TODO class TestPretrainUtils: diff --git a/tests/test_pretrain/test_pretrained_bert.py b/tests/test_pretrain/test_pretrained_bert.py index 80929a91..9296dccc 100644 --- a/tests/test_pretrain/test_pretrained_bert.py +++ b/tests/test_pretrain/test_pretrained_bert.py @@ -1,6 +1,6 @@ import os os.environ["CUDA_VISIBLE_DEVICES"] = "" - +os.environ["WANDB_DISABLED"] = "true" import torch from EduNLP.ModelZoo.bert import BertForPropertyPrediction, BertForKnowledgePrediction from transformers import BertModel as HFBertModel diff --git a/tests/test_pretrain/test_pretrained_disenqnet.py b/tests/test_pretrain/test_pretrained_disenqnet.py index 41998568..c76be12b 100644 --- a/tests/test_pretrain/test_pretrained_disenqnet.py +++ b/tests/test_pretrain/test_pretrained_disenqnet.py @@ -1,6 +1,6 @@ import os os.environ["CUDA_VISIBLE_DEVICES"] = "" - +os.environ["WANDB_DISABLED"] = "true" import pytest import torch from EduNLP.ModelZoo.disenqnet import DisenQNet diff --git a/tests/test_pretrain/test_pretrained_elmo.py b/tests/test_pretrain/test_pretrained_elmo.py index 16ffb55e..7abe04ee 100644 --- a/tests/test_pretrain/test_pretrained_elmo.py +++ b/tests/test_pretrain/test_pretrained_elmo.py @@ -1,6 +1,6 @@ import os os.environ["CUDA_VISIBLE_DEVICES"] = "" - +os.environ["WANDB_DISABLED"] = "true" import pytest import torch from EduNLP.ModelZoo.rnn 
import ElmoLM @@ -201,7 +201,7 @@ def test_elmo_t2v(self, pretrained_model_dir): t2v = T2V('elmo', pretrained_model_dir) encodes = tokenizer(items, key=lambda x: x['stem']) output = t2v(encodes) - assert output.shape[-1] == t2v.vector_size + assert output.forward_output.shape[-1] == t2v.vector_size // 2 assert t2v.infer_vector(encodes).shape[1] == t2v.vector_size assert t2v.infer_tokens(encodes).shape[2] == t2v.vector_size diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index f1ca3323..bd088703 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -3,7 +3,7 @@ from bson import encode os.environ["CUDA_VISIBLE_DEVICES"] = "" - +os.environ["WANDB_DISABLED"] = "true" import pytest import torch from EduNLP.ModelZoo.quesnet import QuesNet From 16cf3acc6f6c51c408f2bcff88d0cc89fe376aba Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Mon, 27 Mar 2023 12:12:00 +0000 Subject: [PATCH 13/58] fix gensim with empty_vector --- EduNLP/Vector/bert_vec.py | 2 +- EduNLP/Vector/gensim_vec.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index 89ad018b..c95226b7 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -28,7 +28,7 @@ class BertModel(Vector): torch.Size([2, 768]) """ - def __init__(self, pretrained_model, device="cpu"): + def __init__(self, pretrained_dir, device="cpu"): self.device = device self.model = AutoModel.from_pretrained(pretrained_dir).to(self.device) self.model.eval() diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index 35aa9996..9f25b8e0 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -67,7 +67,8 @@ def __getitem__(self, item): def infer_vector(self, items, agg="mean", *args, **kwargs) -> list: token_vectors = self.infer_tokens(items, *args, **kwargs) - return [eval("np.%s" % agg)(item, axis=0) for item in token_vectors] + # return [eval("np.%s" % agg)(item, axis=0) if item else np.array([]) for item in token_vectors] + return [eval("np.%s" % agg)(item, axis=0) if item else np.zeros(self.vector_size,) for item in token_vectors] def infer_tokens(self, items, *args, **kwargs) -> list: return [list(self(*item)) for item in items] From 4d93fbfcd08eb347a68d7ef0e75ba6e9a4176366 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 27 Mar 2023 21:23:57 +0800 Subject: [PATCH 14/58] add downstream tasks for DisenQ --- EduNLP/ModelZoo/__init__.py | 2 + EduNLP/ModelZoo/disenqnet/__init__.py | 2 +- EduNLP/ModelZoo/disenqnet/disenqnet.py | 145 +++++++++++++++++++++ EduNLP/ModelZoo/rnn/__init__.py | 1 + EduNLP/ModelZoo/rnn/rnn.py | 13 +- EduNLP/ModelZoo/utils/__init__.py | 1 + EduNLP/ModelZoo/utils/downstream_output.py | 12 ++ 7 files changed, 164 insertions(+), 12 deletions(-) create mode 100644 EduNLP/ModelZoo/utils/downstream_output.py diff --git a/EduNLP/ModelZoo/__init__.py b/EduNLP/ModelZoo/__init__.py index c4607475..a1fb4b43 100644 --- a/EduNLP/ModelZoo/__init__.py +++ b/EduNLP/ModelZoo/__init__.py @@ -1,3 +1,5 @@ from .utils import * from .bert import * from .rnn import * +from .disenqnet import * +from .quesnet import * diff --git a/EduNLP/ModelZoo/disenqnet/__init__.py b/EduNLP/ModelZoo/disenqnet/__init__.py index 975cfaff..4b65684f 100644 --- a/EduNLP/ModelZoo/disenqnet/__init__.py +++ b/EduNLP/ModelZoo/disenqnet/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .disenqnet import 
DisenQNet, DisenQNetForPreTraining +from .disenqnet import DisenQNet, DisenQNetForPreTraining, DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction diff --git a/EduNLP/ModelZoo/disenqnet/disenqnet.py b/EduNLP/ModelZoo/disenqnet/disenqnet.py index b8dc6fca..436f14b0 100644 --- a/EduNLP/ModelZoo/disenqnet/disenqnet.py +++ b/EduNLP/ModelZoo/disenqnet/disenqnet.py @@ -8,10 +8,13 @@ import os import json from gensim.models import KeyedVectors +from typing import Optional, List from .modules import TextEncoder, AttnModel, ConceptEstimator, MIEstimator, DisenEstimator from .utils import get_mask from ..utils import set_device +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput +from ..rnn import HAM from ..base_model import BaseModel from transformers.modeling_outputs import ModelOutput from transformers import PretrainedConfig @@ -218,3 +221,145 @@ def from_config(cls, config_path, **kwargs): warmup=model_config['warmup'], n_adversarial=model_config['n_adversarial'], ) + + +class DisenQNetForPropertyPrediction(BaseModel): + base_model_prefix = 'disenq' + + def __init__(self, vocab_size: int, hidden_size: int, dropout_rate: float, wv=None, + head_dropout=0.5, **kwargs): + super(DisenQNetForPropertyPrediction, self).__init__() + self.disenq = DisenQNet( + vocab_size=vocab_size, + hidden_size=hidden_size, + dropout_rate=dropout_rate, + wv=wv, + **kwargs) + self.head_dropout = head_dropout + self.dropout = nn.Dropout(head_dropout) + self.classifier = nn.Linear(hidden_size, 1) + self.sigmoid = nn.Sigmoid() + self.criterion = nn.MSELoss() + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs", 'wv']} + self.config.update(kwargs) + self.config['architecture'] = 'DisenQNetForPropertyPrediction' + self.config = PretrainedConfig.from_dict(self.config) + + def forward(self, seq_idx=None, seq_len=None, labels=None, vector_type="i") -> ModelOutput: + outputs = self.disenq(seq_idx, seq_len) + if vector_type == "k": + item_embeds = outputs.k_hidden + elif vector_type == "i": + item_embeds = outputs.i_hidden + else: + raise KeyError("vector_type must be one of ('k', 'i') ") + item_embeds = self.dropout(item_embeds) + + logits = self.sigmoid(self.classifier(item_embeds)) + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + return PropertyPredictionOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + vocab_size=model_config['vocab_size'], + hidden_size=model_config['hidden_size'], + dropout_rate=model_config['dropout_rate'], + head_dropout=model_config.get('head_dropout', 0.5), + ) + + +class DisenQNetForKnowledgePrediction(BaseModel): + base_model_prefix = 'disenq' + + def __init__(self, vocab_size: int, hidden_size: int, dropout_rate: float, + num_classes_list: List[int], + num_total_classes: int, + wv=None, + head_dropout: Optional[float] = 0.5, + flat_cls_weight: Optional[float] = 0.5, + attention_unit_size: Optional[int] = 256, + fc_hidden_size: Optional[int] = 512, + beta: Optional[float] = 0.5, + **kwargs): + super(DisenQNetForKnowledgePrediction, self).__init__() + self.disenq = DisenQNet( + vocab_size=vocab_size, + hidden_size=hidden_size, + dropout_rate=dropout_rate, + wv=wv, + **kwargs) + self.head_dropout = head_dropout + self.dropout = nn.Dropout(head_dropout) + self.sigmoid = nn.Sigmoid() + self.criterion = nn.MSELoss() 
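+        # The knowledge-prediction head pairs a flat linear classifier over
+        # all num_total_classes labels with a hierarchical attention head
+        # (HAM) over num_classes_list; forward() blends their sigmoid
+        # outputs using flat_cls_weight.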
+ self.flat_classifier = nn.Linear(in_features=hidden_size, out_features=num_total_classes) + self.ham_classifier = HAM( + num_classes_list=num_classes_list, + num_total_classes=num_total_classes, + sequence_model_hidden_size=hidden_size, + attention_unit_size=attention_unit_size, + fc_hidden_size=fc_hidden_size, + beta=beta, + dropout_rate=dropout_rate + ) + self.flat_cls_weight = flat_cls_weight + self.num_classes_list = num_classes_list + self.num_total_classes = num_total_classes + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs", 'wv']} + self.config.update(kwargs) + self.config['architecture'] = 'DisenQNetForKnowledgePrediction' + self.config = PretrainedConfig.from_dict(self.config) + + def forward(self, seq_idx=None, seq_len=None, labels=None, vector_type="i") -> ModelOutput: + outputs = self.disenq(seq_idx, seq_len) + if vector_type == "k": + item_embeds = outputs.k_hidden + elif vector_type == "i": + item_embeds = outputs.i_hidden + else: + raise KeyError("vector_type must be one of ('k', 'i') ") + tokens_embeds = outputs.embeded + item_embeds = self.dropout(item_embeds) + tokens_embeds = self.dropout(tokens_embeds) + flat_logits = self.sigmoid(self.flat_classifier(item_embeds)) + ham_outputs = self.ham_classifier(tokens_embeds) + ham_logits = self.sigmoid(ham_outputs.scores) + logits = self.flat_cls_weight * flat_logits + (1 - self.flat_cls_weight) * ham_logits + loss = None + if labels is not None: + labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1) + labels = labels.float() + loss = self.criterion(logits, labels) + return KnowledgePredictionOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + vocab_size=model_config['vocab_size'], + hidden_size=model_config['hidden_size'], + dropout_rate=model_config['dropout_rate'], + num_total_classes=model_config.get('num_total_classes'), + num_classes_list=model_config.get('num_classes_list'), + head_dropout=model_config.get('head_dropout', 0.5), + flat_cls_weight=model_config.get('flat_cls_weight', 0.5), + attention_unit_size=model_config.get('attention_unit_size', 256), + fc_hidden_size=model_config.get('fc_hidden_size', 512), + beta=model_config.get('beta', 0.5) + ) diff --git a/EduNLP/ModelZoo/rnn/__init__.py b/EduNLP/ModelZoo/rnn/__init__.py index 2ac479e2..e03987fb 100644 --- a/EduNLP/ModelZoo/rnn/__init__.py +++ b/EduNLP/ModelZoo/rnn/__init__.py @@ -2,3 +2,4 @@ # 2021/7/12 @ tongshiwei from .rnn import * +from .harnn import * diff --git a/EduNLP/ModelZoo/rnn/rnn.py b/EduNLP/ModelZoo/rnn/rnn.py index 79f3ef27..eca46efd 100644 --- a/EduNLP/ModelZoo/rnn/rnn.py +++ b/EduNLP/ModelZoo/rnn/rnn.py @@ -11,6 +11,7 @@ from typing import Optional from ..base_model import BaseModel from ..utils import torch_utils as mytorch +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput from .harnn import HAM __all__ = ["LM", "ElmoLM", "ElmoLMForPreTraining", "ElmoLMForPropertyPrediction", "ElmoLMForKnowledgePrediction"] @@ -323,11 +324,6 @@ def from_config(cls, config_path, **kwargs): ) -class PropertyPredictionOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None - - class ElmoLMForPropertyPrediction(BaseModel): base_model_prefix = 'elmo' @@ -385,11 +381,6 @@ def from_config(cls, config_path, **kwargs): ) -class 
KnowledgePredictionOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None - - class ElmoLMForKnowledgePrediction(BaseModel): base_model_prefix = 'elmo' @@ -435,7 +426,7 @@ def __init__(self, vocab_size: int, self.config = {k: v for k, v in locals().items() if k != "self" and k != "__class__" and k != "kwargs"} self.config.update(kwargs) - self.config['architecture'] = 'ElmoLMForPreTraining' + self.config['architecture'] = 'ElmoLMForKnowledgePrediction' self.config = PretrainedConfig.from_dict(self.config) def forward(self, seq_idx=None, seq_len=None, labels=None) -> ModelOutput: diff --git a/EduNLP/ModelZoo/utils/__init__.py b/EduNLP/ModelZoo/utils/__init__.py index 3751fcb8..418a20aa 100644 --- a/EduNLP/ModelZoo/utils/__init__.py +++ b/EduNLP/ModelZoo/utils/__init__.py @@ -7,3 +7,4 @@ from .data import load_items from .modules import MLP, TextCNN from .torch_utils import * +from .downstream_output import * diff --git a/EduNLP/ModelZoo/utils/downstream_output.py b/EduNLP/ModelZoo/utils/downstream_output.py new file mode 100644 index 00000000..fdb42386 --- /dev/null +++ b/EduNLP/ModelZoo/utils/downstream_output.py @@ -0,0 +1,12 @@ +import torch +from transformers.modeling_outputs import ModelOutput + + +class PropertyPredictionOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None + + +class KnowledgePredictionOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None \ No newline at end of file From 6f9aa8cc759701ef377a564a1ba3b1242fc0d171 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 27 Mar 2023 22:33:01 +0800 Subject: [PATCH 15/58] modify downstream task for BERT --- EduNLP/ModelZoo/bert/bert.py | 55 +++++++++++++++------- EduNLP/ModelZoo/disenqnet/__init__.py | 3 +- EduNLP/ModelZoo/disenqnet/disenqnet.py | 2 +- EduNLP/ModelZoo/utils/downstream_output.py | 2 +- EduNLP/Pretrain/quesnet_vec.py | 1 + 5 files changed, 43 insertions(+), 20 deletions(-) diff --git a/EduNLP/ModelZoo/bert/bert.py b/EduNLP/ModelZoo/bert/bert.py index 2e86a72d..1cd0dd69 100644 --- a/EduNLP/ModelZoo/bert/bert.py +++ b/EduNLP/ModelZoo/bert/bert.py @@ -7,9 +7,10 @@ import os from ..base_model import BaseModel from transformers.modeling_outputs import ModelOutput -from transformers import BertModel, PretrainedConfig +from transformers import BertModel, PretrainedConfig, BertConfig from typing import List, Optional from ..rnn.harnn import HAM +import logging __all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"] @@ -20,9 +21,15 @@ class BertForPPOutput(ModelOutput): class BertForPropertyPrediction(BaseModel): - def __init__(self, pretrained_model_dir=None, head_dropout=0.5): + def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True): super(BertForPropertyPrediction, self).__init__() - self.bert = BertModel.from_pretrained(pretrained_model_dir) + bert_config = BertConfig.from_pretrained(pretrained_model_dir) + if init: + print(f'Load BertModel from checkpoint: {pretrained_model_dir}') + self.bert = BertModel.from_pretrained(pretrained_model_dir) + else: + print(f'Load BertModel from config: {pretrained_model_dir}') + self.bert = BertModel(bert_config) self.hidden_size = self.bert.config.hidden_size self.head_dropout = head_dropout self.dropout = nn.Dropout(head_dropout) @@ -30,7 +37,7 @@ def __init__(self, pretrained_model_dir=None, head_dropout=0.5): self.sigmoid = nn.Sigmoid() self.criterion = nn.MSELoss() - self.config = {k: v for k, v in locals().items() if k not in 
["self", "__class__"]} + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]} self.config['architecture'] = 'BertForPropertyPrediction' self.config = PretrainedConfig.from_dict(self.config) @@ -54,37 +61,47 @@ def forward(self, @classmethod def from_config(cls, config_path, **kwargs): + config_path = os.path.join(os.path.dirname(config_path), 'model_config.json') with open(config_path, "r", encoding="utf-8") as rf: model_config = json.load(rf) + model_config['pretrained_model_dir'] = os.path.dirname(config_path) model_config.update(kwargs) return cls( pretrained_model_dir=model_config['pretrained_model_dir'], - head_dropout=model_config.get("head_dropout", 0.5) + head_dropout=model_config.get("head_dropout", 0.5), + init=model_config.get('init', False) ) - # @classmethod - # def from_pretrained(cls): - # NotImplementedError - # # 需要验证是否和huggingface的模型兼容 + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "model_config.json") + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2) + self.bert.config.save_pretrained(config_dir) class BertForKnowledgePrediction(BaseModel): def __init__(self, + pretrained_model_dir=None, num_classes_list: List[int] = None, num_total_classes: int = None, - pretrained_model_dir=None, head_dropout=0.5, flat_cls_weight=0.5, attention_unit_size=256, fc_hidden_size=512, beta=0.5, + init=True ): super(BertForKnowledgePrediction, self).__init__() - self.bert = BertModel.from_pretrained(pretrained_model_dir) + bert_config = BertConfig.from_pretrained(pretrained_model_dir) + if init: + print(f'Load BertModel from checkpoint: {pretrained_model_dir}') + self.bert = BertModel.from_pretrained(pretrained_model_dir) + else: + print(f'Load BertModel from config: {pretrained_model_dir}') + self.bert = BertModel(bert_config) self.hidden_size = self.bert.config.hidden_size self.head_dropout = head_dropout self.dropout = nn.Dropout(head_dropout) - self.classifier = nn.Linear(self.hidden_size, 1) self.sigmoid = nn.Sigmoid() self.criterion = nn.MSELoss() self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes) @@ -101,7 +118,7 @@ def __init__(self, self.num_classes_list = num_classes_list self.num_total_classes = num_total_classes - self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]} self.config['architecture'] = 'BertForKnowledgePrediction' self.config = PretrainedConfig.from_dict(self.config) @@ -131,8 +148,10 @@ def forward(self, @classmethod def from_config(cls, config_path, **kwargs): + config_path = os.path.join(os.path.dirname(config_path), 'model_config.json') with open(config_path, "r", encoding="utf-8") as rf: model_config = json.load(rf) + model_config['pretrained_model_dir'] = os.path.dirname(config_path) model_config.update(kwargs) return cls( pretrained_model_dir=model_config['pretrained_model_dir'], @@ -143,9 +162,11 @@ def from_config(cls, config_path, **kwargs): attention_unit_size=model_config.get('attention_unit_size', 256), fc_hidden_size=model_config.get('fc_hidden_size', 512), beta=model_config.get('beta', 0.5), + init=model_config.get('init', False) ) - # @classmethod - # def from_pretrained(cls): - # NotImplementedError - # # 需要验证是否和huggingface的模型兼容 + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "model_config.json") + with open(config_path, "w", 
encoding="utf-8") as wf: + json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2) + self.bert.config.save_pretrained(config_dir) diff --git a/EduNLP/ModelZoo/disenqnet/__init__.py b/EduNLP/ModelZoo/disenqnet/__init__.py index 4b65684f..263c410b 100644 --- a/EduNLP/ModelZoo/disenqnet/__init__.py +++ b/EduNLP/ModelZoo/disenqnet/__init__.py @@ -1,3 +1,4 @@ # -*- coding: utf-8 -*- -from .disenqnet import DisenQNet, DisenQNetForPreTraining, DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction +from .disenqnet import DisenQNet, DisenQNetForPreTraining, \ + DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction diff --git a/EduNLP/ModelZoo/disenqnet/disenqnet.py b/EduNLP/ModelZoo/disenqnet/disenqnet.py index 436f14b0..49f4f662 100644 --- a/EduNLP/ModelZoo/disenqnet/disenqnet.py +++ b/EduNLP/ModelZoo/disenqnet/disenqnet.py @@ -227,7 +227,7 @@ class DisenQNetForPropertyPrediction(BaseModel): base_model_prefix = 'disenq' def __init__(self, vocab_size: int, hidden_size: int, dropout_rate: float, wv=None, - head_dropout=0.5, **kwargs): + head_dropout=0.5, **kwargs): super(DisenQNetForPropertyPrediction, self).__init__() self.disenq = DisenQNet( vocab_size=vocab_size, diff --git a/EduNLP/ModelZoo/utils/downstream_output.py b/EduNLP/ModelZoo/utils/downstream_output.py index fdb42386..15793ab5 100644 --- a/EduNLP/ModelZoo/utils/downstream_output.py +++ b/EduNLP/ModelZoo/utils/downstream_output.py @@ -9,4 +9,4 @@ class PropertyPredictionOutput(ModelOutput): class KnowledgePredictionOutput(ModelOutput): loss: torch.FloatTensor = None - logits: torch.FloatTensor = None \ No newline at end of file + logits: torch.FloatTensor = None diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index da3899b6..3e10b5e2 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -631,6 +631,7 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para ... "know_name": "['代数', '集合', '集合的相等']" ... 
}] >>> tokenizer.set_vocab(items, key=lambda x: x['ques_content'], trim_min_count=1, silent=True) + >>> tokenizer.set_meta_vocab(items, silent=True) >>> pretrain_quesnet('./data/standard_luna_data.json', './testQuesNet', tokenizer) # doctest: +SKIP """ default_train_params = { From 572d5e92a9f7d86941049340a9487513048b0d20 Mon Sep 17 00:00:00 2001 From: Longhu Qin Date: Tue, 28 Mar 2023 10:08:51 +0800 Subject: [PATCH 16/58] finish test --- EduNLP/Vector/disenqnet/disenqnet.py | 1 + EduNLP/Vector/elmo_vec.py | 2 +- EduNLP/Vector/meta.py | 2 +- tests/test_pretrain/test_hugginface_utils.py | 2 +- tests/test_pretrain/test_pretrained_elmo.py | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index adb51741..c3871716 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -2,6 +2,7 @@ from EduNLP.ModelZoo.disenqnet.disenqnet import DisenQNet from EduNLP.Vector.meta import Vector + class DisenQModel(Vector): def __init__(self, pretrained_dir, device="cpu"): """ diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index b6d22da7..e76f9d9c 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -19,7 +19,7 @@ def __call__(self, items: dict, *args, **kwargs): self.cuda_tensor(items) outputs = self.model(**items) return outputs - + def infer_vector(self, items: dict, *args, **kwargs) -> torch.Tensor: outputs = self(items) item_embeds = torch.cat( diff --git a/EduNLP/Vector/meta.py b/EduNLP/Vector/meta.py index 31d6fc58..83b961d5 100644 --- a/EduNLP/Vector/meta.py +++ b/EduNLP/Vector/meta.py @@ -24,4 +24,4 @@ def freeze(self, *args, **kwargs): # pragma: no cover def cuda_tensor(self, items: dict): for k, v in items.items(): if isinstance(v, torch.Tensor): - items[k] = v.to(self.device) \ No newline at end of file + items[k] = v.to(self.device) diff --git a/tests/test_pretrain/test_hugginface_utils.py b/tests/test_pretrain/test_hugginface_utils.py index df685bbc..bcfea1f2 100644 --- a/tests/test_pretrain/test_hugginface_utils.py +++ b/tests/test_pretrain/test_hugginface_utils.py @@ -3,7 +3,7 @@ import os os.environ["WANDB_DISABLED"] = "true" -# TODO + class TestPretrainUtils: def test_hf_tokenzier(self, pretrained_tokenizer_dir): tokenizer = TokenizerForHuggingface(tokenize_method=None) diff --git a/tests/test_pretrain/test_pretrained_elmo.py b/tests/test_pretrain/test_pretrained_elmo.py index 7abe04ee..3cbb0ee1 100644 --- a/tests/test_pretrain/test_pretrained_elmo.py +++ b/tests/test_pretrain/test_pretrained_elmo.py @@ -196,7 +196,7 @@ def test_elmo_t2v(self, pretrained_model_dir): t2v = ElmoModel(pretrained_model_dir) encodes = tokenizer(items, key=lambda x: x['stem']) output = t2v(encodes) - assert output.shape[1] == t2v.vector_size + assert output.forward_output.shape[-1] == t2v.vector_size // 2 t2v = T2V('elmo', pretrained_model_dir) encodes = tokenizer(items, key=lambda x: x['stem']) From 99ed226a2ce057aa39337e8aa2da2ff458d71867 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Tue, 28 Mar 2023 10:45:15 +0800 Subject: [PATCH 17/58] add finetuning for disenq --- EduNLP/Pretrain/__init__.py | 2 +- EduNLP/Pretrain/disenqnet_vec.py | 159 ++++++++++++++++++ .../test_pretrained_disenqnet.py | 81 +++++++++ 3 files changed, 241 insertions(+), 1 deletion(-) diff --git a/EduNLP/Pretrain/__init__.py b/EduNLP/Pretrain/__init__.py index 5c736b66..6d687270 100644 --- a/EduNLP/Pretrain/__init__.py +++ b/EduNLP/Pretrain/__init__.py @@ 
-5,6 +5,6 @@ from .elmo_vec import * from .bert_vec import * from .quesnet_vec import QuesNetTokenizer, pretrain_quesnet, Question -from .disenqnet_vec import DisenQTokenizer, train_disenqnet +from .disenqnet_vec import * from .pretrian_utils import * from .hugginface_utils import * diff --git a/EduNLP/Pretrain/disenqnet_vec.py b/EduNLP/Pretrain/disenqnet_vec.py index d2376689..a583ace8 100644 --- a/EduNLP/Pretrain/disenqnet_vec.py +++ b/EduNLP/Pretrain/disenqnet_vec.py @@ -13,10 +13,15 @@ from dataclasses import dataclass, field from ..SIF import EDU_SPYMBOLS from ..ModelZoo.disenqnet.disenqnet import DisenQNetForPreTraining +from ..ModelZoo.disenqnet.disenqnet import DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction from ..ModelZoo.utils import load_items, pad_sequence from .pretrian_utils import PretrainedEduTokenizer +__all__ = ["DisenQTokenizer", "DisenQDataset", "train_disenqnet", "finetune_disenqnet_for_property_prediction", + "finetune_disenqnet_for_knowledge_prediction"] + + def check_num(s): # (1/2) -> 1/2 if s.startswith('(') and s.endswith(')'): @@ -226,6 +231,8 @@ def __getitem__(self, index): return_tensors=False, return_text=False) if self.mode in ["train", "val"]: ret['concept'] = self._list_to_onehot(item[self.data_formation["knowledge"]], self.concept_to_idx) + if self.mode in ['finetune']: + ret['labels'] = item[self.data_formation["labels"]] return ret def collate_fn(self, batch_data): @@ -443,3 +450,155 @@ def train_disenqnet(train_items: List[dict], output_dir: str, pretrained_dir: st trainer.model.save_config(output_dir) tokenizer.save_pretrained(output_dir) return output_dir + + +def finetune_disenqnet_for_property_prediction(train_items, + output_dir, + pretrained_model, + eval_items=None, + tokenizer_params=None, + data_params=None, + train_params=None, + model_params=None + ): + """ + Parameters + ---------- + train_items: list, required + The training corpus, each item could be str or dict + output_dir: str, required + The directory to save trained model files + pretrained_model: str, optional + The pretrained model name or path for model and tokenizer + eval_items: list, required + The evaluating items, each item could be str or dict + tokenizer_params: dict, optional, default=None + The parameters passed to ElmoTokenizer + data_params: dict, optional, default=None + The parameters passed to ElmoDataset and ElmoTokenizer + model_params: dict, optional, default=None + The parameters passed to Trainer + train_params: dict, optional, default=None + """ + tokenizer_params = tokenizer_params if tokenizer_params else {} + data_params = data_params if data_params is not None else {} + model_params = model_params if model_params is not None else {} + train_params = train_params if train_params is not None else {} + default_data_formation = { + "ques_content": "ques_content", + "knowledge": "knowledge", + "labels": "difficulty" + } + data_formation = data_params.get("data_formation", None) + if data_formation is not None: + default_data_formation.update(data_formation) + data_formation = default_data_formation + # tokenizer configuration + tokenizer = DisenQTokenizer.from_pretrained(pretrained_model, **tokenizer_params) + # dataset configuration + train_dataset = DisenQDataset(train_items, tokenizer, data_formation, + mode="finetune") + if eval_items: + eval_dataset = DisenQDataset(eval_items, tokenizer, data_formation, + mode="finetune") + else: + eval_dataset = None + # model configuration + model = 
DisenQNetForPropertyPrediction.from_pretrained(pretrained_model, **model_params) + # training configuration + work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS) + work_train_params["output_dir"] = output_dir + if train_params is not None: + work_train_params.update(train_params if train_params else {}) + if model_params: + if 'hidden_size' in model_params: + work_train_params['hidden_size'] = model_params['hidden_size'] + work_args = DisenQTrainingArguments(**work_train_params) + trainer = Trainer( + model=model, + args=work_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=train_dataset.collate_fn, + ) + trainer.train() + # trainer.model.save_pretrained(output_dir) + trainer.save_model(output_dir) + trainer.model.save_config(output_dir) + tokenizer.save_pretrained(output_dir) + + +def finetune_disenqnet_for_knowledge_prediction(train_items, + output_dir, + pretrained_model="bert-base-chinese", + eval_items=None, + tokenizer_params=None, + data_params=None, + train_params=None, + model_params=None + ): + """ + Parameters + ---------- + train_items: list, required + The training corpus, each item could be str or dict + output_dir: str, required + The directory to save trained model files + pretrained_model: str, optional + The pretrained model name or path for model and tokenizer + eval_items: list, required + The evaluating items, each item could be str or dict + tokenizer_params: dict, optional, default=None + The parameters passed to ElmoTokenizer + data_params: dict, optional, default=None + The parameters passed to ElmoDataset and ElmoTokenizer + model_params: dict, optional, default=None + The parameters passed to Trainer + train_params: dict, optional, default=None + """ + tokenizer_params = tokenizer_params if tokenizer_params else {} + data_params = data_params if data_params is not None else {} + model_params = model_params if model_params is not None else {} + train_params = train_params if train_params is not None else {} + default_data_formation = { + "ques_content": "ques_content", + "knowledge": "knowledge", + "labels": "know_list" + } + data_formation = data_params.get("data_formation", None) + if data_formation is not None: + default_data_formation.update(data_formation) + data_formation = default_data_formation + # tokenizer configuration + tokenizer = DisenQTokenizer.from_pretrained(pretrained_model, **tokenizer_params) + # dataset configuration + train_dataset = DisenQDataset(train_items, tokenizer, data_formation, + mode="finetune") + if eval_items: + eval_dataset = DisenQDataset(eval_items, tokenizer, data_formation, + mode="finetune") + else: + eval_dataset = None + # model configuration + model = DisenQNetForKnowledgePrediction.from_pretrained(pretrained_model, **model_params) + # training configuration + work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS) + work_train_params["output_dir"] = output_dir + if train_params is not None: + work_train_params.update(train_params if train_params else {}) + if model_params: + if 'hidden_size' in model_params: + work_train_params['hidden_size'] = model_params['hidden_size'] + work_args = DisenQTrainingArguments(**work_train_params) + trainer = Trainer( + model=model, + args=work_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=train_dataset.collate_fn, + ) + trainer.train() + # trainer.model.save_pretrained(output_dir) + trainer.save_model(output_dir) + trainer.model.save_config(output_dir) + tokenizer.save_pretrained(output_dir) diff --git 
a/tests/test_pretrain/test_pretrained_disenqnet.py b/tests/test_pretrain/test_pretrained_disenqnet.py index 41998568..9dd60787 100644 --- a/tests/test_pretrain/test_pretrained_disenqnet.py +++ b/tests/test_pretrain/test_pretrained_disenqnet.py @@ -4,6 +4,9 @@ import pytest import torch from EduNLP.ModelZoo.disenqnet import DisenQNet +from EduNLP.ModelZoo.disenqnet import DisenQNetForKnowledgePrediction, DisenQNetForPropertyPrediction +from EduNLP.Pretrain import finetune_disenqnet_for_knowledge_prediction +from EduNLP.Pretrain import finetune_disenqnet_for_property_prediction from EduNLP.Pretrain import DisenQTokenizer, train_disenqnet from EduNLP.Vector import T2V, DisenQModel from EduNLP.I2V import DisenQ, get_pretrained_i2v @@ -102,6 +105,84 @@ def test_train_disenq(self, standard_luna_data, pretrained_model_dir): encodes = tokenizer(test_items, lambda x: x['ques_content']) model(**encodes) + def test_train_pp(self, standard_luna_data, pretrained_pp_dir, pretrained_model_dir): + data_params = { + "stem_key": "ques_content", + "label_key": "difficulty" + } + train_params = { + "num_train_epochs": 1, + "per_device_train_batch_size": 2, + "per_device_eval_batch_size": 2, + "no_cuda": not TEST_GPU, + } + train_items = standard_luna_data + # train without eval_items + finetune_disenqnet_for_property_prediction( + train_items, + pretrained_pp_dir, + pretrained_model=pretrained_model_dir, + train_params=train_params, + data_params=data_params + ) + # train with eval_items + finetune_disenqnet_for_property_prediction( + train_items, + pretrained_pp_dir, + pretrained_model=pretrained_model_dir, + eval_items=train_items, + train_params=train_params, + data_params=data_params + ) + model = DisenQNetForPropertyPrediction.from_pretrained(pretrained_pp_dir) + tokenizer = DisenQTokenizer.from_pretrained(pretrained_pp_dir) + + encodes = tokenizer(train_items[:8], lambda x: x['ques_content']) + # TODO: need to handle inference for T2V for batch or single + model(**encodes) + + def test_train_kp(self, standard_luna_data, pretrained_model_dir, pretrained_kp_dir): + data_params = { + "stem_key": "ques_content", + "label_key": "know_list" + } + train_params = { + "num_train_epochs": 1, + "per_device_train_batch_size": 2, + "per_device_eval_batch_size": 2, + "no_cuda": not TEST_GPU, + } + model_params = { + "num_classes_list": [10, 27, 963], + "num_total_classes": 1000, + } + train_items = standard_luna_data + # train without eval_items + finetune_disenqnet_for_knowledge_prediction( + train_items, + pretrained_kp_dir, + pretrained_model=pretrained_model_dir, + train_params=train_params, + data_params=data_params, + model_params=model_params + ) + # train with eval_items + finetune_disenqnet_for_knowledge_prediction( + train_items, + pretrained_kp_dir, + pretrained_model=pretrained_model_dir, + eval_items=train_items, + train_params=train_params, + data_params=data_params, + model_params=model_params + ) + model = DisenQNetForKnowledgePrediction.from_pretrained(pretrained_kp_dir) + tokenizer = DisenQTokenizer.from_pretrained(pretrained_kp_dir) + + encodes = tokenizer(train_items[:8], lambda x: x['ques_content']) + # TODO: need to handle inference for T2V for batch or single + model(**encodes) + def test_disenq_t2v(self, pretrained_model_dir): items = [ {'stem': '如图$\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \ From 701d01d2ca9dce3a13a51a0a9d39cd51a38c8b7e Mon Sep 17 00:00:00 2001 From: Longhu Qin Date: Tue, 28 Mar 2023 16:10:58 +0800 Subject: [PATCH 18/58] update .github --- 
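Note: a minimal usage sketch for the DisenQNet finetuning entry points added in PATCH 17/58. The checkpoint and output directories and the two toy items below are illustrative placeholders rather than fixed names, and pretrained_model is assumed to point at a directory produced by train_disenqnet:

from EduNLP.ModelZoo.disenqnet import DisenQNetForPropertyPrediction
from EduNLP.Pretrain import DisenQTokenizer, finetune_disenqnet_for_property_prediction

# Each item carries the question text ("ques_content" by default), knowledge
# tags, and a scalar difficulty label ("difficulty" by default); these keys
# can be remapped through data_params["data_formation"].
train_items = [
    {"ques_content": "已知集合 $A=\{x \mid x > 1\}$，求 $A$ 的补集", "knowledge": ["集合"], "difficulty": 0.6},
    {"ques_content": "已知函数 $f(x)=x^{2}$，求 $f(2)$ 的值", "knowledge": ["函数"], "difficulty": 0.2},
]

finetune_disenqnet_for_property_prediction(
    train_items,
    "./output/disenq_pp",                  # placeholder output directory
    pretrained_model="./output/disenq",    # placeholder train_disenqnet checkpoint
    eval_items=train_items,
    train_params={"num_train_epochs": 1, "per_device_train_batch_size": 2},
)

# Reloading for inference, mirroring the tests above:
model = DisenQNetForPropertyPrediction.from_pretrained("./output/disenq_pp")
tokenizer = DisenQTokenizer.from_pretrained("./output/disenq_pp")
encodes = tokenizer(train_items, lambda x: x["ques_content"])
print(model(**encodes).logits)

The knowledge-prediction variant, finetune_disenqnet_for_knowledge_prediction, follows the same call shape but additionally takes model_params with num_classes_list and num_total_classes, as exercised in the tests above.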
.github/workflows/python-test.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 6f481853..00dc8da2 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -6,10 +6,14 @@ on: [push, pull_request] jobs: build: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] + include: + - os: "ubuntu-latest" + - os: "ubuntu-20.04" + python-version: "3.6" steps: - uses: actions/checkout@v2 @@ -24,4 +28,4 @@ jobs: - name: Test with pytest run: | pytest - codecov + codecov \ No newline at end of file From c9afda1e27efde9339e1102053589f1c6d5f9571 Mon Sep 17 00:00:00 2001 From: Longhu Qin Date: Tue, 28 Mar 2023 19:05:32 +0800 Subject: [PATCH 19/58] update test --- EduNLP/Vector/bert_vec.py | 4 ---- tests/test_pretrain/test_pretrained_elmo.py | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index c95226b7..41ed98cd 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -34,10 +34,6 @@ def __init__(self, pretrained_dir, device="cpu"): self.model.eval() def __call__(self, items: dict): - for k, v in items.items(): - if isinstance(v, torch.Tensor): - items[k] = v.to(self.device) - # batch_size, sent_len, embedding_size self.cuda_tensor(items) tokens = self.model(**items).last_hidden_state return tokens diff --git a/tests/test_pretrain/test_pretrained_elmo.py b/tests/test_pretrain/test_pretrained_elmo.py index 3cbb0ee1..8dbec3f3 100644 --- a/tests/test_pretrain/test_pretrained_elmo.py +++ b/tests/test_pretrain/test_pretrained_elmo.py @@ -201,7 +201,7 @@ def test_elmo_t2v(self, pretrained_model_dir): t2v = T2V('elmo', pretrained_model_dir) encodes = tokenizer(items, key=lambda x: x['stem']) output = t2v(encodes) - assert output.forward_output.shape[-1] == t2v.vector_size // 2 + assert output.shape[-1] == t2v.vector_size assert t2v.infer_vector(encodes).shape[1] == t2v.vector_size assert t2v.infer_tokens(encodes).shape[2] == t2v.vector_size From d6e60cd154290f65f3f989974cbf569ea64a69f0 Mon Sep 17 00:00:00 2001 From: Longhu Qin Date: Wed, 29 Mar 2023 11:56:10 +0800 Subject: [PATCH 20/58] [BUG] Fix i2v.infer_vector error when params are provided --- EduNLP/Vector/bert_vec.py | 4 ++-- EduNLP/Vector/disenqnet/disenqnet.py | 2 +- EduNLP/Vector/elmo_vec.py | 6 +++--- EduNLP/Vector/gensim_vec.py | 6 +++--- EduNLP/Vector/quesnet/quesnet.py | 6 +++--- tests/test_pretrain/test_pretrained_bert.py | 2 ++ 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/EduNLP/Vector/bert_vec.py b/EduNLP/Vector/bert_vec.py index 41ed98cd..ce597199 100644 --- a/EduNLP/Vector/bert_vec.py +++ b/EduNLP/Vector/bert_vec.py @@ -38,7 +38,7 @@ def __call__(self, items: dict): self.cuda_tensor(items) tokens = self.model(**items).last_hidden_state return tokens - def infer_vector(self, items: dict, pooling_strategy='CLS') -> torch.Tensor: + def infer_vector(self, items: dict, pooling_strategy='CLS', **kwargs) -> torch.Tensor: vector = self(items) if pooling_strategy == 'CLS': return vector[:, 0, :] @@ -50,7 +50,7 @@ def infer_vector(self, items: dict, pooling_strategy='CLS') -> torch.Tensor: # batch_size, embedding_dim return mul_mask.sum(1) / (mask.sum(1) + 1e-10) - def infer_tokens(self, items: dict, return_special_tokens=False) -> torch.Tensor: + def infer_tokens(self, items: dict, return_special_tokens=False, **kwargs) -> torch.Tensor: tokens = self(items) if 
return_special_tokens: # include embedding of [CLS] and [SEP] diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index c3871716..27f6d571 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -17,7 +17,7 @@ def __init__(self, pretrained_dir, device="cpu"): self.model = DisenQNet.from_pretrained(pretrained_dir).to(self.device) self.model.eval() - def __call__(self, items: dict, **kwargs): + def __call__(self, items: dict): self.cuda_tensor(items) outputs = self.model(**items) return outputs.embeded, outputs.k_hidden, outputs.i_hidden diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index e76f9d9c..36ea8254 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -15,12 +15,12 @@ def __init__(self, pretrained_dir: str, device="cpu"): self.model = ElmoLM.from_pretrained(pretrained_dir).to(device) self.model.eval() - def __call__(self, items: dict, *args, **kwargs): + def __call__(self, items: dict): self.cuda_tensor(items) outputs = self.model(**items) return outputs - def infer_vector(self, items: dict, *args, **kwargs) -> torch.Tensor: + def infer_vector(self, items: dict, **kwargs) -> torch.Tensor: outputs = self(items) item_embeds = torch.cat( (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1], @@ -28,7 +28,7 @@ def infer_vector(self, items: dict, *args, **kwargs) -> torch.Tensor: dim=-1) return item_embeds - def infer_tokens(self, items, *args, **kwargs) -> torch.Tensor: + def infer_tokens(self, items, **kwargs) -> torch.Tensor: outputs = self(items) forward_hiddens = outputs.forward_output backward_hiddens = outputs.backward_output diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index 9f25b8e0..7d3f05de 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -65,12 +65,12 @@ def __getitem__(self, item): index = self.key_to_index(item) return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,)) - def infer_vector(self, items, agg="mean", *args, **kwargs) -> list: - token_vectors = self.infer_tokens(items, *args, **kwargs) + def infer_vector(self, items, agg="mean", **kwargs) -> list: + token_vectors = self.infer_tokens(items, **kwargs) # return [eval("np.%s" % agg)(item, axis=0) if item else np.array([]) for item in token_vectors] return [eval("np.%s" % agg)(item, axis=0) if item else np.zeros(self.vector_size,) for item in token_vectors] - def infer_tokens(self, items, *args, **kwargs) -> list: + def infer_tokens(self, items, **kwargs) -> list: return [list(self(*item)) for item in items] diff --git a/EduNLP/Vector/quesnet/quesnet.py b/EduNLP/Vector/quesnet/quesnet.py index 1253afe8..36371f1b 100644 --- a/EduNLP/Vector/quesnet/quesnet.py +++ b/EduNLP/Vector/quesnet/quesnet.py @@ -21,7 +21,7 @@ def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): self.model = QuesNet.from_pretrained(pretrained_dir, img_dir=img_dir).to(self.device) self.model.eval() - def __call__(self, items: dict, **kwargs): + def __call__(self, items: dict): """ get question embedding with quesnet Parameters @@ -34,7 +34,7 @@ def __call__(self, items: dict, **kwargs): outputs = self.model(self.model.make_batch(qs, device=self.device)) return outputs.hidden, outputs.embeded - def infer_vector(self, items: Union[dict, list]) -> torch.Tensor: + def infer_vector(self, items: Union[dict, list], **kwargs) -> torch.Tensor: """ get question embedding with quesnet 
Parameters @@ -44,7 +44,7 @@ def infer_vector(self, items: Union[dict, list]) -> torch.Tensor: """ return self(items)[0] - def infer_tokens(self, items: Union[dict, list]) -> torch.Tensor: + def infer_tokens(self, items: Union[dict, list], **kwargs) -> torch.Tensor: """ get token embeddings with quesnet Parameters diff --git a/tests/test_pretrain/test_pretrained_bert.py b/tests/test_pretrain/test_pretrained_bert.py index 9296dccc..d8db3cff 100644 --- a/tests/test_pretrain/test_pretrained_bert.py +++ b/tests/test_pretrain/test_pretrained_bert.py @@ -206,6 +206,8 @@ def test_i2v(self, pretrained_model_dir): i_vec = i2v.infer_item_vector(items, key=lambda x: x['stem']) assert len(i_vec[0]) == i2v.vector_size + i_vec = i2v.infer_item_vector(items, key=lambda x: x['stem'], pooling_strategy='average') + assert len(i_vec[0]) == i2v.vector_size t_vec = i2v.infer_token_vector(items, key=lambda x: x['stem']) assert len(t_vec[0][0]) == i2v.vector_size From 4808d885ed2b204f0c61af3ae94659db0eb84993 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Wed, 29 Mar 2023 04:16:10 +0000 Subject: [PATCH 21/58] Fix load_error when training and loading model on different device --- EduNLP/ModelZoo/base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/ModelZoo/base_model.py b/EduNLP/ModelZoo/base_model.py index a7ee418e..bcf3caef 100644 --- a/EduNLP/ModelZoo/base_model.py +++ b/EduNLP/ModelZoo/base_model.py @@ -31,7 +31,7 @@ def from_pretrained(cls, pretrained_model_path, *args, **kwargs): config_path = os.path.join(pretrained_model_path, "config.json") model_path = os.path.join(pretrained_model_path, "pytorch_model.bin") model = cls.from_config(config_path, *args, **kwargs) - loaded_state_dict = torch.load(model_path) + loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu')) loaded_keys = loaded_state_dict.keys() expected_keys = model.state_dict().keys() From 5e0d5e937a908d684e63a05614e585d1c0f6929a Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Thu, 30 Mar 2023 15:16:32 +0800 Subject: [PATCH 22/58] [fix] unify output format for downstream models --- EduNLP/ModelZoo/bert/bert.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/EduNLP/ModelZoo/bert/bert.py b/EduNLP/ModelZoo/bert/bert.py index 1cd0dd69..956c44bf 100644 --- a/EduNLP/ModelZoo/bert/bert.py +++ b/EduNLP/ModelZoo/bert/bert.py @@ -1,23 +1,15 @@ import torch from torch import nn -from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence -from baize.torch import load_net -import torch.nn.functional as F import json import os from ..base_model import BaseModel -from transformers.modeling_outputs import ModelOutput +from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput from transformers import BertModel, PretrainedConfig, BertConfig -from typing import List, Optional +from typing import List from ..rnn.harnn import HAM -import logging - -__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"] -class BertForPPOutput(ModelOutput): - loss: torch.FloatTensor = None - logits: torch.FloatTensor = None +__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"] class BertForPropertyPrediction(BaseModel): @@ -54,7 +46,7 @@ def forward(self, loss = None if labels is not None: loss = self.criterion(logits, labels) if labels is not None else None - return BertForPPOutput( + return PropertyPredictionOutput( loss=loss, logits=logits, ) @@ -141,7 +133,7 @@ def forward(self, labels = 
torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1) labels = labels.float() loss = self.criterion(logits, labels) if labels is not None else None - return BertForPPOutput( + return KnowledgePredictionOutput( loss=loss, logits=logits, ) From e36dac405bae7315685af173423b209acae2216c Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Mon, 26 Jun 2023 22:13:48 +0800 Subject: [PATCH 23/58] [doc] add similarity prediction demo --- .../downstream/similarity_prediction.ipynb | 298 ++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 examples/downstream/similarity_prediction.ipynb diff --git a/examples/downstream/similarity_prediction.ipynb b/examples/downstream/similarity_prediction.ipynb new file mode 100644 index 00000000..ef2e1f93 --- /dev/null +++ b/examples/downstream/similarity_prediction.ipynb @@ -0,0 +1,298 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 下游任务Demo:相似度预估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "import faiss\n", + "import time\n", + "from tqdm import tqdm\n", + "from scipy.stats import pearsonr, spearmanr\n", + "from sklearn.metrics.pairwise import paired_cosine_distances" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['CUDA_VISIBLE_DEVICES'] = '0'" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 获取题目表征" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 读取相似度预估下游任务题目数据,格式:每行一道题目文本\n", + "with open({'path/to/your/data/math.tsv'}, 'r') as f:\n", + " lines = f.readlines()\n", + "ques = []\n", + "for line in lines:\n", + " ques.append(line.strip('\\n'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 以DisenQNet为例\n", + "\n", + "from EduNLP.Pretrain import DisenQTokenizer\n", + "from EduNLP.Vector import DisenQModel\n", + "\n", + "path = \"/path/to/disenqnet/checkpoint\"\n", + "tokenizer = DisenQTokenizer.from_pretrained(path)\n", + "t2v = DisenQModel(path, device=\"cuda\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ques_emb = []\n", + "with np.errstate(all='raise'):\n", + " for i, text in enumerate(tqdm(ques)):\n", + " encodes = tokenizer([text], key=lambda x: x)\n", + " emb = t2v.infer_vector(encodes, key=lambda x: x[\"stem\"], vector_type=\"k\").detach().cpu().reshape(-1).numpy()\n", + " ques_emb.append(emb)\n", + "ques_emb = np.array(ques_emb)\n", + "ques_emb.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('./cache/disenq_300_embs.pkl', 'wb') as f:\n", + " pickle.dump(ques_emb, f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
相似度预估 Ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 读取数据\n", + "sim = pd.read_csv('/path/to/your/data/similarity.csv')\n", + "test_id1 = []\n", + "test_id2 = []\n", + "labels = []\n", + "for i, line in sim.iterrows():\n", + " id1, id2, _, _, _, sim = line\n", + " try:\n", + " idx1 = id1-1\n", + " idx2 = id2-1\n", + " score = sum([int(x) for x in sim.split('|')]) / 3\n", + " test_id1.append(idx1)\n", + " test_id2.append(idx2)\n", + " labels.append(score)\n", + " except:\n", + " print(id1, id2, score)\n", + "np.array(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_ranking_metrics(ques_emb):\n", + " ques_emb1 = ques_emb[test_id1]\n", + " ques_emb2 = ques_emb[test_id2]\n", + " cosine_scores = 1 - (paired_cosine_distances(ques_emb1, ques_emb2))\n", + " pearson_cosine, _ = pearsonr(labels, cosine_scores)\n", + " spearman_cosine, _ = spearmanr(labels, cosine_scores)\n", + " print(f'Pearson: {pearson_cosine:.4f}, Spearman: {spearman_cosine:.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 读取Step1中保存的题目表征\n", + "with open('./cache/disenq_300_embs.pkl', 'rb') as f:\n", + " embs = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compute_ranking_metrics(embs)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 相似度预估 Recall" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 读取Step1中保存的题目表征\n", + "with open('./cache/disenq_300_embs.pkl', 'rb') as f:\n", + " embs = pickle.load(f)\n", + "\n", + "norm_embs = embs / (np.linalg.norm(embs, ord=2, axis=-1, keepdims=True) + 1e-12)\n", + "norm_embs = norm_embs.astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dim = norm_embs.shape[-1]\n", + "param = 'IVF512,PQ15'\n", + "measure = faiss.METRIC_L2\n", + "index = faiss.index_factory(dim, param, measure)\n", + "index.train(norm_embs)\n", + "index.add(norm_embs)\n", + "faiss.write_index(index, './index/disenq.index')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 读取数据并按照recall任务进行处理\n", + "sim = pd.read_csv('/path/to/your/data/similarity.csv')\n", + "query = {}\n", + "for i, line in sim.iterrows():\n", + " id1, id2, _, _, _, sim = line\n", + " id1 = int(id1)\n", + " id2 = int(id2)\n", + " score = sum([int(x) for x in sim.split('|')]) / 3\n", + " if score >= 5:\n", + " if id1 in query:\n", + " query[id1].append((id2, score))\n", + " else:\n", + " query[id1] = [(id2, score)]\n", + " if id2 in query:\n", + " query[id2].append((id1, score))\n", + " else:\n", + " query[id2] = [(id1, score)]\n", + "for k in query:\n", + " query[k].sort(key=lambda x: x[1], reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_recall_metrics(query, result, p=100):\n", + " total_hr, total_ndcg = 0, 0\n", + " for k, v in query.items():\n", + " res = result[k][:p]\n", + " hit, dcg, idcg = 0, 0, 0\n", + " for i, (label, score) in enumerate(v):\n", + " idcg += (2 ** score - 1) / np.log2(i + 2)\n", + " if label in res:\n", + " hit += 1\n", + " dcg += (2 ** score - 1) / 
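A quick aside on the recall index built above: the embeddings are L2-normalized before being added to a METRIC_L2 index, which makes Euclidean ranking equivalent to cosine ranking, since for unit vectors ||a - b||^2 = 2 - 2*cos(a, b). A self-contained check of that identity:

    import numpy as np

    a = np.random.rand(8); a /= np.linalg.norm(a)
    b = np.random.rand(8); b /= np.linalg.norm(b)
    assert np.isclose(np.sum((a - b) ** 2), 2 - 2 * np.dot(a, b))

Note that 'IVF512,PQ15' is an approximate index (inverted lists plus product quantization), so HR/NDCG can differ slightly from exact search.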
np.log2(res.index(label) + 2)\n", + " total_hr += (hit / len(v))\n", + " total_ndcg += (dcg / idcg)\n", + " print(f'HR@{p}: {total_hr / len(query):.4f}, NDCG@{p}: {total_ndcg / len(query):.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "avg_time = 0\n", + "for _ in range(5):\n", + " result = {}\n", + " total_time = 0\n", + " for k in tqdm(query):\n", + " idx = k-1\n", + " start = time.time()\n", + " _, idxs = index.search(norm_embs[idx].reshape(1, -1), 101)\n", + " end = time.time()\n", + " total_time += (end - start) * 1000\n", + " res_ids = idxs.tolist()[0]\n", + " if idx in res_ids:\n", + " res_ids.remove(idx)\n", + " result[k] = []\n", + " for i in res_ids[:100]:\n", + " try:\n", + " result[k].append(i+1)\n", + " except:\n", + " pass\n", + " print('Average time: ', total_time / len(query))\n", + " avg_time += total_time / len(query)\n", + " compute_recall_metrics(query, result, 10)\n", + " compute_recall_metrics(query, result, 20)\n", + " compute_recall_metrics(query, result, 30)\n", + " compute_recall_metrics(query, result, 50)\n", + " compute_recall_metrics(query, result, 100)\n", + "print(avg_time / 5)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From d9e3187d945987f94e547bd9170db8aec8c6dc9e Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Thu, 29 Jun 2023 11:59:41 +0000 Subject: [PATCH 24/58] add demos for difficulty & discrimination prediction --- examples/diff_disc_pred/src/common.py | 7 + examples/diff_disc_pred/src/diff_pred.ipynb | 310 ++++++++++++++++++ examples/diff_disc_pred/src/disc_pred.ipynb | 304 +++++++++++++++++ examples/diff_disc_pred/src/utils/__init__.py | 3 + examples/diff_disc_pred/src/utils/dataset.py | 287 ++++++++++++++++ examples/diff_disc_pred/src/utils/file.py | 85 +++++ 6 files changed, 996 insertions(+) create mode 100644 examples/diff_disc_pred/src/common.py create mode 100644 examples/diff_disc_pred/src/diff_pred.ipynb create mode 100644 examples/diff_disc_pred/src/disc_pred.ipynb create mode 100644 examples/diff_disc_pred/src/utils/__init__.py create mode 100644 examples/diff_disc_pred/src/utils/dataset.py create mode 100644 examples/diff_disc_pred/src/utils/file.py diff --git a/examples/diff_disc_pred/src/common.py b/examples/diff_disc_pred/src/common.py new file mode 100644 index 00000000..60061ea1 --- /dev/null +++ b/examples/diff_disc_pred/src/common.py @@ -0,0 +1,7 @@ +import os + +ROOT = os.path.dirname(os.path.dirname(__file__)) +DATA_DIR = os.path.join(ROOT, "data") +WORK_DIR = os.path.join(ROOT, "src") + + diff --git a/examples/diff_disc_pred/src/diff_pred.ipynb b/examples/diff_disc_pred/src/diff_pred.ipynb new file mode 100644 index 00000000..fd9ef627 --- /dev/null +++ b/examples/diff_disc_pred/src/diff_pred.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 难度预估" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch import nn\n", + "import numpy as np\n", + "from transformers.modeling_outputs import ModelOutput\n", + "from transformers import BertModel, TrainingArguments, Trainer\n", + "import torch.nn.functional as F\n", + "from sklearn.metrics import ndcg_score, mean_squared_error\n", + "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", + "import os\n", + 
"from EduNLP.Pretrain import BertTokenizer \n", + "import json\n", + "from utils import Dataset_bert, load_json, get_val, get_train\n", + "from common import ROOT, DATA_DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/shangzi/anaconda3/envs/tgen/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Metric `SpearmanCorrcoef` will save all targets and predictions in the buffer. For large datasets, this may lead to large memory footprint.\n", + " warnings.warn(*args, **kwargs)\n" + ] + } + ], + "source": [ + "MAE = MeanAbsoluteError()\n", + "PCC = PearsonCorrCoef()\n", + "SCC = SpearmanCorrCoef()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据,定义预训练模型路径" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[load_json] start : /data/shangzi/edunlp/gaokao-prediction/data/train/高中数学.json\n", + "[load_json] num = 3600, open_path = /data/shangzi/edunlp/gaokao-prediction/data/train/高中数学.json\n", + "[load_json] start : /data/shangzi/edunlp/gaokao-prediction/data/test/高中数学paper.json\n", + "[load_json] num = 7, open_path = /data/shangzi/edunlp/gaokao-prediction/data/test/高中数学paper.json\n" + ] + } + ], + "source": [ + "output_dir = \"output/difficulty\" #模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "train_data = load_json(os.path.join(DATA_DIR, \"train\", \"高中数学.json\")) #训练集\n", + "train_items = get_train(train_data)\n", + "val_data = load_json(os.path.join(DATA_DIR, \"test\", \"高中数学paper.json\")) #测试集\n", + "val_items, val_gap = get_val(val_data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class DifficultyPredictionOutput(ModelOutput):\n", + " loss: torch.FloatTensor = None\n", + " logits: torch.FloatTensor = None\n", + " labels: torch.FloatTensor = None\n", + "\n", + "class BertPrediction(nn.Module):\n", + " \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", + " super(BertPrediction, self).__init__() \n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir)\n", + " hidden_size = self.bert.config.hidden_size\n", + " # print(hidden_size)\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " content=None,\n", + " labels=None,\n", + " ):\n", + " input_ids = content['input_ids'].squeeze(0)\n", + " attention_mask = content['attention_mask'].squeeze(0)\n", + " token_type_ids = content['token_type_ids'].squeeze(0)\n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + " logits = self.sigmoid(self.classifier(item_embed))\n", + " loss = F.mse_loss(logits.squeeze(0), labels)\n", + " return DifficultyPredictionOutput(\n", + " loss = loss,\n", + " logits = logits,\n", + " labels = labels\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "### 定义评价指标" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(pred):\n", + " logits = torch.as_tensor(pred.predictions[0]).squeeze(0)\n", + " logits = logits.view([logits.size()[0]],-1)\n", + " labels = torch.as_tensor(pred.label_ids)\n", + " print(\"logits\", logits)\n", + " print(\"labels\", labels)\n", + " pres = logits.numpy().tolist()\n", + " golds = labels.numpy().tolist()\n", + " ret = {\n", + " \"mae\": MAE(logits, labels),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels),\n", + " \"scc\": SCC(logits, labels),\n", + " 'ndcg': testdata_metrics(val_gap, golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "def testdata_metrics(val_gap, diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " idx = np.where(diff>0)[0]\n", + " ndcg = []\n", + " for s, e in val_gap:\n", + " _diff, _pred = diff[s:e], pred[s:e]\n", + " if _diff[0]==-1:\n", + " _diff = [i+1 for i in range(len(_diff))]\n", + " ndcg.append([ndcg_score([_diff], [_pred]), ndcg_score([_diff], [_pred], k=10), ndcg_score([_diff], [_pred], k=20), ndcg_score([_diff], [_pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义训练和测试相关参数" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "class MyTrainer(Trainer):\n", + " pass\n", + "\n", + "def train_diff_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=None,\n", + " val_items=None,\n", + " test_items=None,\n", + " train_params=None):\n", + " model = BertPrediction(pretrained_model_dir=pretrained_model_dir)\n", + " tokenizer = BertTokenizer(add_special_tokens=True)\n", + " # training parameters\n", + " if train_params is not None:\n", + " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", + " batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64\n", + " save_steps = train_params['save_steps'] if 'save_steps' in train_params else 100\n", + " save_total_limit = train_params['save_total_limit'] if 'save_total_limit' in train_params else 2\n", + " logging_steps = train_params['logging_steps'] if 'logging_steps' in train_params else 5\n", + " gradient_accumulation_steps = train_params['gradient_accumulation_steps'] \\\n", + " if 'gradient_accumulation_steps' in train_params else 1\n", + " logging_dir = train_params['logging_dir'] if 'logging_dir' in train_params else f\"{ROOT}/log\"\n", + " else:\n", + " # default\n", + " epochs = 50\n", + " batch_size = 1\n", + " save_steps = 1000\n", + " save_total_limit = 2\n", + " logging_steps = 100\n", + " gradient_accumulation_steps = 1\n", + " logging_dir = f\"{ROOT}/log\"\n", + "\n", + "\n", + " train_dataset = Dataset_bert(train_items, tokenizer)\n", + " eval_dataset = Dataset_bert(val_items, tokenizer)\n", + "\n", + " training_args = TrainingArguments(\n", + " output_dir=output_dir,\n", + " overwrite_output_dir=True,\n", + "\n", + " num_train_epochs=epochs,\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size,\n", + " evaluation_strategy = \"steps\", # epoch\n", + " eval_steps=logging_steps*5,\n", + " \n", + " save_steps=save_steps,\n", + " save_total_limit=save_total_limit,\n", + " \n", + " 
logging_steps=logging_steps,\n", + " logging_dir=logging_dir,\n", + "\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " learning_rate=5e-5,\n", + " )\n", + "\n", + " trainer = MyTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=train_dataset.collate_fn,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics\n", + " )\n", + "\n", + " trainer.train()\n", + " trainer.save_model(output_dir)\n", + " tokenizer.save_pretrained(output_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练和测试" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_diff_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=train_items,\n", + " val_items=val_items,\n", + " train_params= None\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tgen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/diff_disc_pred/src/disc_pred.ipynb b/examples/diff_disc_pred/src/disc_pred.ipynb new file mode 100644 index 00000000..4c32456d --- /dev/null +++ b/examples/diff_disc_pred/src/disc_pred.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 区分度预估" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-06-28 03:30:52.333651: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2023-06-28 03:30:52.381526: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2023-06-28 03:30:54.202449: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + } + ], + "source": [ + "import torch\n", + "from torch import nn\n", + "import numpy as np\n", + "from transformers.modeling_outputs import ModelOutput\n", + "from transformers import BertModel, TrainingArguments, Trainer\n", + "import torch.nn.functional as F\n", + "from sklearn.metrics import ndcg_score, mean_squared_error\n", + "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", + "import os\n", + "from EduNLP.Pretrain import BertTokenizer \n", + "import json\n", + "from utils import Dataset_bert, pre_disc\n", + "from common import ROOT, DATA_DIR" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/shangzi/anaconda3/envs/tgen/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Metric `SpearmanCorrcoef` will save all targets and predictions in the buffer. For large datasets, this may lead to large memory footprint.\n", + " warnings.warn(*args, **kwargs)\n" + ] + } + ], + "source": [ + "MAE = MeanAbsoluteError()\n", + "PCC = PearsonCorrCoef()\n", + "SCC = SpearmanCorrCoef()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 加载数据,定义预训练模型路径" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "output_dir = os.path.join(ROOT, \"output/discrimination\") #模型保存路径\n", + "pretrained_model_dir = \"../data/bert_math_768\" #bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #训练集\n", + "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #测试集" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "class DiscriminationPredictionOutput(ModelOutput):\n", + " loss: torch.FloatTensor = None\n", + " logits: torch.FloatTensor = None\n", + " labels: torch.FloatTensor = None\n", + "\n", + "class BertPrediction(nn.Module):\n", + " \n", + " def __init__(self, pretrained_model_dir=None, config=None, classifier_dropout=0.5):\n", + "\n", + " super(BertPrediction, self).__init__()\n", + "\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir) \n", + " \n", + " hidden_size = self.bert.config.hidden_size\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " content=None,\n", + " labels=None,\n", + " ):\n", + " \n", + " input_ids = content['input_ids'].squeeze(0)\n", + " attention_mask = content['attention_mask'].squeeze(0)\n", + " token_type_ids = 
content['token_type_ids'].squeeze(0) \n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + " logits = self.sigmoid(self.classifier(item_embed))\n", + " loss = F.mse_loss(logits.squeeze(0), labels)\n", + " return DiscriminationPredictionOutput(\n", + " loss = loss,\n", + " logits = logits,\n", + " labels = labels\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义评价指标" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(pred):\n", + " logits = torch.as_tensor(pred.predictions[0]).squeeze(0)\n", + " logits = logits.view([logits.size()[0]],-1)\n", + " labels = torch.as_tensor(pred.label_ids)\n", + " pres = logits.numpy().tolist()\n", + " golds = labels.numpy().tolist()\n", + " ret = {\n", + " \"mae\": MAE(logits, labels),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels),\n", + " \"scc\": SCC(logits, labels),\n", + " 'ndcg': testdata_metrics(golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "\n", + "def testdata_metrics(diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " ndcg = []\n", + " ndcg.append([ndcg_score([diff], [pred]), ndcg_score([diff], [pred], k=10), ndcg_score([diff], [pred], k=20), ndcg_score([diff], [pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 定义训练和测试相关参数" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class MyTrainer(Trainer):\n", + " pass\n", + "\n", + "def train_disc_pred(\n", + " output_dir,\n", + " pretrained_model_dir,\n", + " train_items=None,\n", + " val_items=None,\n", + " train_params=None):\n", + " model = BertPrediction(pretrained_model_dir=pretrained_model_dir) \n", + " tokenizer = BertTokenizer(add_special_tokens=True)\n", + " \n", + " if train_params is not None:\n", + " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", + " batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64\n", + " save_steps = train_params['save_steps'] if 'save_steps' in train_params else 100\n", + " save_total_limit = train_params['save_total_limit'] if 'save_total_limit' in train_params else 2\n", + " logging_steps = train_params['logging_steps'] if 'logging_steps' in train_params else 5\n", + " gradient_accumulation_steps = train_params['gradient_accumulation_steps'] \\\n", + " if 'gradient_accumulation_steps' in train_params else 1\n", + " logging_dir = train_params['logging_dir'] if 'logging_dir' in train_params else f\"{ROOT}/log\"\n", + " else:\n", + " # default\n", + " epochs = 50\n", + " batch_size = 1\n", + " save_steps = 1000\n", + " save_total_limit = 2\n", + " logging_steps = 100\n", + " gradient_accumulation_steps = 1\n", + " logging_dir = f\"{ROOT}/log\"\n", + "\n", + " train_dataset = Dataset_bert(train_items, tokenizer)\n", + " eval_dataset = Dataset_bert(val_items, tokenizer)\n", + "\n", + " training_args = TrainingArguments(\n", + " output_dir=output_dir,\n", + " overwrite_output_dir=True,\n", + "\n", + " num_train_epochs=epochs,\n", + " per_device_train_batch_size=batch_size,\n", + " per_device_eval_batch_size=batch_size,\n", + " evaluation_strategy = \"steps\", # epoch\n", + " 
eval_steps=logging_steps*5,\n", + " \n", + " save_steps=save_steps,\n", + " save_total_limit=save_total_limit,\n", + " \n", + " logging_steps=logging_steps,\n", + " logging_dir=logging_dir,\n", + "\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " learning_rate=5e-5,\n", + " )\n", + "\n", + " trainer = MyTrainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=train_dataset.collate_fn,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + " )\n", + "\n", + " trainer.train()\n", + " trainer.save_model(output_dir)\n", + " tokenizer.save_pretrained(output_dir)\n", + " #trainer.evaluate()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练和测试" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_disc_pred(\n", + " output_dir,\n", + " pretrained_model_dir=pretrained_model_dir,\n", + " train_items=train_items,\n", + " val_items=val_items,\n", + " train_params= None\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tgen", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/diff_disc_pred/src/utils/__init__.py b/examples/diff_disc_pred/src/utils/__init__.py new file mode 100644 index 00000000..637426bc --- /dev/null +++ b/examples/diff_disc_pred/src/utils/__init__.py @@ -0,0 +1,3 @@ +from .file import * +from .dataset import * +from .tokenizer import * \ No newline at end of file diff --git a/examples/diff_disc_pred/src/utils/dataset.py b/examples/diff_disc_pred/src/utils/dataset.py new file mode 100644 index 00000000..abf50a8e --- /dev/null +++ b/examples/diff_disc_pred/src/utils/dataset.py @@ -0,0 +1,287 @@ +from torch.utils.data import Dataset +from EduNLP.ModelZoo.utils import pad_sequence +import torch +import numpy as np +from EduNLP.I2V import W2V, Bert +from EduNLP.Pretrain import BertTokenizer +from tqdm import tqdm +from gensim.models import KeyedVectors +import os +import re +import jieba + +class BaseDataset(Dataset): + def __init__(self, items, tokenizer, mode="train", labal_key="difficulty"): + self.tokenizer = tokenizer + self.items = items + self.mode = mode + self.labal_key = labal_key + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item[self.labal_key] + return ret + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch = {key: torch.as_tensor(val) for key, val in batch.items()} + # print("[debug] batch final: ", batch) + return batch + +class Dataset_bert(Dataset): + def __init__(self, items, tokenizer): + self.tokenizer = tokenizer + self.items = items + + self.preprocess() + def preprocess(self): + for item in tqdm(self.items): + content_vector = 
self.tokenizer(item['content'], return_tensors="np") + item["content"] = content_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = self.tokenizer.bert_tokenizer.pad(batch["content"], return_tensors='pt') + batch["labels"] = torch.as_tensor(batch["labels"]) + return batch + +class Dataset_bert_jiuzhang(Dataset): + def __init__(self, items, tokenizer): + self.tokenizer = tokenizer + self.items = items + + self.preprocess() + def preprocess(self): + for item in tqdm(self.items): + content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) + item["content"] = content_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) #jiuzhang + batch["labels"] = torch.as_tensor(batch["labels"]) + return batch + +class DiffiultyDataset_w2v(Dataset): + def __init__(self, items, pretrained_path, mode="train"): + self.pretrained_path = pretrained_path + self.items = items + self.mode = mode + self.i2v = W2V("pure_text", "w2v", pretrained_path) + self.preprocess() + + def preprocess(self): + for item in tqdm(self.items): + item['content'] = torch.FloatTensor(np.array(self.i2v.infer_item_vector([item["content"]]))).squeeze(0) + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = torch.stack(batch["content"]) + batch["labels"] = torch.as_tensor(batch["labels"]) + batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} + return batch + +class PubWord2Vector(object): + def __init__(self, pretrained_path, language=""): + self.language = language + if language == "mix": + assert os.path.isdir(pretrained_path) + + self.eng_w2v_path = f"{pretrained_path}/glove.6B.300d.word2vec" + self.chs_w2v_path = f"{pretrained_path}/sgns.baidubaike.bigram-char.bz2" + + self.eng_w2v = KeyedVectors.load_word2vec_format(self.eng_w2v_path, binary=False) + self.chs_w2v = KeyedVectors.load_word2vec_format(self.chs_w2v_path, binary=False) + else: + assert os.path.isfile(pretrained_path) + try: + self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=False) + except Exception as e: + print(e) + self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=True) + def word_to_embedding(self, word): + if self.language: + if re.search(u'[\u4E00-\u9FA5]', word): + if word in self.chs_w2v: + return self.chs_w2v[word] + else: + count = 0 + temp_array = np.zeros([300]) + for i in word: + if i in self.chs_w2v: + temp_array += self.chs_w2v[i] + count += 1 + if count != 0: + temp_array /= count + return temp_array + else: + if word in self.eng_w2v: + return self.eng_w2v[word] + elif word.lower() in self.eng_w2v: + return self.eng_w2v[word.lower()] + else: + temp_array = np.zeros([300]) + return temp_array + else: + if word in self.w2v: + return self.w2v[word] + elif word.lower() 
in self.w2v: + return self.w2v[word.lower()] + else: + temp_array = np.zeros([300]) + return temp_array + + @property + def vector_size(self): + return 300 + +class DiffiultyDataset_w2v_pub(Dataset): + def __init__(self, items, pretrained_path, mode="train"): + self.pretrained_path = pretrained_path + self.items = items + self.mode = mode + self.i2v = PubWord2Vector(pretrained_path) + self.preprocess() + + def preprocess(self): + for item in tqdm(self.items): + words = jieba.lcut(item["content"]) + item_vector = [] + for word in words: + temp_emb = self.i2v.word_to_embedding(word) + item_vector.append(temp_emb) + item_vector = torch.FloatTensor(np.mean(item_vector, axis=0)) if item_vector else torch.FloatTensor(np.zeros([300])) + item['content'] = item_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + + batch["content"] = torch.stack(batch["content"]) + batch["labels"] = torch.as_tensor(batch["labels"]) + batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} + return batch + + +class DiscriminationDataset(Dataset): + def __init__(self, items, tokenizer, mode="train"): + self.tokenizer = tokenizer + self.items = items + self.mode = mode + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item["labels"] + return ret + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.bert_tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch["content"] = torch.tensor( [it.cpu().detach().numpy() for it in batch["content"]] ) + batch["labels"] = torch.tensor([it.cpu().detach().numpy() for it in batch["labels"]]) + return batch + + +class ReliabilityDataset(Dataset): + def __init__(self, items, tokenizer, mode="train"): + self.tokenizer = tokenizer + self.items = items + self.mode = mode + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item["reliability"] + return ret + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.bert_tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch = {key: torch.as_tensor(val) for key, val in batch.items()} + return batch \ No newline at end of file diff --git a/examples/diff_disc_pred/src/utils/file.py b/examples/diff_disc_pred/src/utils/file.py new file mode 100644 index 00000000..207ab1a0 --- /dev/null +++ b/examples/diff_disc_pred/src/utils/file.py @@ -0,0 +1,85 @@ +import json +import pickle +import os +import warnings +import pandas as pd + +def check2mkdir(file_path): + dir_path = os.path.dirname(file_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + +def save_json(save_data, output_path): + print("[save_json] start : 
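The word_to_embedding fallback above deserves a note: a Chinese word missing from the vocabulary is represented by the mean of whatever character vectors are available, and by zeros when nothing matches. In miniature, with toy 4-d vectors standing in for the real 300-d embeddings:

    import numpy as np

    vocab = {"数": np.ones(4), "学": 2 * np.ones(4)}   # toy character vectors
    word = "数学题"                                     # "题" is out of vocabulary
    hits = [vocab[ch] for ch in word if ch in vocab]
    emb = np.mean(hits, axis=0) if hits else np.zeros(4)
    # -> array([1.5, 1.5, 1.5, 1.5])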
{}".format(output_path)) + check2mkdir(output_path) + with open(output_path,'w+',encoding="utf-8") as f: + for row_dic in save_data: + try: + jsondata=json.dumps(row_dic, ensure_ascii=False) + f.write(jsondata + "\n") + except Exception as e: + print("[Exception] at {}:\n{}\n".format(row_dic, e)) + raise Exception("[save_json] 出现错误") + print("[save_json] num = {}, open_path = {}".format(len(save_data), output_path)) + + +def get_json(open_path, error_handler="raise"): + print("[get_json] start : {}".format(open_path)) + load_data = [] + i = 0 + with open(open_path, 'r', encoding="utf-8") as f: + try: + for line in f: + load_data.append(json.loads(line)) + i += 1 + except Exception as e: + if error_handler == "ignore": + warnings.warn("[Warning] at line {}:\n{}\n".format(i, e)) + else: + print("[Exception] at line {}:\n{}\n".format(i, e)) + raise Exception("[get_json] 出现错误") + print("[get_json] num = {}, open_path = {}".format(len(load_data), open_path)) + return load_data + +def load_json(open_path): + print("[load_json] start : {}".format(open_path)) + with open(open_path, "r", encoding='utf-8') as f: + load_q = json.load(f) + print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) + return load_q + +def pre_disc(csv_path): + items = pd.read_csv(csv_path) + stem = items["stem"].tolist() + disc = items["disc"].tolist() + data = [] + for i in range(len(stem)): + dic = {} + dic["content"] = stem[i] + dic["labels"] = disc[i] + data.append(dic) + return data + +def get_train(train): + train_data = [] + for item in train: + dic = {} + dic["content"] = item["content"] + dic["labels"] = float(item["difficulty"]) + train_data.append(dic) + return train_data + +def get_val(val): + test_data, test_gap = [], [] + start, end = 0, 0 + for batch in val: + end += len(batch['questions']) + for item in batch['questions']: + dic = {} + dic['content'] = item["stem"] + dic['labels'] = item['diff'] + #dic["labels"] = dic.pop("difficulty") + test_data.append(dic) + test_gap.append([start, end]) + start = end + return test_data, test_gap \ No newline at end of file From 19ca5db3ed797e354532c2827cb235aa3c5ca515 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Sun, 2 Jul 2023 18:21:57 +0800 Subject: [PATCH 25/58] [fix] fix dependencies --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ff792b24..a1950a7b 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ vec_deps = [ 'gensim', - 'transformers', + 'transformers<4.29.0', 'torchvision', 'datasets'] + ml_pytorch_deps From ff26a84fcd423874c4d42736319ba37a1d146f59 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Sun, 2 Jul 2023 13:09:50 +0000 Subject: [PATCH 26/58] remove print and fix tokenizer --- EduNLP/Pretrain/disenqnet_vec.py | 1 - EduNLP/Tokenizer/tokenizer.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EduNLP/Pretrain/disenqnet_vec.py b/EduNLP/Pretrain/disenqnet_vec.py index d2376689..6dc2fd3e 100644 --- a/EduNLP/Pretrain/disenqnet_vec.py +++ b/EduNLP/Pretrain/disenqnet_vec.py @@ -156,7 +156,6 @@ def preprocess_dataset(pretrained_dir, disen_tokenizer, items, data_formation, t if not os.path.exists(concept_list_path): concepts = set() for data in items: - print(data) concept = data[data_formation["knowledge"]] for c in concept: if c not in concepts: diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py index 56e3b28d..d154fff4 100644 --- a/EduNLP/Tokenizer/tokenizer.py +++ b/EduNLP/Tokenizer/tokenizer.py @@ 
-115,7 +115,7 @@ def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
 
 
 class PureTextTokenizer(Tokenizer):
-    def __init__(self, handle_figure_formula="skip", **kwargs):
+    def __init__(self, symbol="gmas", handle_figure_formula="skip", **kwargs):
         """
         Treat all elements in SIF item as pure text. Specifically, tokenize formulas as text.
 
@@ -184,13 +184,14 @@ def __init__(self, handle_figure_formula="skip", **kwargs):
             "text_params": text_params,
             "figure_params": kwargs.get("figure_params", None)
         }
+        self.symbol = symbol
 
     def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
         for item in items:
             yield self._tokenize(item, key=key, **kwargs)
 
     def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
-        return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens
+        return tokenize(seg(key(item), symbol=self.symbol), **self.tokenization_params, **kwargs).tokens
 
 
 class AstFormulaTokenizer(Tokenizer):

From 67c70efa46c9f0be07297d34250b3e99e35a6191 Mon Sep 17 00:00:00 2001
From: shangzi <2716440431@qq.com>
Date: Sun, 2 Jul 2023 13:13:04 +0000
Subject: [PATCH 27/58] add demos for difficulty prediction and discrimination
 prediction

---
 examples/diff_disc_pred/src/common.py         |   7 -
 examples/diff_disc_pred/src/utils/__init__.py |   3 -
 examples/diff_disc_pred/src/utils/file.py     |  85 ---
 .../difficulty_prediction.ipynb}              |   4 +-
 .../discrimination_prediction.ipynb}          |   4 +-
 .../utils/dataset.py => downstream/utils.py}  | 656 ++++++++++--------
 6 files changed, 376 insertions(+), 383 deletions(-)
 delete mode 100644 examples/diff_disc_pred/src/common.py
 delete mode 100644 examples/diff_disc_pred/src/utils/__init__.py
 delete mode 100644 examples/diff_disc_pred/src/utils/file.py
 rename examples/{diff_disc_pred/src/diff_pred.ipynb => downstream/difficulty_prediction.ipynb} (98%)
 rename examples/{diff_disc_pred/src/disc_pred.ipynb => downstream/discrimination_prediction.ipynb} (98%)
 rename examples/{diff_disc_pred/src/utils/dataset.py => downstream/utils.py} (77%)

diff --git a/examples/diff_disc_pred/src/common.py b/examples/diff_disc_pred/src/common.py
deleted file mode 100644
index 60061ea1..00000000
--- a/examples/diff_disc_pred/src/common.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import os
-
-ROOT = os.path.dirname(os.path.dirname(__file__))
-DATA_DIR = os.path.join(ROOT, "data")
-WORK_DIR = os.path.join(ROOT, "src")
-
-
diff --git a/examples/diff_disc_pred/src/utils/__init__.py b/examples/diff_disc_pred/src/utils/__init__.py
deleted file mode 100644
index 637426bc..00000000
--- a/examples/diff_disc_pred/src/utils/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .file import *
-from .dataset import *
-from .tokenizer import *
\ No newline at end of file
diff --git a/examples/diff_disc_pred/src/utils/file.py b/examples/diff_disc_pred/src/utils/file.py
deleted file mode 100644
index 207ab1a0..00000000
--- a/examples/diff_disc_pred/src/utils/file.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import json
-import pickle
-import os
-import warnings
-import pandas as pd
-
-def check2mkdir(file_path):
-    dir_path = os.path.dirname(file_path)
-    if not os.path.exists(dir_path):
-        os.makedirs(dir_path)
-
-def save_json(save_data, output_path):
-    print("[save_json] start : {}".format(output_path))
-    check2mkdir(output_path)
-    with open(output_path,'w+',encoding="utf-8") as f:
-        for row_dic in save_data:
-            try:
-                jsondata=json.dumps(row_dic, ensure_ascii=False)
-                f.write(jsondata + "\n")
-            except Exception as e:
-                print("[Exception] at {}:\n{}\n".format(row_dic, e))
-                raise 
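Back to the PureTextTokenizer change above: the new symbol argument is stored on the instance and forwarded to seg(), so callers can override the previously hard-coded "gmas" symbolization. A usage sketch; the item text is made up, and the import path assumes PureTextTokenizer is re-exported by EduNLP.Tokenizer as elsewhere in the package:

    from EduNLP.Tokenizer import PureTextTokenizer

    tokenizer = PureTextTokenizer(symbol="gmas")   # the default, same as before
    items = [r"已知 $x + y = 1$,求 $x$ 的取值范围$\SIFChoice$"]
    tokens = next(tokenizer(items))                # __call__ yields token lists per item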
Exception("[save_json] 出现错误") - print("[save_json] num = {}, open_path = {}".format(len(save_data), output_path)) - - -def get_json(open_path, error_handler="raise"): - print("[get_json] start : {}".format(open_path)) - load_data = [] - i = 0 - with open(open_path, 'r', encoding="utf-8") as f: - try: - for line in f: - load_data.append(json.loads(line)) - i += 1 - except Exception as e: - if error_handler == "ignore": - warnings.warn("[Warning] at line {}:\n{}\n".format(i, e)) - else: - print("[Exception] at line {}:\n{}\n".format(i, e)) - raise Exception("[get_json] 出现错误") - print("[get_json] num = {}, open_path = {}".format(len(load_data), open_path)) - return load_data - -def load_json(open_path): - print("[load_json] start : {}".format(open_path)) - with open(open_path, "r", encoding='utf-8') as f: - load_q = json.load(f) - print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) - return load_q - -def pre_disc(csv_path): - items = pd.read_csv(csv_path) - stem = items["stem"].tolist() - disc = items["disc"].tolist() - data = [] - for i in range(len(stem)): - dic = {} - dic["content"] = stem[i] - dic["labels"] = disc[i] - data.append(dic) - return data - -def get_train(train): - train_data = [] - for item in train: - dic = {} - dic["content"] = item["content"] - dic["labels"] = float(item["difficulty"]) - train_data.append(dic) - return train_data - -def get_val(val): - test_data, test_gap = [], [] - start, end = 0, 0 - for batch in val: - end += len(batch['questions']) - for item in batch['questions']: - dic = {} - dic['content'] = item["stem"] - dic['labels'] = item['diff'] - #dic["labels"] = dic.pop("difficulty") - test_data.append(dic) - test_gap.append([start, end]) - start = end - return test_data, test_gap \ No newline at end of file diff --git a/examples/diff_disc_pred/src/diff_pred.ipynb b/examples/downstream/difficulty_prediction.ipynb similarity index 98% rename from examples/diff_disc_pred/src/diff_pred.ipynb rename to examples/downstream/difficulty_prediction.ipynb index fd9ef627..98ad02f7 100644 --- a/examples/diff_disc_pred/src/diff_pred.ipynb +++ b/examples/downstream/difficulty_prediction.ipynb @@ -26,7 +26,9 @@ "from EduNLP.Pretrain import BertTokenizer \n", "import json\n", "from utils import Dataset_bert, load_json, get_val, get_train\n", - "from common import ROOT, DATA_DIR" + "\n", + "ROOT = os.path.dirname(os.path.dirname(__file__))\n", + "DATA_DIR = os.path.join(ROOT, \"data\")" ] }, { diff --git a/examples/diff_disc_pred/src/disc_pred.ipynb b/examples/downstream/discrimination_prediction.ipynb similarity index 98% rename from examples/diff_disc_pred/src/disc_pred.ipynb rename to examples/downstream/discrimination_prediction.ipynb index 4c32456d..ab3a98d4 100644 --- a/examples/diff_disc_pred/src/disc_pred.ipynb +++ b/examples/downstream/discrimination_prediction.ipynb @@ -37,7 +37,9 @@ "from EduNLP.Pretrain import BertTokenizer \n", "import json\n", "from utils import Dataset_bert, pre_disc\n", - "from common import ROOT, DATA_DIR" + "\n", + "ROOT = os.path.dirname(os.path.dirname(__file__))\n", + "DATA_DIR = os.path.join(ROOT, \"data\")" ] }, { diff --git a/examples/diff_disc_pred/src/utils/dataset.py b/examples/downstream/utils.py similarity index 77% rename from examples/diff_disc_pred/src/utils/dataset.py rename to examples/downstream/utils.py index abf50a8e..935dff6b 100644 --- a/examples/diff_disc_pred/src/utils/dataset.py +++ b/examples/downstream/utils.py @@ -1,287 +1,371 @@ -from torch.utils.data import Dataset -from 
EduNLP.ModelZoo.utils import pad_sequence -import torch -import numpy as np -from EduNLP.I2V import W2V, Bert -from EduNLP.Pretrain import BertTokenizer -from tqdm import tqdm -from gensim.models import KeyedVectors -import os -import re -import jieba - -class BaseDataset(Dataset): - def __init__(self, items, tokenizer, mode="train", labal_key="difficulty"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - self.labal_key = labal_key - - - def __getitem__(self, index): - item = self.items[index] - ret = self.tokenizer(item["content"]) - if self.mode in ["train", "val"]: - ret["labels"] = item[self.labal_key] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch = {key: torch.as_tensor(val) for key, val in batch.items()} - # print("[debug] batch final: ", batch) - return batch - -class Dataset_bert(Dataset): - def __init__(self, items, tokenizer): - self.tokenizer = tokenizer - self.items = items - - self.preprocess() - def preprocess(self): - for item in tqdm(self.items): - content_vector = self.tokenizer(item['content'], return_tensors="np") - item["content"] = content_vector - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = self.tokenizer.bert_tokenizer.pad(batch["content"], return_tensors='pt') - batch["labels"] = torch.as_tensor(batch["labels"]) - return batch - -class Dataset_bert_jiuzhang(Dataset): - def __init__(self, items, tokenizer): - self.tokenizer = tokenizer - self.items = items - - self.preprocess() - def preprocess(self): - for item in tqdm(self.items): - content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) - item["content"] = content_vector - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) #jiuzhang - batch["labels"] = torch.as_tensor(batch["labels"]) - return batch - -class DiffiultyDataset_w2v(Dataset): - def __init__(self, items, pretrained_path, mode="train"): - self.pretrained_path = pretrained_path - self.items = items - self.mode = mode - self.i2v = W2V("pure_text", "w2v", pretrained_path) - self.preprocess() - - def preprocess(self): - for item in tqdm(self.items): - item['content'] = torch.FloatTensor(np.array(self.i2v.infer_item_vector([item["content"]]))).squeeze(0) - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = torch.stack(batch["content"]) - batch["labels"] = torch.as_tensor(batch["labels"]) - batch = {key: torch.as_tensor(val).squeeze(0) for key, val in 
batch.items()} - return batch - -class PubWord2Vector(object): - def __init__(self, pretrained_path, language=""): - self.language = language - if language == "mix": - assert os.path.isdir(pretrained_path) - - self.eng_w2v_path = f"{pretrained_path}/glove.6B.300d.word2vec" - self.chs_w2v_path = f"{pretrained_path}/sgns.baidubaike.bigram-char.bz2" - - self.eng_w2v = KeyedVectors.load_word2vec_format(self.eng_w2v_path, binary=False) - self.chs_w2v = KeyedVectors.load_word2vec_format(self.chs_w2v_path, binary=False) - else: - assert os.path.isfile(pretrained_path) - try: - self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=False) - except Exception as e: - print(e) - self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=True) - def word_to_embedding(self, word): - if self.language: - if re.search(u'[\u4E00-\u9FA5]', word): - if word in self.chs_w2v: - return self.chs_w2v[word] - else: - count = 0 - temp_array = np.zeros([300]) - for i in word: - if i in self.chs_w2v: - temp_array += self.chs_w2v[i] - count += 1 - if count != 0: - temp_array /= count - return temp_array - else: - if word in self.eng_w2v: - return self.eng_w2v[word] - elif word.lower() in self.eng_w2v: - return self.eng_w2v[word.lower()] - else: - temp_array = np.zeros([300]) - return temp_array - else: - if word in self.w2v: - return self.w2v[word] - elif word.lower() in self.w2v: - return self.w2v[word.lower()] - else: - temp_array = np.zeros([300]) - return temp_array - - @property - def vector_size(self): - return 300 - -class DiffiultyDataset_w2v_pub(Dataset): - def __init__(self, items, pretrained_path, mode="train"): - self.pretrained_path = pretrained_path - self.items = items - self.mode = mode - self.i2v = PubWord2Vector(pretrained_path) - self.preprocess() - - def preprocess(self): - for item in tqdm(self.items): - words = jieba.lcut(item["content"]) - item_vector = [] - for word in words: - temp_emb = self.i2v.word_to_embedding(word) - item_vector.append(temp_emb) - item_vector = torch.FloatTensor(np.mean(item_vector, axis=0)) if item_vector else torch.FloatTensor(np.zeros([300])) - item['content'] = item_vector - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - - batch["content"] = torch.stack(batch["content"]) - batch["labels"] = torch.as_tensor(batch["labels"]) - batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} - return batch - - -class DiscriminationDataset(Dataset): - def __init__(self, items, tokenizer, mode="train"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - - - def __getitem__(self, index): - item = self.items[index] - ret = self.tokenizer(item["content"]) - if self.mode in ["train", "val"]: - ret["labels"] = item["labels"] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.bert_tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch["content"] = torch.tensor( [it.cpu().detach().numpy() for it in batch["content"]] ) - batch["labels"] = torch.tensor([it.cpu().detach().numpy() for it in batch["labels"]]) - return 
batch - - -class ReliabilityDataset(Dataset): - def __init__(self, items, tokenizer, mode="train"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - - - def __getitem__(self, index): - item = self.items[index] - ret = self.tokenizer(item["content"]) - if self.mode in ["train", "val"]: - ret["labels"] = item["reliability"] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.bert_tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch = {key: torch.as_tensor(val) for key, val in batch.items()} +from torch.utils.data import Dataset +from EduNLP.ModelZoo.utils import pad_sequence +import torch +import numpy as np +from EduNLP.I2V import W2V, Bert +from EduNLP.Pretrain import BertTokenizer +from tqdm import tqdm +from gensim.models import KeyedVectors +import json +import os +import re +import warnings +import jieba +import pandas as pd + + +def check2mkdir(file_path): + dir_path = os.path.dirname(file_path) + if not os.path.exists(dir_path): + os.makedirs(dir_path) + +def save_json(save_data, output_path): + print("[save_json] start : {}".format(output_path)) + check2mkdir(output_path) + with open(output_path,'w+',encoding="utf-8") as f: + for row_dic in save_data: + try: + jsondata=json.dumps(row_dic, ensure_ascii=False) + f.write(jsondata + "\n") + except Exception as e: + print("[Exception] at {}:\n{}\n".format(row_dic, e)) + raise Exception("[save_json] 出现错误") + print("[save_json] num = {}, open_path = {}".format(len(save_data), output_path)) + + +def get_json(open_path, error_handler="raise"): + print("[get_json] start : {}".format(open_path)) + load_data = [] + i = 0 + with open(open_path, 'r', encoding="utf-8") as f: + try: + for line in f: + load_data.append(json.loads(line)) + i += 1 + except Exception as e: + if error_handler == "ignore": + warnings.warn("[Warning] at line {}:\n{}\n".format(i, e)) + else: + print("[Exception] at line {}:\n{}\n".format(i, e)) + raise Exception("[get_json] 出现错误") + print("[get_json] num = {}, open_path = {}".format(len(load_data), open_path)) + return load_data + +def load_json(open_path): + print("[load_json] start : {}".format(open_path)) + with open(open_path, "r", encoding='utf-8') as f: + load_q = json.load(f) + print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) + return load_q + +def pre_disc(csv_path): + items = pd.read_csv(csv_path) + stem = items["stem"].tolist() + disc = items["disc"].tolist() + data = [] + for i in range(len(stem)): + dic = {} + dic["content"] = stem[i] + dic["labels"] = disc[i] + data.append(dic) + return data + +def get_train(train): + train_data = [] + for item in train: + dic = {} + dic["content"] = item["content"] + dic["labels"] = float(item["difficulty"]) + train_data.append(dic) + return train_data + +def get_val(val): + test_data, test_gap = [], [] + start, end = 0, 0 + for batch in val: + end += len(batch['questions']) + for item in batch['questions']: + dic = {} + dic['content'] = item["stem"] + dic['labels'] = item['diff'] + #dic["labels"] = dic.pop("difficulty") + test_data.append(dic) + test_gap.append([start, end]) + start = end + return test_data, test_gap + +class BaseDataset(Dataset): + def __init__(self, items, tokenizer, mode="train", labal_key="difficulty"): + 
self.tokenizer = tokenizer + self.items = items + self.mode = mode + self.labal_key = labal_key + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item[self.labal_key] + return ret + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch = {key: torch.as_tensor(val) for key, val in batch.items()} + # print("[debug] batch final: ", batch) + return batch + +class Dataset_bert(Dataset): + def __init__(self, items, tokenizer): + self.tokenizer = tokenizer + self.items = items + + self.preprocess() + def preprocess(self): + for item in tqdm(self.items): + content_vector = self.tokenizer(item['content'], return_tensors="np") + item["content"] = content_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = self.tokenizer.bert_tokenizer.pad(batch["content"], return_tensors='pt') + batch["labels"] = torch.as_tensor(batch["labels"]) + return batch + +class Dataset_bert_jiuzhang(Dataset): + def __init__(self, items, tokenizer): + self.tokenizer = tokenizer + self.items = items + + self.preprocess() + def preprocess(self): + for item in tqdm(self.items): + content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) + item["content"] = content_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) #jiuzhang + batch["labels"] = torch.as_tensor(batch["labels"]) + return batch + +class DiffiultyDataset_w2v(Dataset): + def __init__(self, items, pretrained_path, mode="train"): + self.pretrained_path = pretrained_path + self.items = items + self.mode = mode + self.i2v = W2V("pure_text", "w2v", pretrained_path) + self.preprocess() + + def preprocess(self): + for item in tqdm(self.items): + item['content'] = torch.FloatTensor(np.array(self.i2v.infer_item_vector([item["content"]]))).squeeze(0) + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + batch["content"] = torch.stack(batch["content"]) + batch["labels"] = torch.as_tensor(batch["labels"]) + batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} + return batch + +class PubWord2Vector(object): + def __init__(self, pretrained_path, language=""): + self.language = language + if language == "mix": + assert os.path.isdir(pretrained_path) + + self.eng_w2v_path = f"{pretrained_path}/glove.6B.300d.word2vec" + self.chs_w2v_path = f"{pretrained_path}/sgns.baidubaike.bigram-char.bz2" + + self.eng_w2v = 
KeyedVectors.load_word2vec_format(self.eng_w2v_path, binary=False) + self.chs_w2v = KeyedVectors.load_word2vec_format(self.chs_w2v_path, binary=False) + else: + assert os.path.isfile(pretrained_path) + try: + self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=False) + except Exception as e: + print(e) + self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=True) + def word_to_embedding(self, word): + if self.language: + if re.search(u'[\u4E00-\u9FA5]', word): + if word in self.chs_w2v: + return self.chs_w2v[word] + else: + count = 0 + temp_array = np.zeros([300]) + for i in word: + if i in self.chs_w2v: + temp_array += self.chs_w2v[i] + count += 1 + if count != 0: + temp_array /= count + return temp_array + else: + if word in self.eng_w2v: + return self.eng_w2v[word] + elif word.lower() in self.eng_w2v: + return self.eng_w2v[word.lower()] + else: + temp_array = np.zeros([300]) + return temp_array + else: + if word in self.w2v: + return self.w2v[word] + elif word.lower() in self.w2v: + return self.w2v[word.lower()] + else: + temp_array = np.zeros([300]) + return temp_array + + @property + def vector_size(self): + return 300 + +class DiffiultyDataset_w2v_pub(Dataset): + def __init__(self, items, pretrained_path, mode="train"): + self.pretrained_path = pretrained_path + self.items = items + self.mode = mode + self.i2v = PubWord2Vector(pretrained_path) + self.preprocess() + + def preprocess(self): + for item in tqdm(self.items): + words = jieba.lcut(item["content"]) + item_vector = [] + for word in words: + temp_emb = self.i2v.word_to_embedding(word) + item_vector.append(temp_emb) + item_vector = torch.FloatTensor(np.mean(item_vector, axis=0)) if item_vector else torch.FloatTensor(np.zeros([300])) + item['content'] = item_vector + + def __getitem__(self, index): + item = self.items[index] + return item + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + + batch["content"] = torch.stack(batch["content"]) + batch["labels"] = torch.as_tensor(batch["labels"]) + batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} + return batch + + +class DiscriminationDataset(Dataset): + def __init__(self, items, tokenizer, mode="train"): + self.tokenizer = tokenizer + self.items = items + self.mode = mode + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item["labels"] + return ret + + def __len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.bert_tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch["content"] = torch.tensor( [it.cpu().detach().numpy() for it in batch["content"]] ) + batch["labels"] = torch.tensor([it.cpu().detach().numpy() for it in batch["labels"]]) + return batch + + +class ReliabilityDataset(Dataset): + def __init__(self, items, tokenizer, mode="train"): + self.tokenizer = tokenizer + self.items = items + self.mode = mode + + + def __getitem__(self, index): + item = self.items[index] + ret = self.tokenizer(item["content"]) + if self.mode in ["train", "val"]: + ret["labels"] = item["reliability"] + return ret + + def 
__len__(self): + return len(self.items) + + + def collate_fn(self, batch_data): + bert_tokenizer=self.tokenizer.bert_tokenizer + pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) + first = batch_data[0] + batch = { + k: [item[k] for item in batch_data] for k in first.keys() + } + for k,v in batch.items(): + if k != "labels": + batch[k] = pad_sequence(v, pad_val=pad_idx) + + batch = {key: torch.as_tensor(val) for key, val in batch.items()} return batch \ No newline at end of file From eb4964300549e17ed0d9757f584cc1660df6af4a Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Sun, 2 Jul 2023 13:14:56 +0000 Subject: [PATCH 28/58] update gitignore --- .gitignore | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ea2af581..d22fa043 100644 --- a/.gitignore +++ b/.gitignore @@ -110,4 +110,10 @@ venv.bak/ .pyre/ # User Definition -data/ \ No newline at end of file +data/ +deprecated/ +tmp*/ +jieba.cache +*.kv +*.zip +examples/test_model \ No newline at end of file From 9394c61a9b2bf1b06ec63e5453e7d0bb12862584 Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Sun, 2 Jul 2023 14:05:25 +0000 Subject: [PATCH 29/58] update AUTHORS.md --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index f1a9fafc..870aed9c 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -20,5 +20,6 @@ [Jundong Wu](https://github.com/wintermelon008) +[Shangzi Xue](https://github.com/ShangziXue) The stared contributors are the corresponding authors. From 19b540fe2af6956903e7ab1776a2c565e1252058 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Sun, 2 Jul 2023 14:08:00 +0000 Subject: [PATCH 30/58] [DOC] Add paper segmentation --- examples/downstream/paper_seg/data.py | 160 +++++++++++++ examples/downstream/paper_seg/model.py | 216 ++++++++++++++++++ examples/downstream/paper_seg/paper_seg.ipynb | 215 +++++++++++++++++ examples/downstream/paper_seg/trainer.py | 144 ++++++++++++ examples/downstream/paper_seg/utils.py | 39 ++++ 5 files changed, 774 insertions(+) create mode 100644 examples/downstream/paper_seg/data.py create mode 100644 examples/downstream/paper_seg/model.py create mode 100644 examples/downstream/paper_seg/paper_seg.ipynb create mode 100644 examples/downstream/paper_seg/trainer.py create mode 100644 examples/downstream/paper_seg/utils.py diff --git a/examples/downstream/paper_seg/data.py b/examples/downstream/paper_seg/data.py new file mode 100644 index 00000000..24d49265 --- /dev/null +++ b/examples/downstream/paper_seg/data.py @@ -0,0 +1,160 @@ +import os +import numpy as np +from tqdm import tqdm +import jieba +import re +import torch +from torch.utils.data import Dataset +from torch.autograd import Variable +import torch.nn.functional as F +from EduNLP.I2V import W2V, Bert, DisenQ +import warnings + + +VECTOR_MODEL_MAP = { + "w2v": W2V, + "bert": Bert, + "disenq": DisenQ, +} + +class PaperI2V(): + def __init__(self, pretrained_model_type, pretrained_model_dir, device="cpu", language=""): + self.pretrained_model_type = pretrained_model_type + self.pretrained_model_dir = pretrained_model_dir + self.device = device + + tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir} + + if pretrained_model_type == "w2v": + # set text tokenizer + text_params = { + "granularity": "word", + "stopwords": None, + } + tokenizer_kwargs["text_params"] = text_params + self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]("pure_text", + 'w2v', + pretrained_model_dir, + 
tokenizer_kwargs=tokenizer_kwargs
+                                                           )
+        elif pretrained_model_type in ["bert"]:
+            self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]('bert',
+                                                           'bert',
+                                                           pretrained_model_dir,
+                                                           tokenizer_kwargs=tokenizer_kwargs,
+                                                           device=device
+                                                           )
+        elif pretrained_model_type in ["disenq"]:
+            self.i2v = VECTOR_MODEL_MAP[pretrained_model_type]('disenq',
+                                                           'disenq',
+                                                           pretrained_model_dir,
+                                                           tokenizer_kwargs=tokenizer_kwargs,
+                                                           device=device
+                                                           )
+    @classmethod
+    def process_line(cls, aline):
+        return aline
+
+    def to_embedding(self, aline):
+        aline = self.process_line(aline)
+        if aline == "":
+            # guard against empty lines before any embedding work is done
+            warnings.warn("[ERROR] to_embedding: aline is empty")
+            return None
+
+        if self.pretrained_model_type == "w2v_pub":
+            words = jieba.lcut(aline)
+            token_vector = []
+            for word in words:
+                if re.sub(r'\s', '', word) != '':
+                    temp_emb = self.i2v.word_to_embedding(word)
+                    token_vector.append(temp_emb.tolist())
+            token_vector = torch.FloatTensor(token_vector)
+        elif self.pretrained_model_type == "w2v":
+            token_vector = torch.FloatTensor(np.array(self.i2v.infer_token_vector([aline]))).squeeze(0)
+        elif self.pretrained_model_type == "bert":
+            token_vector = self.i2v.infer_token_vector([aline]).squeeze(0)
+            token_vector = token_vector.float().cpu().detach()
+        elif self.pretrained_model_type == "disenq":
+            token_vector = self.i2v.infer_token_vector([aline]).squeeze(0)
+            token_vector = token_vector.float().cpu().detach()
+
+        return token_vector
+
+class VecDataset(Dataset):
+    def __init__(self,
+                 language="",
+                 text_data_dir=None,
+                 emb_data_path=None,
+                 pretrained_model_type="bert",
+                 pretrained_model_dir=None,
+                 paper_i2v: PaperI2V = None,
+                 device="cpu",
+                 mode="train",
+                 do_w2v = False,
+                 ):
+        self.device = device
+        self.text_data_dir = text_data_dir
+        self.emb_data_path = emb_data_path
+        self.pretrained_model_type = pretrained_model_type
+        self.pretrained_model_dir = pretrained_model_dir
+        self.mode = mode
+        self.input_data = []
+
+        if paper_i2v is not None:
+            self.paper_i2v = paper_i2v
+        else:
+            # adapt to the bilingual public w2v embeddings
+            language = "english" if language == "english" else ""
+            self.paper_i2v = PaperI2V(pretrained_model_type,
+                                      pretrained_model_dir,
+                                      device=self.device,
+                                      language=language)
+
+        if not os.path.exists(emb_data_path) or do_w2v:
+            os.makedirs(os.path.dirname(emb_data_path), exist_ok=True)
+            self.set_all_text_embedding(text_data_dir, emb_data_path)
+        else:
+            self.get_all_text_embedding(emb_data_path)
+
+    @property
+    def embed_dim(self):
+        return self.paper_i2v.vector_size
+
+    def __getitem__(self, index):
+        # return self.input_data[index]
+        doc, tag = self.input_data[index]
+        self.input_data[index][0] = [ sent.to(self.device) for sent in doc]
+        self.input_data[index][1] = tag.to(self.device)
+        return self.input_data[index]
+
+    def __len__(self):
+        return len(self.input_data)
+
+    def set_all_text_embedding(self, indir, outpath):
+        print(f'setting {self.mode} data ... ')
+        path_list = os.listdir(indir)
+        for file_name in tqdm(path_list):
+            file_path = os.path.join(indir, file_name)
+            doc, tag = self.paper_i2v.get_tagged_text_to_embedding(file_path)
+            self.input_data.append( [doc, tag] )
+        torch.save(self.input_data, outpath)
+
+    def get_all_text_embedding(self, inpath):
+        print(f'loading {self.mode} data ... 
') + self.input_data = torch.load(inpath) + + def pad(self, tags): + max_length = max([_tags.size()[0] for _tags in tags]) + for i, _tags in enumerate(tags): + _length = _tags.size()[0] + tags[i] = F.pad(_tags, (0, max_length - _length)) # (max_length) + return torch.stack(tags, dim=0) + + def collcate_fn(self, batch_data): + documents, tags = list(zip(*batch_data)) + batch = { + "documents": list(documents), + "tags": self.pad(list(tags)), + } + return batch \ No newline at end of file diff --git a/examples/downstream/paper_seg/model.py b/examples/downstream/paper_seg/model.py new file mode 100644 index 00000000..466c672a --- /dev/null +++ b/examples/downstream/paper_seg/model.py @@ -0,0 +1,216 @@ +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence +from transformers.modeling_outputs import ModelOutput +from EduNLP.ModelZoo.base_model import BaseModel +import numpy as np +import json +import os +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) + + +def zero_state(module, batch_size, device): + # * 2 is for the two directions + return torch.zeros(module.num_layers * 2, batch_size, module.hidden_dim).to(device), \ + torch.zeros(module.num_layers * 2, batch_size, module.hidden_dim).to(device) + + +def unsort(sort_order): + result = [-1] * len(sort_order) + for i, index in enumerate(sort_order): + result[index] = i + return result + +class SentenceEncoder(nn.Module): + def __init__(self, input_size, hidden_dim, num_layers): + super().__init__() + self.input_size = input_size + self.hidden_dim = hidden_dim + self.num_layers = num_layers + + self.lstm = nn.LSTM(input_size=self.input_size, + hidden_size=self.hidden_dim, + num_layers=self.num_layers, + dropout=0, + bidirectional=True) # batch_first=False + + def forward(self, sentence_embs, sentence_lens): + """ + Max-pooling for sentence representations + """ + batch_size = sentence_embs.shape[1] + device = sentence_embs.device + s = zero_state(self, batch_size, device=device) + packed_tensor = pack_padded_sequence(sentence_embs, sentence_lens) + packed_output, _ = self.lstm(packed_tensor, s) + padded_output, lengths = pad_packed_sequence(packed_output) # (max sentence len, batch, 256*2) + maxes = torch.zeros(batch_size, padded_output.size(2)).to(device) + for i in range(batch_size): + maxes[i, :] = torch.max(padded_output[:lengths[i], i, :], 0)[0] + + return maxes + +class TrainForPaperSegOutput(ModelOutput): + loss: torch.FloatTensor = None + logits: torch.FloatTensor = None + + +class PaperSegModel(BaseModel): + def __init__(self, embed_dim, hidden_dim, num_layers): + super().__init__() + self.embed_dim = embed_dim + self.hidden_dim = hidden_dim + self.num_layers = num_layers + self.sent_batch_size = None + + self.sentence_encoder = SentenceEncoder(self.embed_dim, + self.hidden_dim, + self.num_layers) + self.lstm = nn.LSTM(input_size=self.hidden_dim*2, + hidden_size=self.hidden_dim, + num_layers=self.num_layers, + dropout=0, + bidirectional=True) # batch_first=False + self.full_connect = nn.Linear(self.hidden_dim*2, 2) # 2 label + # self.reset_parameters() + + self.criterion = torch.nn.CrossEntropyLoss() + + self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]} + self.config['architecture'] = 'PaperSegModel' + + def set_sentence_batch(self, sent_batch_size): + self.sent_batch_size = sent_batch_size + + def make_bach_sentences(self, sentences, lens, 
sent_batch_size):
+        idx = 0
+        batch_sentences, batch_lens = [], []
+        while idx < len(sentences):
+            next_idx = idx + sent_batch_size if idx + sent_batch_size <= len(sentences) else len(sentences)
+
+            max_length = max(lens[idx: next_idx])
+            padded_sentences = [self.pad_sent(s, max_length) for s in sentences[idx: next_idx]]
+            padded_sentences = torch.stack(padded_sentences, dim=1)
+
+            batch_sentences.append(padded_sentences)
+            batch_lens.append(lens[idx: next_idx])
+            idx = next_idx
+        return batch_sentences, batch_lens
+
+    def reset_parameters(self):
+        for name, param in self.lstm.named_parameters():
+            if name.startswith('weight'):
+                init.orthogonal_(param)
+            else:
+                assert name.startswith('bias')
+                init.constant_(param, 0.)
+        for name, param in self.full_connect.named_parameters():
+            if name.startswith('weight'):
+                init.orthogonal_(param)
+            else:
+                assert name.startswith('bias')
+                init.constant_(param, 0.)
+
+    def pad_sent(self, seq, max_length):
+        s_length = seq.size()[0]
+        padded = F.pad(seq, (0, 0, 0, max_length - s_length))  # (max_length, embed_dim)
+        return padded
+
+    def pad_document(self, d, max_document_length):
+        d_length = d.size()[0]
+        v = d.unsqueeze(0).unsqueeze(0)
+        padded = F.pad(v, (0, 0, 0, max_document_length - d_length ))  # (1, 1, max_length, 300)
+        shape = padded.size()
+        return padded.view(shape[2], 1, shape[3])  # (max_length, 1, 300)
+
+    def forward(self, documents=None, tags=None):  # a batch of documents
+        """
+        documents:
+            [ sentences[word_embeddings[Tensor([300]), ... ], ...], ...]
+        tags:
+            torch.Tensor()
+        """
+        batch_size = len(documents)
+        device = documents[0][0][0].device
+        document_lens = []
+        all_sentences = []
+        for document in documents:
+            all_sentences.extend(document)
+            document_lens.append(len(document))
+
+        # sort sentences by length (descending) for pack_padded_sequence
+        all_sentence_lens = [s.size()[0] for s in all_sentences]
+        sort_order = np.argsort(all_sentence_lens)[::-1]
+        sorted_sentences = [all_sentences[i] for i in sort_order]
+        sorted_lengths = [s.size()[0] for s in sorted_sentences]
+
+        # encode sentences (in mini-batches when sent_batch_size is set)
+        if self.sent_batch_size is not None:
+            all_encoded_sentences = []
+            all_sent_embs, all_sent_lens = self.make_bach_sentences(sorted_sentences, sorted_lengths, self.sent_batch_size)
+            for batch_sent_embs, batch_sent_lens in zip(all_sent_embs, all_sent_lens):
+                batch_encoded_sentences = self.sentence_encoder(batch_sent_embs, batch_sent_lens)
+                all_encoded_sentences.extend(batch_encoded_sentences)
+            all_encoded_sentences = torch.stack(all_encoded_sentences, dim=0)
+        else:
+            max_length = max(all_sentence_lens)
+            sorted_padded_sentences = [self.pad_sent(s, max_length) for s in sorted_sentences]
+            sorted_padded_sentences = torch.stack(sorted_padded_sentences, dim=1)
+            all_encoded_sentences = self.sentence_encoder(sorted_padded_sentences, sorted_lengths)
+        unsort_order = torch.LongTensor(unsort(sort_order)).to(device)
+        unsorted_encodings = all_encoded_sentences.index_select(0, unsort_order)
+
+        index = 0
+        encoded_documents = []
+        for sentences_count in document_lens:
+            end_index = index + sentences_count
+            encoded_documents.append(unsorted_encodings[index: end_index, :])
+            index = end_index
+
+        # sort documents by length, pad, then pack for the document-level LSTM
+        max_doc_size = np.max(document_lens)
+        ordered_document_idx = np.argsort(document_lens)[::-1]
+        ordered_doc_sizes = sorted(document_lens)[::-1]
+        ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
+        padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
+        docs_tensor = torch.cat(padded_docs, 1)
+        packed_docs = 
pack_padded_sequence(docs_tensor, ordered_doc_sizes) + sentence_lstm_output, _ = self.lstm(packed_docs, zero_state(self, batch_size=batch_size, device=device)) + padded_x, _ = pad_packed_sequence(sentence_lstm_output) + + doc_outputs = [] + for i, doc_len in enumerate(ordered_doc_sizes): + # doc_outputs.append(padded_x[0:doc_len - 1, i, :]) # -1 to remove last prediction + doc_outputs.append(padded_x[:, i, :]) + + unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)] + self.sentence_outputs = torch.cat(unsorted_doc_outputs, 0) + + logits = self.full_connect(self.sentence_outputs) + loss = None + if tags is not None: + loss = self.criterion(logits, tags.view(-1)) + + return TrainForPaperSegOutput( + loss=loss, + logits=logits + ) + + @classmethod + def from_config(cls, config_path, **kwargs): + with open(config_path, "r", encoding="utf-8") as rf: + model_config = json.load(rf) + model_config.update(kwargs) + return cls( + embed_dim=model_config["embed_dim"], + hidden_dim=model_config["hidden_dim"], + num_layers=model_config["num_layers"] + ) + + def save_config(self, config_dir): + config_path = os.path.join(config_dir, "config.json") + with open(config_path, "w", encoding="utf-8") as wf: + json.dump(self.config, wf, ensure_ascii=False, indent=2) \ No newline at end of file diff --git a/examples/downstream/paper_seg/paper_seg.ipynb b/examples/downstream/paper_seg/paper_seg.ipynb new file mode 100644 index 00000000..598e0290 --- /dev/null +++ b/examples/downstream/paper_seg/paper_seg.ipynb @@ -0,0 +1,215 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 试卷切分" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/data/qlh/anaconda3/envs/py39/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "from data import VecDataset\n", + "from trainer import MyTrainer\n", + "from model import PaperSegModel\n", + "from utils import get_logger, ROOT_DIR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 以DisneQNet为例\n", + "class Args:\n", + " subject = \"math\"\n", + " data_path = os.path.join(ROOT_DIR, \"data\")\n", + " checkpoint_dir = os.path.join(ROOT_DIR, \"checkpoint\")\n", + " \n", + " pretrained_model_type=\"disenqnet\"\n", + " pretrained_model_dir=\"/path/to/disenqnet/checkpoint\"\n", + "\n", + " device=\"cpu\"\n", + "\n", + "args = Args()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args.train_data_path = f\"{args.data_path}/train/paper_txt_tagged/{args.subject}\"\n", + "args.valid_data_path = f\"{args.data_path}/valid/paper_txt_tagged/{args.subject}\"\n", + "args.test_data_path = f\"{args.data_path}/test/paper_txt_tagged/{args.subject}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# logger\n", + "logfile = f'{args.checkpoint_dir}/train.log'\n", + "logger = get_logger(logfile)\n", + "# tensorboard\n", + "tensorboard_dir = f'{args.checkpoint_dir}/tensorboard'\n", + "os.makedirs(tensorboard_dir, exist_ok=True)\n", + "tensorboard_writer = SummaryWriter(tensorboard_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加载向量数据集" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = VecDataset(\n", + " text_data_dir=args.train_data_path,\n", + " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".train.pt\",\n", + " mode=\"train\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " device=args.device,\n", + " )\n", + "valid_set = VecDataset(\n", + " text_data_dir=args.valid_data_path,\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".valid.pt\",\n", + " mode=\"valid\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " paper_i2v=train_set.paper_i2v,\n", + " device=args.device,\n", + " )\n", + "test_set = VecDataset(\n", + " text_data_dir=args.test_data_path,\n", + " emb_data_path=args.test_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".test.pt\",\n", + " mode=\"test\",\n", + " pretrained_model_type=args.pretrained_model_type,\n", + " pretrained_model_dir=args.pretrained_model_dir,\n", + " paper_i2v=train_set.paper_i2v,\n", + " device=args.device,\n", + " )\n", + "train_loader = DataLoader(train_set, batch_size=4, shuffle=True, num_workers=0, collate_fn=train_set.collcate_fn)\n", + "valid_loader = DataLoader(valid_set, batch_size=1, shuffle=False, num_workers=0, collate_fn=valid_set.collcate_fn)\n", + "test_loader = DataLoader(test_set, batch_size=1, shuffle=False, num_workers=0, collate_fn=test_set.collcate_fn)" + ] + 
}, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加载模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PaperSegModel(\n", + " embed_dim=train_set.embed_dim,\n", + " hidden_dim=256,\n", + " num_layers=2\n", + " )\n", + "model = model.to(args.device)\n", + "logger.info('prepare model have done!')\n", + "# model.save_pretrained(args.checkpoint_dir)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练和评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)\n", + "trainer = MyTrainer(\n", + " args=args,\n", + " model=model,\n", + " optimizer=optimizer,\n", + " logger=logger,\n", + " tensorboard_writer=tensorboard_writer,\n", + ")\n", + "trainer.train(train_loader, valid_loader)\n", + "logger.info(\"Finish training ... \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PaperSegModel.from_pretrained(args.checkpoint_dir).to(args.device)\n", + "trainer = MyTrainer(\n", + " args=args,\n", + " model=model,\n", + " logger=logger,\n", + ")\n", + "trainer.valid(test_loader)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/paper_seg/trainer.py b/examples/downstream/paper_seg/trainer.py new file mode 100644 index 00000000..572c5215 --- /dev/null +++ b/examples/downstream/paper_seg/trainer.py @@ -0,0 +1,144 @@ +import os +import sys +# sys.path.append(os.path.dirname(__file__)) +import torchmetrics +import math +import numpy as np +import torch +from torchmetrics.classification import BinaryPrecision, BinaryRecall, BinaryF1Score +from torchmetrics import MetricCollection +from utils import get_pk +from model import PaperSegModel + +def my_cuda_tensor(items, device): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(device) + elif isinstance(v, list): + items[k] = my_cuda_document(v, device) + return items + +def my_cuda_document(v, device): + if isinstance(v, torch.Tensor): + v = v.to(device) + elif isinstance(v, list): + v = [my_cuda_document(x, device) for x in v] + return v + +class MyTrainer(object): + def __init__(self, args, model, optimizer=None, scheduler=None, logger=None, tensorboard_writer=None, **kwargs): + self.args = args + self.model = model + + self.optimizer = optimizer + self.scheduler = scheduler + self.logger = logger + self.tensorboard_writer = tensorboard_writer + + self.Metric = MetricCollection([ + BinaryPrecision(), + BinaryRecall(), + BinaryF1Score(), + ]).to(args.device) + + def train(self, train_dataloader, eval_dataloader): + self._global_step = 0 + size = len(train_dataloader.dataset) // train_dataloader.batch_size + + # best_val_metric = + best_valid_loss = 9999 + best_epoch = None + + for epoch in range(self.args.epochs): + self.model.train() + # train + train_loss = 0 + self.logger.info(f"------ epoch {epoch} ------") + for batch in train_dataloader: # 
batch_size = 1 + batch = my_cuda_tensor(batch, self.args.device) + + # self.model.zero_grad() + self.optimizer.zero_grad() + + outputs = self.model(**batch) + + self.tensorboard_writer.add_scalar("train_loss", outputs.loss.item(), self._global_step) + self._global_step +=1 + train_loss += outputs.loss.item() + + # Backpropagation + loss = outputs.loss + loss.backward() + self.optimizer.step() + + train_loss /= size + self.logger.info(f'epoch {epoch:3} done, loss = {train_loss}') + # validate + valid_loss, valid_pk, valid_metrics = self.valid(eval_dataloader) + + for k, v in valid_metrics.items(): + self.tensorboard_writer.add_scalar(f"Metric/{k}", v, epoch) + self.tensorboard_writer.add_scalar(f"Metric/pk", valid_pk, epoch) + self.tensorboard_writer.add_scalars(f"EpochLoss", {"TrainLoss": train_loss, "ValidLoss": valid_loss}, epoch) + + # store best model + # if valid_metrics["BinaryF1Score"] > best_val_metric: + # best_val_metric = valid_metrics["BinaryF1Score"] + if valid_loss < best_valid_loss: + best_valid_loss = valid_loss + + try: + self.model.save_pretrained(self.args.checkpoint_dir) + except Exception: + self.logger.info("[Warning] Model.save_pretrained Error !!!") + + with open(f'{self.args.checkpoint_dir}/best_model.bin', mode='wb') as f: + torch.save(self.model.state_dict(), f) + with open(f'{self.args.checkpoint_dir}/best_model.obj.bin', mode='wb') as f: + torch.save(self.model, f) + + best_epoch = epoch + self.logger.info(f"saving best model at epoch {epoch}...") + + self.logger.info(f"Finish training, best model is at epoch {best_epoch}...") + + def valid(self, valid_dataloader): + assert valid_dataloader.batch_size == 1 + self.model.eval() + size = len(valid_dataloader.dataset) // valid_dataloader.batch_size + valid_pk_list = [] + valid_loss = 0 + for batch in valid_dataloader: + batch = my_cuda_tensor(batch, self.args.device) + # documents, tags = batch + # for document, tag in zip(documents, tags): + if isinstance(self.model, PaperSegModel): + outputs = self.model(**batch) + flat_pred_tags = torch.argmax(torch.softmax(outputs.logits, 1), 1) + valid_loss += outputs.loss.item() + else: + outputs = self.model(batch["documents"]) + flat_pred_tags = torch.argmax(torch.softmax(outputs, 1), 1) + + criterion = torch.nn.CrossEntropyLoss() + loss = criterion(outputs, batch["tags"].view(-1)) + valid_loss += loss.item() + + """ compute Metric""" + self.Metric.update(flat_pred_tags, batch["tags"].view(-1)) + """ compute pk""" + tag = batch["tags"].detach().cpu().numpy() + pred_tag = flat_pred_tags.detach().cpu().numpy() + document = batch["documents"][0] + """ only can compute each document at each time """ + k = max(math.ceil(len(document)/2), 2) + pk = get_pk(pred_tag, tag, k) + valid_pk_list.append(pk) + + valid_loss /= size + valid_pk = np.mean(valid_pk_list) + valid_metrics = {k: v.item() for k,v in self.Metric.compute().items()} + self.logger.info(f"Validate: valid_loss= {valid_loss}, valid_pk= {valid_pk}, Validation metric: {valid_metrics}") + self.Metric.reset() + + return valid_loss, valid_pk, valid_metrics \ No newline at end of file diff --git a/examples/downstream/paper_seg/utils.py b/examples/downstream/paper_seg/utils.py new file mode 100644 index 00000000..000466d5 --- /dev/null +++ b/examples/downstream/paper_seg/utils.py @@ -0,0 +1,39 @@ + + +import os +import logging +from datetime import datetime + +ROOT_DIR = os.path.dirname(os.path.dirname(__file__)) + +def get_logger(logfile): + os.makedirs(os.path.dirname(logfile), exist_ok=True) + + logger = 
logging.getLogger(name="PaperSeg")
+    logger.setLevel(logging.INFO)
+
+    handler = logging.FileHandler(filename=logfile, encoding="utf-8", mode="w")
+    handler.setLevel(logging.INFO)
+    formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+    handler.setFormatter(formatter)
+
+    consolehandler = logging.StreamHandler()
+    consolehandler.setFormatter(formatter)
+
+    logger.addHandler(handler)
+    logger.addHandler(consolehandler)  # log to file and print to console
+    return logger
+
+
+def get_pk(y_pred, y, k):
+    # Returns the unnormalized Pk statistic: the number of sliding windows of
+    # size k in which the predicted and gold boundary counts disagree.
+    tag_num = len(y)
+    count = 0
+    for i in range(0, tag_num - k):
+        seg_count_y_pred = 0
+        seg_count_y = 0
+        for j in range(i, i + k):
+            seg_count_y_pred += y_pred[j]
+            seg_count_y += y[j]
+        if seg_count_y_pred != seg_count_y:
+            count += 1
+    return count

From f2ba01b7dea7ba7f25c6348bd9b0d4f9aff81b4e Mon Sep 17 00:00:00 2001
From: wintermelon008 <1904346407@qq.com>
Date: Sun, 2 Jul 2023 22:09:17 +0800
Subject: [PATCH 31/58] Fix bugs for quesnet figure loading

---
 EduNLP/Pretrain/quesnet_vec.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py
index 3e10b5e2..c46def72 100644
--- a/EduNLP/Pretrain/quesnet_vec.py
+++ b/EduNLP/Pretrain/quesnet_vec.py
@@ -159,7 +159,14 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x,
         if isinstance(w, FigureSegment):
             # image
             try:
-                im = Image.open(os.path.join(self.img_dir, f'{w.src[10:-1]}.png'))
+                if self.img_dir != '':
+                    # Warning: the caller must make sure the figure file exists!
+                    im = Image.open(os.path.join(self.img_dir, f'{w.src[10:-1]}.png'))
+                else:
+                    fig_id = f"{w.src[10:-1]}"
+                    fig_index = item['ques_figure_ids'].index(fig_id)
+                    fig_src = item['ques_figure_paths'][fig_index]
+                    im = Image.open(fig_src)
                 im = im.resize((56, 56))
                 token_idx.append(to_grayscale(im))
             except Exception:
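The fallback above lets a question carry its own figure files instead of relying on a shared img_dir. A minimal sketch of the item layout it expects (the figure id and path below are illustrative, not from the repository, and it assumes a vocabulary has already been set on the tokenizer):

    from EduNLP.Pretrain import QuesNetTokenizer

    tokenizer = QuesNetTokenizer(meta=['know_name'], img_dir='')  # empty img_dir triggers the fallback
    item = {
        "ques_content": r"如图,求阴影部分的面积$\FigureID{fig-001}$",
        "ques_figure_ids": ["fig-001"],
        "ques_figure_paths": ["./figures/fig-001.png"],  # looked up via ques_figure_ids.index(fig_id)
    }
    tokenizer.set_vocab([item], key=lambda x: x["ques_content"], silent=True)
    token_item = tokenizer(item, key=lambda x: x["ques_content"])
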
From b9604e4278399524480af45a2af207943b21338a Mon Sep 17 00:00:00 2001
From: wintermelon008 <1904346407@qq.com>
Date: Sun, 2 Jul 2023 22:10:40 +0800
Subject: [PATCH 32/58] Add quesnet_new

---
 EduNLP/Pretrain/quesnet_vec_new.py | 761 +++++++++++++++++++++++++++++
 1 file changed, 761 insertions(+)
 create mode 100644 EduNLP/Pretrain/quesnet_vec_new.py

diff --git a/EduNLP/Pretrain/quesnet_vec_new.py b/EduNLP/Pretrain/quesnet_vec_new.py
new file mode 100644
index 00000000..78082fea
--- /dev/null
+++ b/EduNLP/Pretrain/quesnet_vec_new.py
@@ -0,0 +1,761 @@
+# !!! This module is being refactored.
+
+from .pretrian_utils import PretrainedEduTokenizer
+from ..SIF.segment.segment import FigureSegment
+from ..ModelZoo.quesnet import QuesNetForPreTraining, AE
+from EduNLP import logger
+
+from torch.utils.data import DataLoader, Dataset
+from torchvision.transforms.functional import to_grayscale
+from torchvision.transforms.functional import to_tensor
+from gensim.models import Word2Vec
+import torch
+
+import warnings
+import queue
+import random
+import math
+import threading
+import logging
+import signal
+import os
+import json
+import linecache
+import numpy as np
+from PIL import Image
+from typing import List, Union, Optional
+from collections import namedtuple
+from functools import partial
+from tqdm import tqdm
+
+
+def save_list(item2index, path):
+    item2index = sorted(item2index.items(), key=lambda kv: kv[1])
+    items = [item for item, _ in item2index]
+    with open(path, "wt", encoding="utf-8") as file:
+        file.write('\n'.join(items))
+    return
+
+
+def clip(v, low, high):
+    # clamp v to the range [low, high]
+    return max(low, min(v, high))
+
+# the basic unit of the dataset: one question record
+Question = namedtuple('Question',
+                      ['id', 'content', 'answer', 'false_options', 'labels'])
+
+
+class QuesNetTokenizer(PretrainedEduTokenizer):
+    """
+    Examples
+    --------
+    >>> tokenizer = QuesNetTokenizer(meta=['knowledge'])
+    >>> test_items = [{"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B, $\\FigureID{test_id}$",
+    ...     "knowledge": "['*', '-', '/']"}, {"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B",
+    ...     "knowledge": "['*', '-', '/']"}]
+    >>> tokenizer.set_vocab(test_items,
+    ...     trim_min_count=1, key=lambda x: x["ques_content"], silent=True)
+    >>> tokenizer.set_meta_vocab(test_items, silent=True)
+    >>> token_items = [tokenizer(i, key=lambda x: x["ques_content"]) for i in test_items]
+    >>> print(token_items[0].keys())
+    dict_keys(['seq_idx', 'meta_idx'])
+    >>> token_items = tokenizer(test_items, key=lambda x: x["ques_content"])
+    >>> print(len(token_items["seq_idx"]))
+    2
+    """
+
+    def __init__(self, vocab_path=None, meta_vocab_dir=None, img_dir: str = None,
+                 max_length=250, tokenize_method="custom", symbol="mas", add_specials: list = None,
+                 meta: List[str] = None, img_token='<img>', unk_token="<unk>", pad_token="<pad>", **kwargs):
+        """
+        Parameters
+        ----------
+        img_dir : str, optional
+            path of images, by default None
+        vocab_path : str, optional
+            path of vocab file, by default None
+        max_length : int, optional
+            by default 250
+        meta : list, optional
+            the names of meta (side information), by default ['know_name']
+        img_token : str, optional
+            by default '<img>'
+        unk_token : str, optional
+            by default "<unk>"
+        pad_token : str, optional
+            by default "<pad>"
+        """
+        if add_specials is None:
+            add_specials = [img_token]
+        else:
+            add_specials = [img_token] + add_specials
+        self.tokenization_params = {
+            "formula_params": {
+                "method": "linear",
+                "skip_figure_formula": True
+            }
+        }
+        kwargs.update(self.tokenization_params)
+        super().__init__(vocab_path=vocab_path, max_length=max_length, tokenize_method=tokenize_method,
+                         add_specials=add_specials, unk_token=unk_token, pad_token=pad_token, symbol=symbol, **kwargs)
+        if meta is None:
+            meta = ['know_name']
+        self.img_dir = img_dir
+        self.img_token = img_token
+        self.unk_token = unk_token
+        self.pad_token = pad_token
+        self.meta = meta
+        self.meta_vocab_dir = meta_vocab_dir
+
+        self.stoi = dict()
+        self.itos = dict()
+        self.stoi['word'] = self.vocab.token_to_idx
+        self.itos['word'] = self.vocab.idx_to_token
+        if meta_vocab_dir is not None:
+            
self.load_meta_vocab(meta_vocab_dir=meta_vocab_dir) + self.config = { + k: v for k, v in locals().items() if k not in [ + "self", "__class__", "vocab_path", 'img_dir', 'meta_vocab_dir'] + } + + def __call__(self, item: Union[str, dict, list], key=lambda x: x, + meta: Optional[list] = None, + padding=False, return_text=False, *args, **kwargs): + """ + item: str or dict + the question item + key: function + determine how to get the text of each item + padding: bool + whether to pad the seq_idx + return_text: bool + whether to return text tokens + """ + if isinstance(item, list): + ret = { + "seq_idx": [], + "meta_idx": [] + } + if return_text: + ret["seq_token"] = [] + ret["meta"] = [] + for i in item: + r = self._convert_to_ids(i, key, meta, padding, return_text, *args, **kwargs) + ret["seq_idx"].append(r["seq_idx"]) + ret["meta_idx"].append(r["meta_idx"]) + if return_text: + ret["seq_token"].append(r["seq_token"]) + ret["meta"].append(r["meta"]) + else: + ret = self._convert_to_ids(item, key, meta, padding, return_text, *args, **kwargs) + + return ret + + def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, + meta: Optional[list] = None, + padding=False, return_text=False, *args, **kwargs): + token_item = self.tokenize(item, key) + token_idx = [] + for _, w in enumerate(token_item): + if isinstance(w, FigureSegment): + # image + try: + fig_id = f"{w.src[10:-1]}" + fig_index = item['ques_figure_ids'].index(fig_id) + fig_src = item['ques_figure_paths'][fig_index] + im = Image.open(fig_src) + im = im.resize((56, 56)) + token_idx.append(to_grayscale(im)) + except Exception: + warnings.warn('Open image error!') + token_idx.append(self.stoi['word'][self.img_token]) + else: + # word + token_idx.append(self.stoi['word'].get(w) or self.stoi['word'][self.unk_token]) + + meta_idxs = {} + meta_items = {} + if meta is None: + meta = self.meta + for m in meta: + meta_idx = [] + if isinstance(item, dict) and m in item: + meta_item = item[m] + if isinstance(meta_item, str): + if meta_item.startswith('['): + # a list of labels ['+', '-', '/'] + meta_item = eval(meta_item) + elif isinstance(meta_item, list): + pass + else: + raise Exception("Side information must be a list!") + meta_items[m] = meta_item + for k in meta_item: + meta_idx.append(self.stoi[m].get(k) or self.stoi[m][self.unk_token]) + meta_idxs[m] = meta_idx + ret = { + "seq_idx": self.padding(token_idx, self.max_length) if padding else token_idx, + "meta_idx": meta_idxs + } + + if return_text: + ret["seq_token"] = token_item + ret["meta"] = meta_items + return ret + + def load_meta_vocab(self, meta_vocab_dir): + for m in self.meta: + try: + with open(os.path.join(meta_vocab_dir, f'meta_{m}.txt'), "rt", encoding="utf-8") as f: + meta = f.read().strip().split('\n') + self.stoi[m] = {word: index for index, word in enumerate(meta)} + self.itos[m] = {i: s for s, i in self.stoi[m].items()} + except Exception: + self.stoi[m] = None + self.itos[m] = None + + def set_meta_vocab(self, items: list, meta: List[str] = None, silent=True): + self.meta = meta if meta is not None else self.meta + for m in self.meta: + meta = set() + if m in items[0]: + for item in items: + meta_item = item[m] + if isinstance(meta_item, str): + if meta_item.startswith('['): + # a list of labels ['+', '-', '/'] + meta_item = eval(meta_item) + elif isinstance(meta_item, list): + pass + else: + raise Exception("Side information must be a list!") + meta = meta | set(meta_item) + meta = [self.unk_token] + sorted(meta) + if not silent: + print(f"save meta 
information {m}: {len(meta)}")
+            self.stoi[m] = {word: index for index, word in enumerate(meta)}
+            self.itos[m] = {i: s for s, i in self.stoi[m].items()}
+
+    def set_vocab(self, items: list, key=lambda x: x, lower: bool = False,
+                  trim_min_count: int = 1, do_tokenize: bool = True, silent=True):
+        """
+        Parameters
+        -----------
+        items: list
+            can be the list of str, or list of dict
+        key: function
+            determine how to get the text of each item
+        trim_min_count : int, optional
+            the lower bound number for adding a word into vocabulary, by default 1
+        silent : bool, optional
+            whether to suppress progress output
+        """
+        token_items = self.tokenize(items, key) if do_tokenize else [key(item) for item in items]
+        self.vocab.set_vocab(corpus_items=token_items, trim_min_count=trim_min_count, lower=lower, silent=silent)
+        self.stoi['word'] = self.vocab.token_to_idx
+        self.itos['word'] = self.vocab.idx_to_token
+
+    @classmethod
+    def from_pretrained(cls, tokenizer_config_dir, img_dir=None, **kwargs):
+        """
+        Parameters:
+        -----------
+        tokenizer_config_dir: str
+            must contain tokenizer_config.json and vocab.txt and meta_{meta_name}.txt
+        img_dir: str
+            default None
+            the path of image directory
+        """
+        tokenizer_config_path = os.path.join(tokenizer_config_dir, "tokenizer_config.json")
+        pretrained_vocab_path = os.path.join(tokenizer_config_dir, "vocab.txt")
+        with open(tokenizer_config_path, "r", encoding="utf-8") as rf:
+            tokenizer_config = json.load(rf)
+        tokenizer_config.update(kwargs)
+        return cls(
+            vocab_path=pretrained_vocab_path,
+            meta_vocab_dir=tokenizer_config_dir,
+            img_dir=img_dir,
+            max_length=tokenizer_config["max_length"],
+            tokenize_method=tokenizer_config["tokenize_method"],
+            meta=tokenizer_config["meta"],
+            img_token=tokenizer_config["img_token"],
+            unk_token=tokenizer_config["unk_token"],
+            pad_token=tokenizer_config["pad_token"])
+
+    def save_pretrained(self, tokenizer_config_dir):
+        """Save tokenizer into local files
+
+        Parameters:
+        -----------
+        tokenizer_config_dir: str
+            save tokenizer params in `/tokenizer_config.json` and save words in `vocab.txt`
+            and save metas in `meta_{meta_name}.txt`
+        """
+        if not os.path.exists(tokenizer_config_dir):
+            os.makedirs(tokenizer_config_dir, exist_ok=True)
+        tokenizer_config_path = os.path.join(tokenizer_config_dir, "tokenizer_config.json")
+        self.vocab.save_vocab(os.path.join(tokenizer_config_dir, "vocab.txt"))
+        for m in self.meta:
+            save_list(self.stoi[m], os.path.join(tokenizer_config_dir, f'meta_{m}.txt'))
+        with open(tokenizer_config_path, "w", encoding="utf-8") as wf:
+            json.dump(self.config, wf, ensure_ascii=False, indent=2)
+
+    def padding(self, idx, max_length, type='word'):
+        padding_idx = idx + [self.stoi[type][self.pad_token]] * (max_length - len(idx))
+        return padding_idx
+
+    def set_img_dir(self, path):
+        self.img_dir = path
+
+
+
+class Lines:
+    '''
+    A raw line reader over the question file. It cannot be dropped yet: the raw
+    lines are first fed to the tokenizer to build its vocabulary, and the same
+    file is then run through the tokenizer again to build the Dataset, which is
+    admittedly awkward.
+    '''
+    def __init__(self, filename, skip=0, preserve_newline=False):
+        self.filename = filename
+        with open(filename, "r", encoding="utf-8") as f:
+            self.length = len(f.readlines()) - skip
+        assert self.length > 0, f'{filename} is empty. Or file length is less than skip length.'
+        self.skip = skip
+        self.preserve_newline = preserve_newline
+
+    def __len__(self):
+        return self.length
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]
+
+    def __getitem__(self, item):
+        d = self.skip + 1
+        if isinstance(item, int):
+            if item < self.length:
+                line = linecache.getline(self.filename,
+                                         item % len(self) + d)
+                if self.preserve_newline:
+                    return json.loads(line)
+                else:
+                    return json.loads(line.strip('\r\n'))
+
+        elif isinstance(item, slice):
+            low = 0 if item.start is None else item.start
+            low = clip(low, -len(self), len(self) - 1)
+            if low < 0:
+                low += len(self)
+            high = len(self) if item.stop is None else item.stop
+            high = clip(high, -len(self), len(self))
+            if high < 0:
+                high += len(self)
+            ls = []
+            for i in range(low, high):
+                line = linecache.getline(self.filename, i + d)
+                if not self.preserve_newline:
+                    line = line.strip('\r\n')
+                ls.append(json.loads(line))
+
+            return ls
+
+        raise IndexError('index must be int or slice')
+
+
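+# A minimal usage sketch for Lines (the file name below is illustrative; the
+# reader expects one JSON object per line):
+#
+#     lines = Lines('data/questions.jsonl', skip=1)  # skip a header line
+#     first = lines[0]       # dict parsed from the first kept line
+#     batch = lines[0:8]     # slicing returns a list of dicts
+#     for q in lines:        # iteration yields every record in order
+#         pass
+
+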
+class QuesnetDataset(Dataset):
+    '''
+    A refactor of the old QuestionLoader that drops some rarely used interfaces.
+    TODO: the logic here still needs a rewrite; as it stands it differs little
+    from the original, and Lines should eventually be merged into this class.
+    '''
+    def __init__(self, questions: Lines,
+                 tokenizer: QuesNetTokenizer,
+                 content_key=lambda x: x['ques_content'],
+                 meta_key=lambda x: x['know_name'],
+                 answer_key=lambda x: x['ques_answer'],
+                 option_key=lambda x: x['ques_options'],
+                 pipeline = None,
+                 skip=0):
+        self.questions = questions
+        self.skip = skip
+        self.tokenizer = tokenizer
+        self.content_key = content_key
+        self.meta_key = meta_key
+        self.answer_key = answer_key
+        self.option_key = option_key
+        self.pipeline = pipeline
+
+    def __len__(self):
+        return len(self.questions)
+
+    def __getitem__(self, index):
+        if isinstance(index, int):
+            line = self.questions[index]
+
+            # post-process the raw line into model inputs
+            qid = line['ques_id']
+            token = self.tokenizer(line, key=self.content_key, meta=self.meta_key)
+            content = token['seq_idx']
+            meta = token['meta_idx']
+            answer_text = self.answer_key(line)
+            if answer_text.isalpha() and len(answer_text) == 1 \
+                    and ord(answer_text) < 128 and len(self.option_key(line)) > 0:
+                answer_idx = ord(answer_text.upper()) - ord('A')
+                options = self.option_key(line)
+                answer = self.tokenizer(options.pop(answer_idx), meta=self.meta_key)['seq_idx']
+                false_options = [(self.tokenizer(option, meta=self.meta_key))['seq_idx'] for option in options]
+            else:
+                answer = (self.tokenizer(answer_text, meta=self.meta_key))['seq_idx']
+                false_options = [[0], [0], [0]]
+
+            qs = Question(id=qid, content=content, answer=answer,
+                          false_options=false_options, labels=meta)
+            if callable(self.pipeline):
+                qs = self.pipeline(qs)
+
+            return qs
+
+        elif isinstance(index, slice):
+            results = []
+            for i in range(*index.indices(len(self))):
+                results.append(self[i])
+            return results
+
+        else:
+            raise TypeError('Invalid argument type. 
Index type should be int or slice.') + + + + +class QuestionLoader(DataLoader): + def __init__(self, question, batch_size, shuffle=False, num_workers=0, pin_memory=False): + super(QuestionLoader, self).__init__( + dataset=question, + batch_size=batch_size, + shuffle=shuffle, + num_workers=num_workers, + pin_memory=pin_memory + ) + + + +class EmbeddingDataset(Dataset): + def __init__(self, data, data_type='image'): + self.data = data + self.data_type = data_type + assert self.data_type in ['image', 'meta'] + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + if self.data_type == 'image': + return to_tensor(self.data[idx]) + elif self.data_type == 'meta': + return self.data[idx] + + +class PrefetchIter: + """Iterator on data and labels, with states for save and restore.""" + # 这里还没有重构完 + + def __init__(self, data, *label, length=None, batch_size=1, shuffle=True): + self.data = data + self.label = label + self.batch_size = batch_size + self.queue = queue.Queue(maxsize=8) + self.length = length if length is not None else len(data) + + assert all(self.length == len(lab) for lab in label), \ + 'data and label must have same lengths' + + self.index = list(range(len(self))) + if shuffle: + random.shuffle(self.index) + self.thread = None + self.pos = 0 + + def __len__(self): + return math.ceil(self.length / self.batch_size) + + def __iter__(self): + return self + + def __next__(self): + if self.thread is None: + self.thread = threading.Thread(target=self.produce, daemon=True) + self.thread.start() + + if self.pos >= len(self.index): + raise StopIteration + + item = self.queue.get() + if isinstance(item, Exception): + raise item + else: + self.pos += 1 + return item + + def produce(self): + for i in range(self.pos, len(self.index)): + try: + index = self.index[i] + + bs = self.batch_size + + if callable(self.data): + data_batch = self.data(index * bs, (index + 1) * bs) + else: + data_batch = self.data[index * bs:(index + 1) * bs] + + label_batch = [label[index * bs:(index + 1) * bs] + for label in self.label] + if label_batch: + self.queue.put([data_batch] + label_batch) + else: + self.queue.put(data_batch) + except Exception as e: + self.queue.put(e) + return + + +sigint_handler = signal.getsignal(signal.SIGINT) + +def critical(f): + it = iter(f) + signal_received = () + + def handler(sig, frame): + nonlocal signal_received + signal_received = (sig, frame) + logging.debug('SIGINT received. 
Delaying KeyboardInterrupt.') + + while True: + try: + signal.signal(signal.SIGINT, handler) + yield next(it) + signal.signal(signal.SIGINT, sigint_handler) + if signal_received: + sigint_handler(*signal_received) + except StopIteration: + break + + +def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3, log_step: int = 1, epochs: int = 3, + batch_size: int = 4, device=torch.device('cpu')): + train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) + optim = torch.optim.Adam(ae.parameters(), lr=lr) + ae.train() + ae.to(device) + train_type = dataset.data_type + for i in range(epochs): + for batch, item in tqdm(enumerate(train_dataloader)): + loss = ae.loss(item.to(device)) + optim.zero_grad() + loss.backward() + optim.step() + if batch % log_step == 0: + logger.info(f"[Epoch{i}][Batch{batch}]Training {train_type} Embedding layer, loss:{loss}") + return ae + +def optimizer(*models, **kwargs): + _cur_optim = [m.optim_cls(m.parameters(), **kwargs) + if hasattr(m, 'optim_cls') + else torch.optim.Adam(m.parameters(), **kwargs) + for m in models] + if len(_cur_optim) == 1: + return _cur_optim[0] + else: + return _cur_optim + + +def train_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): + """ pretrain quesnet + + Parameters + ---------- + path : str + path of question file + output_dir : str + output path· + tokenizer : QuesNetTokenizer + quesnet tokenizer + save_embs : bool, optional + whether to save pretrained word/image/meta embeddings seperately + train_params : dict, optional + the training parameters and model parameters, by default None + - "n_epochs": int, default = 1 + train param, number of epochs + - "batch_size": int, default = 6 + train param, batch size + - "lr": float, default = 1e-3 + train param, learning rate + - "save_every": int, default = 0 + train param, save steps interval + - "log_steps": int, default = 10 + train param, log steps interval + - "device": str, default = 'cpu' + train param, 'cpu' or 'cuda' + - "max_steps": int, default = 0 + train param, stop training when reach max steps + - "emb_size": int, default = 256 + model param, the embedding size of word, figure, meta info + - "feat_size": int, default = 256 + model param, the size of question infer vector + + Examples + ---------- + >>> tokenizer = QuesNetTokenizer(meta=['know_name']) + >>> items = [{"ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$,$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$", + ... "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", + ... "know_name": "['代数', '集合', '集合的相等']" + ... 
}]
+    >>> tokenizer.set_vocab(items, key=lambda x: x['ques_content'], trim_min_count=1, silent=True)
+    >>> train_quesnet('./data/standard_luna_data.json', './testQuesNet') # doctest: +SKIP
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    default_train_params = {
+        # train params
+        "n_epochs": 1,
+        "batch_size": 8,
+        "lr": 1e-3,
+        'save_every': 0,
+        'log_steps': 10,
+        'device': 'cpu',
+        'max_steps': 0,
+        # model params
+        'emb_size': 256,
+        'feat_size': 256,
+    }
+    if train_params is not None:
+        default_train_params.update(train_params)
+    train_params = default_train_params
+    # parameter setup is complete; only read the device after the defaults are merged
+    device = torch.device(train_params['device'])
+
+    tokenizer = QuesNetTokenizer(meta=['know_name'], img_dir=img_dir)
+    lines = Lines(path)
+    tokenizer.set_vocab(lines, key=lambda x: x['ques_content'],
+                        trim_min_count=2, silent=False)
+    tokenizer.set_meta_vocab(lines, silent=False)
+    tokenizer.save_pretrained(output_dir)
+    data = QuesnetDataset(lines, tokenizer)
+    model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'],
+                                  emb_size=train_params['emb_size']).to(device)
+
+    # build the word/image/meta corpora from the dataset
+    emb_dict = tokenizer.stoi['word']
+    emb_dict_rev = tokenizer.itos['word']
+    emb_size = train_params['emb_size']
+    meta_size = model.quesnet.meta_size
+    w2v_corpus = []
+    img_corpus = []
+    meta_corpus = []
+    for _, qs in enumerate(tqdm(data)):
+        text_content = []
+        for c in qs.content:
+            if isinstance(c, int):
+                text_content.append(emb_dict_rev[c])
+            else:
+                img_corpus.append(c)
+        for a in qs.answer:
+            if isinstance(a, int):
+                text_content.append(emb_dict_rev[a])
+            else:
+                img_corpus.append(a)
+        w2v_corpus.append(text_content)
+        meta_vector = torch.zeros(meta_size, dtype=torch.float)
+        for m in qs.labels[model.quesnet.meta]:
+            meta_vector.add_(torch.nn.functional.one_hot(torch.tensor(m, dtype=torch.int64),
+                             model.quesnet.meta_size).to(torch.float))
+        meta_corpus.append(meta_vector)
+
+
+    # train word2vec for text embedding
+    if pretrain_dir is not None:
+        model.quesnet.load_emb(np.load(os.path.join(pretrain_dir, 'w2v_embs.npy')))
+    else:
+        gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1,
+                              vector_size=emb_size)
+        gensim_w2v.init_weights()
+        gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs'])
+        w2v_emb = gensim_w2v.syn1neg
+        emb_weights = []
+        for key, item in emb_dict.items():
+            w2v_index = gensim_w2v.wv.key_to_index[key]
+            emb_weights.append(w2v_emb[w2v_index])
+        emb_weights = np.array(emb_weights)
+        model.quesnet.load_emb(emb_weights)
+        if save_embs:
+            np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights)
+    logger.info('quesnet Word Embedding loaded')
+
+
+    # reuse the original embedding interfaces here for now
+    # train auto-encoder loss for image embedding
+    if pretrain_dir is not None:
+        model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt')))
+    else:
+        img_dataset = EmbeddingDataset(data=img_corpus, data_type='image')
+        trained_ie = pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'],
+                                              log_step=train_params['log_steps'], batch_size=train_params['batch_size'],
+                                              epochs=train_params['n_epochs'], device=device)
+        if save_embs:
+            torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt'))
+        model.quesnet.load_img(trained_ie)
+    logger.info('quesnet Image Embedding loaded')
+
+
+    # train auto-encoder loss for meta embedding
+    if pretrain_dir is not None:
+        model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt')))
+    else:
+        meta_dataset = EmbeddingDataset(data=meta_corpus, data_type='meta')
+        trained_me = pretrain_embedding_layer(dataset=meta_dataset, ae=model.quesnet.me, lr=train_params['lr'],
+                                              log_step=train_params['log_steps'], batch_size=train_params['batch_size'],
+                                              epochs=train_params['n_epochs'], device=device)
+        if save_embs:
+            torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt'))
+        model.quesnet.load_meta(trained_me)
+    logger.info('quesnet Meta Embedding loaded')
+
+    logger.info("quesnet Word, Image and Meta Embeddings training is done")
+    # DONE for datasets
+
+    # TODO: migrate the QuesNet interfaces over here
+    # the code below has not been fully tested
+
+    # HLM and DOO training
+    data.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True)
+    model.train()
+    optim = optimizer(model, lr=train_params['lr'])
+    n_batches = 0
+    for epoch in range(0, train_params['n_epochs']):
+        train_iter = PrefetchIter(data, batch_size=train_params['batch_size'])
+        bar = enumerate(tqdm(train_iter, initial=train_iter.pos),
+                        train_iter.pos)
+        for i, batch in critical(bar):
+            n_batches += 1
+            loss = model(batch).loss
+            optim.zero_grad()
+            loss.backward()
+            optim.step()
+
+            if train_params['save_every'] > 0 and i % train_params['save_every'] == 0:
+                # model.save(os.path.join(output_dir, f'QuesNet_{epoch}.{i}'))
+                model.save_pretrained(output_dir)
+
+            if train_params['log_steps'] > 0 and i % train_params['log_steps'] == 0:
+                logger.info(f"{epoch}.{i}---loss: {loss.item()}")
+
+            if train_params['max_steps'] > 0 and n_batches % train_params['max_steps'] == 0:
+                break
+
+    # model.save(os.path.join(output_dir, f'QuesNet_{epoch}'))
+
+    model.save_pretrained(output_dir)
+    tokenizer.save_pretrained(output_dir)
+
+
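With quesnet_vec_new.py in place, the tokenizer, vocabulary and embedding layers are all built inside train_quesnet, so a pretraining run reduces to a single call. A minimal invocation sketch (the paths are placeholders; the train_params keys shown are the ones defined in default_train_params above):

    from EduNLP.Pretrain.quesnet_vec_new import train_quesnet

    train_quesnet(
        path='./data/standard_luna_data.json',   # one JSON question per line
        output_dir='./output/quesnet_new',
        img_dir=None,                            # figures resolved from per-item ques_figure_paths
        save_embs=True,                          # also dump the word/image/meta embeddings
        train_params={'n_epochs': 1, 'batch_size': 8, 'device': 'cpu'},
    )
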
From dda10377e8000c9eb3eae65fb2d34582ce9187cd Mon Sep 17 00:00:00 2001
From: KenelmQLH <1097824882@qq.com>
Date: Sun, 2 Jul 2023 15:14:47 +0000
Subject: [PATCH 33/58] [Doc] Add knowledge_prediction

---
 examples/downstream/knowledge/konw_pred.ipynb | 215 ++++++++++++++++++
 examples/downstream/knowledge/utils.py        | 131 +++++++++++
 .../paper_seg/{data.py => load_data.py}       |   0
 examples/downstream/paper_seg/paper_seg.ipynb |  15 +-
 .../paper_seg/samples/train/math/paper_1.txt  |  82 +++++++
 5 files changed, 436 insertions(+), 7 deletions(-)
 create mode 100644 examples/downstream/knowledge/konw_pred.ipynb
 create mode 100644 examples/downstream/knowledge/utils.py
 rename examples/downstream/paper_seg/{data.py => load_data.py} (100%)
 create mode 100644 examples/downstream/paper_seg/samples/train/math/paper_1.txt

diff --git a/examples/downstream/knowledge/konw_pred.ipynb b/examples/downstream/knowledge/konw_pred.ipynb
new file mode 100644
index 00000000..b79bec6f
--- /dev/null
+++ b/examples/downstream/knowledge/konw_pred.ipynb
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 层级知识点预测"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import yaml\n",
+    "import tqdm\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from EduNLP.Pretrain import BertTokenizer\n",
+    "from EduNLP.ModelZoo.bert import BertForKnowledgePrediction\n",
+    "from EduNLP.Pretrain import finetune_bert_for_knowledge_prediction\n",
+    "from EduNLP.ModelZoo import load_items\n",
+    "\n",
+    "from utils import compute_perfs_per_layer, get_onehot_label_topk, metric, compute_perfs"
+   ]
+  },
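+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "下面先给出一条训练数据的示例(仅为格式示意,字段取值为虚构,需与下文 `data_params` 中的 `stem_key`/`label_key` 对应):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A sketch of one record in train.jsonl; the values are made up for illustration.\n",
+    "sample_item = {\n",
+    "    'ques_content': '已知函数 $f(x)=x^{2}$,求 $f(2)$ 的值',\n",
+    "    'know_list': [3, 12, 245]  # one label id per layer of the knowledge hierarchy\n",
+    "}"
+   ]
+  },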
load_items(\"/path/to/data/train.jsonl\")\n",
+    "test_data = load_items(\"/path/to/data/test.jsonl\")\n",
+    "\n",
+    "pretrained_model_dir = \"/path/to/bert/checkpoint\"\n",
+    "checkpoint_dir = \"/path/to/knowledge_model/checkpoint\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 训练"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 以bert为例\n",
+    "data_params = {\n",
+    "    \"stem_key\": \"ques_content\",\n",
+    "    \"label_key\": \"know_list\"\n",
+    "}\n",
+    "train_params = {\n",
+    "    \"num_train_epochs\": 1,\n",
+    "    \"per_device_train_batch_size\": 2,\n",
+    "    \"per_device_eval_batch_size\": 2,\n",
+    "    \"no_cuda\": True,\n",
+    "}\n",
+    "model_params = {\n",
+    "    \"num_classes_list\": [10, 27, 963],\n",
+    "    \"num_total_classes\": 1000,\n",
+    "}\n",
+    " \n",
+    "\n",
+    "\"\"\"\n",
+    "数据格式:\n",
+    "{\n",
+    "    'ques_content': 'question...',\n",
+    "    'know_list': [lay_1_id, lay_2_id, lay_3_id]\n",
+    "}\n",
+    "\"\"\"\n",
+    "\n",
+    "# train without eval_items\n",
+    "finetune_bert_for_knowledge_prediction(\n",
+    "    train_data,\n",
+    "    checkpoint_dir,\n",
+    "    pretrained_model=pretrained_model_dir,\n",
+    "    train_params=train_params,\n",
+    "    data_params=data_params,\n",
+    "    model_params=model_params\n",
+    ")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 加载模型和评估数据"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class EvalDataset(torch.utils.data.Dataset):\n",
+    "    def __init__(self, data) -> None:\n",
+    "        self.data = data\n",
+    "        self.num_classes = model_params['num_total_classes']\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        text, labels = self.data[idx][\"ques_content\"], self.data[idx][\"know_list\"]\n",
+    "        token_dicts = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')\n",
+    "        for k, v in token_dicts.items():\n",
+    "            token_dicts[k] = torch.squeeze(v, dim=0)\n",
+    "        one_hot_labels = [1. if idx in labels else 0. 
for idx in range(self.num_classes)]\n",
+    "        return token_dicts, torch.FloatTensor(one_hot_labels)\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "\n",
+    "test_dataset = EvalDataset(test_data)\n",
+    "eval_dataloader = torch.utils.data.DataLoader(\n",
+    "    test_dataset,\n",
+    "    batch_size=1,\n",
+    "    shuffle=False,\n",
+    "    num_workers=4,\n",
+    ")\n",
+    "\n",
+    "model = BertForKnowledgePrediction.from_pretrained(checkpoint_dir)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 评估"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if not train_params[\"no_cuda\"] else \"cpu\"\n",
+    "\n",
+    "# 层级知识标签-配置信息\n",
+    "levels = len(model_params[\"num_classes_list\"])\n",
+    "classes_offset_list = [0, 10, 37]\n",
+    "classes_border_list = [[0, 9], [10, 36], [37, 1000]] # 层级id边界\n",
+    "hierarchy_dict = {} # child_know_id_to_parent_know_id\n",
+    "\n",
+    "# 评估top_k结果\n",
+    "top_k_list=[10, 20, 30]\n",
+    "\n",
+    "model.eval()\n",
+    "perfs_per_layer = [np.array([0 for _ in range(4)], dtype=np.int32) for _ in range(levels)]\n",
+    "total_perfs = np.array([0 for _ in range(4)], dtype=np.int32)\n",
+    "\n",
+    "k_total_perfs_list = [np.array([0 for _ in range(4)], dtype=np.int32) for _ in range(len(top_k_list))]\n",
+    "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n",
+    "    input_data, eval_batch_labels = eval_batch\n",
+    "    input_data = {k: v.to(device) for k, v in input_data.items()}\n",
+    "    _, output_logits = model(**input_data)\n",
+    "\n",
+    "    local_perfs_per_layer, local_perfs = compute_perfs_per_layer(\n",
+    "        output_logits.cpu().detach().numpy(),\n",
+    "        eval_batch_labels.cpu().detach().numpy(),\n",
+    "        hierarchy_dict,\n",
+    "        classes_border_list,\n",
+    "        keep_consistency=True\n",
+    "    )\n",
+    "    perfs_per_layer = [perfs_per_layer[idx] + local_perfs_per_layer[idx] for idx in range(levels)]\n",
+    "    total_perfs += local_perfs\n",
+    "    \n",
+    "    # for recall@k\n",
+    "    for i_k, k in enumerate(top_k_list):\n",
+    "        pred_topk = get_onehot_label_topk(\n",
+    "            classes_border_list, classes_offset_list, scores_list=output_logits.cpu().detach().numpy(), top_num=k)\n",
+    "        flat_pred_topk = np.array([x[3] for x in pred_topk])\n",
+    "        k_total_perfs = compute_perfs(flat_pred_topk, eval_batch_labels.cpu().detach().numpy().tolist())\n",
+    "        k_total_perfs_list[i_k] += k_total_perfs\n",
+    "\n",
+    "micro_precision, micro_recall, micro_f1, total_acc = metric(*total_perfs)\n",
+    "\n",
+    "# metrics for per layer top_k\n",
+    "for i_k, k_total_perfs in enumerate(k_total_perfs_list):\n",
+    "    k = top_k_list[i_k]\n",
+    "    precision, recall, f1, acc = metric(*k_total_perfs)\n",
+    "    print(f\"TOPK={k}: Precision@{k}: {precision:.4f}, Recall@{k}: {recall:.4f}, F1@{k}: {f1:.4f}, Acc@{k}: {acc:.4f}\")\n",
+    "\n",
+    "# metrics for per layer\n",
+    "for layer_idx, perfs in enumerate(perfs_per_layer):\n",
+    "    precision, recall, f1, acc = metric(*perfs)\n",
+    "    print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")\n",
+    "print(f\"Eval Results: Micro-Precision: {micro_precision:.4f}, \"\n",
+    "    + f\"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}\")\n",
+    "\n",
+    "eval(eval_dataloader, )"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/examples/downstream/knowledge/utils.py b/examples/downstream/knowledge/utils.py
new file mode 100644
index 00000000..581d4513
--- /dev/null
+++ b/examples/downstream/knowledge/utils.py
@@ -0,0 +1,131 @@
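+"""Metric helpers for the hierarchical knowledge-prediction demo (konw_pred.ipynb).
+
+The helpers below turn raw class scores into per-layer top-k one-hot
+predictions and reduce (TP, FP, FN, TN) counts to precision, recall,
+micro-F1 and accuracy.
+"""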
+import numpy as np
+import torch
+import heapq
+from EduNLP.Pretrain import BertTokenizer
+
+
+def get_onehot_label_topk(classes_border_list, classes_offset_list, scores_list: np.ndarray, top_num=1):
+    """
+    Get the predicted labels based on the topK.
+
+    Args:
+        classes_border_list: the [start, end] class-index borders of each layer
+        classes_offset_list: the index offset of each layer's first class
+        scores_list: the predicted scores of all classes provided by the network
+        top_num: the max topK number (default: 1)
+    Returns:
+        predicted_onehot_labels: The predicted labels (onehot)
+    """
+    pred_onehot_labels = []
+    scores_list = np.ndarray.tolist(scores_list)
+    border, offset = classes_border_list, classes_offset_list
+    num_level = len(border)
+    for scores in scores_list:
+        onehot_labels_list = [0] * len(scores)
+        hlabels = {}
+        for level in range(num_level):
+            begin, end = border[level][0], border[level][1]
+            cur_scores = scores[begin: end + 1]
+            cur_offset = offset[level]
+            cur_onehot_labels_list = [0] * len(cur_scores)
+            # pred_onehot_scores[level].append(cur_scores)
+            max_num_index_list = list(map(cur_scores.index, heapq.nlargest(top_num, cur_scores)))
+            for i in max_num_index_list:
+                cur_onehot_labels_list[i] = 1
+                onehot_labels_list[i + cur_offset] = 1
+            hlabels[level] = cur_onehot_labels_list
+        # pred_onehot_scores[-1].append(scores)
+        hlabels[num_level] = onehot_labels_list
+        pred_onehot_labels.append(hlabels)
+    return pred_onehot_labels
+
+
+def compute_perfs(pred_labels: np.ndarray, true_labels: np.ndarray) -> tuple:
+    # TP: number of labels which is predicted as True and is actually True.
+    TP = np.sum(pred_labels * true_labels)
+    # FP: number of labels which is predicted as True and is actually False.
+    FP = np.sum(((pred_labels - true_labels) > 0).astype(np.int32))
+    # FN: number of labels which is predicted as False and is actually True.
+    FN = np.sum(((true_labels - pred_labels) > 0).astype(np.int32))
+    # TN: number of labels which is predicted as False and is actually False.
+    TN = np.sum(((pred_labels + true_labels) == 0).astype(np.int32))
+    return np.array([TP, FP, FN, TN], dtype=np.int32)
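+
+
+# Quick sanity check for compute_perfs (illustrative values only): with
+# pred_labels = np.array([1, 0, 1, 0]) and true_labels = np.array([1, 1, 0, 0])
+# it returns [TP, FP, FN, TN] = [1, 1, 1, 1], and metric() further down this
+# file then yields precision = recall = f1 = acc = 0.5.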
+
+
+def compute_perfs_per_layer(outputs: np.ndarray, true_labels: np.ndarray, hierarchy: dict, classes_border_list: list, keep_consistency: bool = True, threshold=0.5) -> tuple:
+    def _make_labels_consistent(input_labels: np.ndarray, hierarchy: dict):
+        input_labels = input_labels.astype(np.int32)
+        while len(hierarchy) > 0:
+            bottom_labels = set(hierarchy.keys()) - set(hierarchy.values())
+            for child in bottom_labels:
+                mask = (input_labels[:, child] == 1).astype(np.int32)
+                input_labels[:, hierarchy[child]] |= mask
+            for k in bottom_labels:
+                hierarchy.pop(k)
+        return input_labels
+
+    preds = []
+    for (start, end) in classes_border_list:
+        threshold_labels = (outputs[:, start:end + 1] >= threshold).astype(np.int32)
+        max_labels = (outputs[:, start:end + 1] == outputs[:, start:end + 1].max(axis=1)[:, None]).astype(np.int32)
+        preds.append(threshold_labels | max_labels)
+    pred_labels = np.concatenate(preds, axis=-1)
+    del preds
+    if keep_consistency:
+        pred_labels = _make_labels_consistent(pred_labels, hierarchy.copy())
+        true_labels = _make_labels_consistent(true_labels, hierarchy.copy())
+    # get perfs per layer
+    perfs_per_layer = []
+    for (start, end) in classes_border_list:
+        perfs_per_layer.append(compute_perfs(pred_labels[:, start:end + 1], true_labels[:, start:end + 1]))
+    total_perfs = compute_perfs(pred_labels, true_labels)
+    return perfs_per_layer, total_perfs
+
+
+def compute_topk_recall(topk_preds: list, true_labels: list) -> float:
+    rs = []
+    for pred, label in zip(topk_preds, true_labels):
+        _r = len(set(pred) & set(label)) / len(label)
+        rs.append(_r)
+    return np.mean(rs)
+
+
+def quantile(array: torch.Tensor, ratio: float):
+    """
+    get the quantile of a 1-D array
+    """
+    assert ratio >= 0 and ratio <= 1
+    assert len(array.shape) == 1
+    sorted_array = torch.sort(array, dim=-1, descending=True)[0]
+    index = min(int(len(array) * ratio + 0.5), len(array) - 1)
+    return sorted_array[index].item()
+
+
+def metric(TP, FP, FN, TN):
+    def _f1_score(precision, recall):
+        if precision + recall == 0:
+            return 0.
+ else: + return 2 * precision * recall / (precision + recall) + if TP + FP == 0: + precision = 0 + else: + precision = TP / (TP + FP) + if TP + FN == 0: + recall = 0 + else: + recall = TP / (TP + FN) + micro_f1 = _f1_score(precision, recall) + acc = (TP + TN) / (TP + FP + FN + TN) + return precision, recall, micro_f1, acc + + +def metric_ood(Ex_in_joint: torch.FloatTensor, Ex_out_joint: torch.FloatTensor, quantile_value: float): + assert len(Ex_in_joint.shape) == 1 and len(Ex_out_joint.shape) == 1 + pred_ood_in = torch.sum((Ex_in_joint <= quantile_value).float()).item() + pred_ood_out = torch.sum((Ex_out_joint <= quantile_value).float()).item() + true_ood_num = Ex_out_joint.shape[0] + TP = pred_ood_out + FP = pred_ood_in + FN = true_ood_num - pred_ood_out + return metric(TP, FP, FN, 0) \ No newline at end of file diff --git a/examples/downstream/paper_seg/data.py b/examples/downstream/paper_seg/load_data.py similarity index 100% rename from examples/downstream/paper_seg/data.py rename to examples/downstream/paper_seg/load_data.py diff --git a/examples/downstream/paper_seg/paper_seg.ipynb b/examples/downstream/paper_seg/paper_seg.ipynb index 598e0290..dfede4d6 100644 --- a/examples/downstream/paper_seg/paper_seg.ipynb +++ b/examples/downstream/paper_seg/paper_seg.ipynb @@ -27,7 +27,7 @@ "import torch\n", "from torch.utils.data import DataLoader\n", "from torch.utils.tensorboard import SummaryWriter\n", - "from data import VecDataset\n", + "from load_data import VecDataset\n", "from trainer import MyTrainer\n", "from model import PaperSegModel\n", "from utils import get_logger, ROOT_DIR" @@ -59,9 +59,9 @@ "metadata": {}, "outputs": [], "source": [ - "args.train_data_path = f\"{args.data_path}/train/paper_txt_tagged/{args.subject}\"\n", - "args.valid_data_path = f\"{args.data_path}/valid/paper_txt_tagged/{args.subject}\"\n", - "args.test_data_path = f\"{args.data_path}/test/paper_txt_tagged/{args.subject}\"" + "args.train_data_path = f\"{args.data_path}/train/{args.subject}/paper_txt_tagged\"\n", + "args.valid_data_path = f\"{args.data_path}/valid/{args.subject}/paper_txt_tagged\"\n", + "args.test_data_path = f\"{args.data_path}/test/{args.subject}/paper_txt_tagged\"" ] }, { @@ -95,7 +95,7 @@ "source": [ "train_set = VecDataset(\n", " text_data_dir=args.train_data_path,\n", - " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".train.pt\",\n", + " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\",\n", " mode=\"train\",\n", " pretrained_model_type=args.pretrained_model_type,\n", " pretrained_model_dir=args.pretrained_model_dir,\n", @@ -103,7 +103,7 @@ " )\n", "valid_set = VecDataset(\n", " text_data_dir=args.valid_data_path,\n", - " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".valid.pt\",\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\",\n", " mode=\"valid\",\n", " pretrained_model_type=args.pretrained_model_type,\n", " pretrained_model_dir=args.pretrained_model_dir,\n", @@ -186,7 +186,8 @@ " model=model,\n", " logger=logger,\n", ")\n", - "trainer.valid(test_loader)" + "trainer.valid(test_loader)\n", + "logger.info(\"Finish testing ... 
\")" ] } ], diff --git a/examples/downstream/paper_seg/samples/train/math/paper_1.txt b/examples/downstream/paper_seg/samples/train/math/paper_1.txt new file mode 100644 index 00000000..559bdd16 --- /dev/null +++ b/examples/downstream/paper_seg/samples/train/math/paper_1.txt @@ -0,0 +1,82 @@ +================= +2017年云南省临沧市临翔区民族中学高考数学三模试卷(文科) +选择题 +================= +1. 已知集合,,则 \ ( ) $ +A. B. C. D. +================= +2. 已知复数,则复数的模为 \ ( ) $ +A. B. C. D. +================= +3. 已知点,,向量,若,则为 \ ( ) $ +A. B. C. D. +================= +4. 已知函数满足,且当时,成立,若,,,则,,的大小关系是 \ ( ) $ +A. B. C. D. +================= +5.如图的程序框图的算法思路源于数学名著几何原本中的“辗转相除法”,执行该程序框图图中“”表示除以的余数\ ( ) ab485270b=( +A. B. C. D. +================= +6. 某三棱锥的三视图如图所示,则该三棱锥的表面积为 \ ( ) $ +A. B. C. D. +================= +7. 曲线在点处的切线与轴、轴围成的封闭图形的面积为 \ ( ) $ +A. B. C. D. +================= +8. 已知,则 \ ( ) $ +A. B. C. D. +================= +9. 下列说法正确的个数是 \ ( ) $ +若为奇函数,则; +“在中,若,则”的逆命题是假命题; +“三个数,,成等比数列”是“”的既不充分也不必要条件; +命题“,”的否定是“,”. +A. B. C. D. +================= +10. 将函数的图象向右平移个单位后得到的图象的一个对称轴是 \ ( ) $ +A. B. C. D. +================= +11. 已知等差数列的公差,且,,成等比数列,若,是数列的前项和,则的最小值为 \ ( ) $ +A. B. C. D. +================= +12. 已知焦点为的抛物线上有一点,以为圆心,为半径的圆被轴截得的弦长为,则 \ ( ) $ +A. B. C. D. +================= +填空题 +================= +13. 点是不等式组表示的平面区域内的一动点,且不等式恒成立,则的取值范围是 _____ . +================= +14. 已知的内角,,所对的边分别为,,,且,,,则的值为 _____ . +================= +15. 已知正四面体的棱长为,为棱的中点,过作其外接球的截面,则截面面积的最小值为 _____ . +================= +16. 设函数的图象与的图象关于直线对称,且,则 _____ . +================= +简答题 +================= +17. 已知数列的前项和 +Ⅰ求数列的通项公式; +Ⅱ若,求数列的前项和. +================= +18. +================= +19. 如图,在直角梯形中,,,,是中点,将沿折起,使得面; +Ⅰ求证:平面平面; +Ⅱ若是的中点求三棱锥的体积. +================= +20. 已知椭圆:的离心率为,过的左焦点的直线:,直线被圆:截得的弦长为. +Ⅰ求椭圆的方程; +Ⅱ设的右焦点为,在圆上是否存在点,满足,若存在,指出有几个这样的点不必求出点的坐标;若不存在,说明理由. +================= +21. 已知函数为常数 +当时,求函数的单调区间; +求时,不等式恒成立,求实数的取值范围. +================= +22. 在直角坐标系中,曲线为参数,,其中,在以为极点,轴正半轴为极轴的极坐标系中,曲线:,曲线. +Ⅰ求与交点的直角坐标系; +Ⅱ若与相交于点,与相交于点,求的最大值. +================= +23. 设函数 +Ⅰ解不等式; +Ⅱ当时,,求实数的取值范围. +================= From dea9530fd090f5da76ce6a3cdaa115503c9d82c4 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Mon, 3 Jul 2023 00:43:26 +0800 Subject: [PATCH 34/58] [docs] Add demo for quality evaluate --- examples/downstream/quality_evaluate.ipynb | 292 +++++++++++++++++++++ examples/downstream/train.py | 129 +++++++++ 2 files changed, 421 insertions(+) create mode 100644 examples/downstream/quality_evaluate.ipynb create mode 100644 examples/downstream/train.py diff --git a/examples/downstream/quality_evaluate.ipynb b/examples/downstream/quality_evaluate.ipynb new file mode 100644 index 00000000..7b199588 --- /dev/null +++ b/examples/downstream/quality_evaluate.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 下游任务Demo:质量评估" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
数据准备"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import Dataset\n",
+    "from transformers import BertTokenizer as HfBertTokenizer\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "import pandas as pd\n",
+    "\n",
+    "class IdsDataset(Dataset):\n",
+    "    def __init__(self, \n",
+    "                 mode,\n",
+    "                 data_path=\"../data\",\n",
+    "                 tokenizer=None,\n",
+    "                 ):\n",
+    "        self.mode = mode\n",
+    "        self.data = pd.read_csv(data_path)\n",
+    "        self.data['answers'] = self.data['answers'].fillna('')\n",
+    "        self.data = self.data.to_dict(orient=\"records\")  # [:20]\n",
+    "\n",
+    "        self.tokenizer = tokenizer\n",
+    "        self.preprocess()\n",
+    "\n",
+    "    def preprocess(self):\n",
+    "        for item in tqdm(self.data):\n",
+    "            return_tensors = None if isinstance(self.tokenizer, HfBertTokenizer) else False\n",
+    "\n",
+    "            answers_encodings = self.tokenizer(item['answers'], truncation=True, padding=True, max_length=512, return_tensors=return_tensors)\n",
+    "            contexts_encodings = self.tokenizer(item['contexts'], truncation=True, padding=True, max_length=512, return_tensors=return_tensors)\n",
+    "            item[\"answers_encodings\"] = answers_encodings\n",
+    "            item[\"contexts_encodings\"] = contexts_encodings\n",
+    "\n",
+    "    def __getitem__(self, index):\n",
+    "        item = self.data[index]\n",
+    "        return item\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.data)\n",
+    "    \n",
+    "    def collate_fn(self, batch_data):\n",
+    "        first = batch_data[0]\n",
+    "        batch = {\n",
+    "            k: [item[k] for item in batch_data] for k in first.keys()\n",
+    "        }\n",
+    "        batch[\"answers_encodings\"] = self.tokenizer.batch_pad(batch[\"answers_encodings\"], return_tensors=True)\n",
+    "        batch[\"contexts_encodings\"] = self.tokenizer.batch_pad(batch[\"contexts_encodings\"], return_tensors=True)\n",
+    "        batch[\"score\"] = torch.as_tensor(batch[\"score\"])\n",
+    "        batch[\"label\"] = torch.as_tensor(batch[\"label\"])\n",
+    "        return batch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 以 Bert 为例\n",
+    "from EduNLP.Pretrain import BertTokenizer\n",
+    "from torch.utils.data import DataLoader\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained(path=\"/path/to/bert/checkpoint\")\n",
+    "trainData = IdsDataset(mode='train', data_path=\"/path/to/train.csv\", tokenizer=tokenizer)\n",
+    "validData = IdsDataset(mode='valid', data_path=\"/path/to/valid.csv\", tokenizer=tokenizer)\n",
+    "testData = IdsDataset(mode='test', data_path=\"/path/to/test.csv\", tokenizer=tokenizer)\n",
+    "\n",
+    "train_Dataloader = DataLoader(trainData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=trainData.collate_fn)\n",
+    "valid_Dataloader = DataLoader(validData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=validData.collate_fn)\n",
+    "test_Dataloader = DataLoader(testData, shuffle=True, num_workers=0, pin_memory=True, collate_fn=testData.collate_fn)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. 
质量评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "from transformers import BertModel\n", + "from transformers.modeling_outputs import ModelOutput\n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", + "import os\n", + "import json\n", + "\n", + "\n", + "class Global_Layer(nn.Module):\n", + " \"\"\"\n", + " MultiModal global layer\n", + " \"\"\"\n", + " def __init__(self, input_unit, output_unit, hidden_size, dropout_rate=0.5):\n", + " super().__init__()\n", + " self.net = nn.Sequential(\n", + " nn.Linear(input_unit, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Dropout(dropout_rate),\n", + " nn.Linear(hidden_size, output_unit),\n", + " # nn.Softmax(dim=1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " x = self.net(x)\n", + " return x\n", + " \n", + " \n", + "class QualityLoss(nn.Module):\n", + " def __init__(self, mode='train'):\n", + " super(QualityLoss, self).__init__()\n", + " if mode=='train':\n", + " self.classify_loss_fn = nn.CrossEntropyLoss()\n", + " self.logits_loss_fn = nn.MSELoss()\n", + " else:\n", + " self.classify_loss_fn = nn.CrossEntropyLoss(reduction='sum')\n", + " self.logits_loss_fn = nn.MSELoss(reduction='sum')\n", + "\n", + " def forward(self, pred_score, pred_label,score, label, lamb=0.5):\n", + " # Loss\n", + " score_loss = self.logits_loss_fn(pred_score, score.float())\n", + " label_loss = self.classify_loss_fn(pred_label, label)\n", + " losses = score_loss*lamb + label_loss*(1-lamb)\n", + " return losses\n", + " \n", + " \n", + "class TrainForQualityOutput(ModelOutput):\n", + " loss: torch.FloatTensor = None\n", + " score_logits: torch.FloatTensor = None\n", + " label_logits: torch.FloatTensor = None\n", + "\n", + "\n", + "class QualityModel(BaseModel):\n", + " def __init__(self, pretrained_model_type=\"bert\", pretrained_model_dir=None, emb_mode=\"index\", hidden_size=None, num_labels=3, dropout_rate=0.5):\n", + " super().__init__()\n", + " self.pretrained_model_type = pretrained_model_type\n", + " self.emb_mode = emb_mode\n", + " self.num_labels = num_labels\n", + " if emb_mode == \"index\":\n", + " assert hidden_size is None\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir)\n", + " self.hidden_size = self.bert.config.hidden_size # 768\n", + " else: # vector\n", + " assert hidden_size is not None\n", + " self.hidden_size = hidden_size\n", + "\n", + " self.score_decoder = Global_Layer(input_unit=self.hidden_size*2,\n", + " output_unit=1,\n", + " hidden_size=self.hidden_size,\n", + " dropout_rate=dropout_rate)\n", + " self.label_decoder = Global_Layer(input_unit=self.hidden_size*2,\n", + " output_unit=num_labels,\n", + " hidden_size=self.hidden_size,\n", + " dropout_rate=dropout_rate) \n", + " self.criterion = QualityLoss()\n", + "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\", \"bert\"]}\n", + " self.config['architecture'] = 'QualityModel'\n", + "\n", + " def forward(self,\n", + " context_vectors=None,\n", + " answer_vectors=None,\n", + " contexts_encodings=None,\n", + " answers_encodings=None,\n", + " score=None,\n", + " label=None,\n", + " **argv,\n", + " ):\n", + " \"\"\"\n", + " batch_sentences : [batch_size, seq]\n", + " \"\"\"\n", + " if self.emb_mode == \"index\":\n", + " if self.pretrained_model_type in [\"bert\", \"roberta\"]:\n", + " contexts_encoder_out = self.bert(**contexts_encodings)\n", + " answers_encoder_out = self.bert(**answers_encodings)\n", + " context_vectors = 
contexts_encoder_out[1] # [batch_size, hidden_size]\n", + " answer_vectors = answers_encoder_out[1] # [batch_size, hidden_size]\n", + "\n", + " elif self.pretrained_model_type == \"jiuzhang\":\n", + " contexts_encoder_out = self.bert(\n", + " input_ids=contexts_encodings[\"input_ids\"],\n", + " attention_mask=contexts_encodings[\"attention_mask\"],\n", + " )\n", + " answers_encoder_out = self.bert(\n", + " input_ids=answers_encodings[\"input_ids\"],\n", + " attention_mask=answers_encodings[\"attention_mask\"],\n", + " )\n", + " context_vectors = contexts_encoder_out[\"last_hidden_state\"][:, 0, :]\n", + " answer_vectors = answers_encoder_out[\"last_hidden_state\"][:, 0, :]\n", + " \n", + " elif self.pretrained_model_type == \"disenq\":\n", + " contexts_encoder_out = self.bert(**contexts_encodings)\n", + " answers_encoder_out = self.bert(**answers_encodings)\n", + " context_vectors = contexts_encoder_out[1]\n", + " answer_vectors = answers_encoder_out[1]\n", + " else:\n", + " assert context_vectors is not None and answer_vectors is not None\n", + " pooler_state = torch.cat([context_vectors, answer_vectors],dim=-1)\n", + " score_logits = self.score_decoder(pooler_state).squeeze(-1)\n", + " label_logits = self.label_decoder(pooler_state)\n", + " \n", + " loss = None\n", + " if score is not None and label is not None:\n", + " loss = self.criterion(score_logits, label_logits, score, label, lamb=0.5)\n", + "\n", + " return TrainForQualityOutput(\n", + " loss=loss,\n", + " score_logits=score_logits,\n", + " label_logits=label_logits,\n", + " )\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config[\"pretrained_model_dir\"],\n", + " emb_mode=model_config[\"emb_mode\"],\n", + " hidden_size=model_config[\"hidden_size\"],\n", + " num_labels=model_config[\"num_labels\"],\n", + " dropout_rate=model_config[\"dropout_rate\"],\n", + " pretrained_model_type=model_config[\"pretrained_model_type\"]\n", + " )\n", + " \n", + " def save_config(self, config_dir):\n", + " config_path = os.path.join(config_dir, \"config.json\")\n", + " with open(config_path, \"w\", encoding=\"utf-8\") as wf:\n", + " json.dump(self.config, wf, ensure_ascii=False, indent=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from train import MyTrainer\n", + "\n", + "# Initial model\n", + "checkpoint_dir=\"your/checkpoint_dir\"\n", + "device = \"cuda:0\"\n", + "model = QualityModel.from_pretrained(checkpoint_dir).to(device)\n", + "trainer = MyTrainer(\n", + " model=model,\n", + ")\n", + "trainer.train(train_Dataloader, valid_Dataloader)\n", + "trainer.valid(valid_Dataloader)\n", + "trainer.valid(test_Dataloader)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "edunlp", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.7.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/downstream/train.py b/examples/downstream/train.py new file mode 100644 index 00000000..0075caf4 --- /dev/null +++ b/examples/downstream/train.py @@ -0,0 +1,129 @@ +import os +import sys +sys.path.append(os.path.dirname(__file__)) +import math +import numpy as np +import torch +from tqdm import tqdm +import logging +from datetime import datetime +from argparse 
import ArgumentParser +from torch.utils.data import DataLoader +from torchmetrics import MetricCollection, Accuracy, Precision, Recall, MeanSquaredError, MeanSquaredLogError,R2Score,PearsonCorrCoef +import platform +from torch.utils.tensorboard import SummaryWriter + + +def my_cuda_tensor(items, device): + for k, v in items.items(): + if isinstance(v, torch.Tensor): + items[k] = v.to(device) + elif isinstance(v, dict): + items[k] = my_cuda_tensor(v, device) + + return items + + +class MyTrainer(object): + def __init__(self, args, model, optimizer=None, scheduler=None, logger=None, tensorboard_writer=None, **kwargs): + self.args = args + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.logger = logger + self.tensorboard_writer = tensorboard_writer + + self.classify_metric_collection = MetricCollection([ + Accuracy(task="multiclass", num_classes=3, average="micro"), + # Precision(task="multiclass", num_classes=3, average="micro"), + # Recall(task="multiclass", num_classes=3, average="micro") + ]).to(args.device) + self.logits_metric_collection = MetricCollection([ + MeanSquaredError(), + # MeanSquaredLogError(), + R2Score(), + PearsonCorrCoef() + ]).to(args.device) + + def train(self, train_dataloader, valid_dataloader): + self._global_step = 0 + size = len(train_dataloader.dataset) // train_dataloader.batch_size + + best_valid_loss = 9999 + best_epoch = None + for epoch in tqdm(range(self.args.epochs)): + self.model.train() + # train + train_loss = 0 + self.logger.info(f"------ epoch {epoch} ------") + for idx, batch in enumerate(train_dataloader): + batch = my_cuda_tensor(batch, self.args.device) + # Compute prediction error + outputs = self.model(**batch) + score_logits, label_logits = outputs.score_logits, outputs.label_logits + + pred_label = torch.argmax(label_logits, dim=1) + self.classify_metric_collection.update(pred_label, batch["label"]) + self.logits_metric_collection.update(score_logits, batch["score"].float()) + + self.tensorboard_writer.add_scalar("train_loss", outputs.loss.item(), self._global_step) + self._global_step +=1 + train_loss += outputs.loss.item() + + # Backpropagation + loss = outputs.loss / self.args.grad_accum + loss.backward() + # 梯度积累 + if (idx+1) % self.args.grad_accum == 0: + self.optimizer.step() + self.optimizer.zero_grad() + train_loss /= size + + # train_metric + total_train_classify_metric = {k: v.item() for k,v in self.classify_metric_collection.compute().items()} + total_train_logits_metric = {k: v.item() for k,v in self.logits_metric_collection.compute().items()} + self.logger.info(f"train metric for epoch: {total_train_classify_metric},{total_train_logits_metric}") + self.classify_metric_collection.reset() + self.logits_metric_collection.reset() + + valid_loss, total_valid_classify_metric, total_valid_logits_metric = self.valid(valid_dataloader) + # log + for k, v in total_valid_classify_metric.items(): + self.tensorboard_writer.add_scalar(f"classify_metric/{k}", v, epoch) + for k, v in total_valid_logits_metric.items(): + self.tensorboard_writer.add_scalar(f"logits_metric/{k}", v, epoch) + self.tensorboard_writer.add_scalars(f"EpochLoss", {"TrainLoss": train_loss, "ValidLoss": valid_loss}, epoch) + + if valid_loss < best_valid_loss: + self.model.save_pretrained(self.args.checkpoint_dir) + best_valid_loss = valid_loss + best_epoch = epoch + self.logger.info(f"saving best model at epoch {epoch}...") + + self.logger.info(f"Finish training, best model is at epoch {best_epoch}...") + + def valid(self, 
valid_dataloader): + self.model.eval() + size = len(valid_dataloader.dataset) // valid_dataloader.batch_size + valid_loss = 0 + with torch.no_grad(): + for idx, batch in enumerate(valid_dataloader): + batch = my_cuda_tensor(batch, self.args.device) + # Compute prediction error + outputs = self.model(**batch) + score_logits, label_logits = outputs.score_logits, outputs.label_logits + valid_loss += outputs.loss.item() + + pred_label = torch.argmax(label_logits, dim=1) + self.classify_metric_collection.update(pred_label, batch["label"]) + self.logits_metric_collection.update(score_logits, batch["score"].float()) + + valid_loss /= size + + total_valid_classify_metric = {k: v.item() for k,v in self.classify_metric_collection.compute().items()} + total_valid_logits_metric = {k: v.item() for k,v in self.logits_metric_collection.compute().items()} + self.logger.info(f"Validation metric: {total_valid_classify_metric},{total_valid_logits_metric}") + self.classify_metric_collection.reset() + self.logits_metric_collection.reset() + + return valid_loss, total_valid_classify_metric, total_valid_logits_metric From 8087e3602b14bf78295a92b598885381f5963ebb Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Tue, 4 Jul 2023 13:23:42 +0000 Subject: [PATCH 35/58] fix demos --- .../downstream/difficulty_prediction.ipynb | 12 +- .../discrimination_prediction.ipynb | 12 +- examples/downstream/utils.py | 303 +----------------- 3 files changed, 19 insertions(+), 308 deletions(-) diff --git a/examples/downstream/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction.ipynb index 98ad02f7..9ac47ca5 100644 --- a/examples/downstream/difficulty_prediction.ipynb +++ b/examples/downstream/difficulty_prediction.ipynb @@ -24,11 +24,13 @@ "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", "import os\n", "from EduNLP.Pretrain import BertTokenizer \n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", "import json\n", "from utils import Dataset_bert, load_json, get_val, get_train\n", "\n", "ROOT = os.path.dirname(os.path.dirname(__file__))\n", - "DATA_DIR = os.path.join(ROOT, \"data\")" + "DATA_DIR = os.path.join(ROOT, \"data\")\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]= \"0\"" ] }, { @@ -103,7 +105,7 @@ " logits: torch.FloatTensor = None\n", " labels: torch.FloatTensor = None\n", "\n", - "class BertPrediction(nn.Module):\n", + "class BertPrediction(BaseModel):\n", " \n", " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", " super(BertPrediction, self).__init__() \n", @@ -122,9 +124,9 @@ " content=None,\n", " labels=None,\n", " ):\n", - " input_ids = content['input_ids'].squeeze(0)\n", - " attention_mask = content['attention_mask'].squeeze(0)\n", - " token_type_ids = content['token_type_ids'].squeeze(0)\n", + " input_ids = content['input_ids']\n", + " attention_mask = content['attention_mask']\n", + " token_type_ids = content['token_type_ids']\n", " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", " logits = self.sigmoid(self.classifier(item_embed))\n", " loss = F.mse_loss(logits.squeeze(0), labels)\n", diff --git a/examples/downstream/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction.ipynb index ab3a98d4..914d5e17 100644 --- a/examples/downstream/discrimination_prediction.ipynb +++ b/examples/downstream/discrimination_prediction.ipynb @@ -35,11 +35,13 @@ "from torchmetrics import MeanAbsoluteError, 
PearsonCorrCoef, SpearmanCorrCoef\n", "import os\n", "from EduNLP.Pretrain import BertTokenizer \n", + "from EduNLP.ModelZoo.base_model import BaseModel\n", "import json\n", "from utils import Dataset_bert, pre_disc\n", "\n", "ROOT = os.path.dirname(os.path.dirname(__file__))\n", - "DATA_DIR = os.path.join(ROOT, \"data\")" + "DATA_DIR = os.path.join(ROOT, \"data\")\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"]= \"0\"" ] }, { @@ -101,7 +103,7 @@ " logits: torch.FloatTensor = None\n", " labels: torch.FloatTensor = None\n", "\n", - "class BertPrediction(nn.Module):\n", + "class BertPrediction(BaseModel):\n", " \n", " def __init__(self, pretrained_model_dir=None, config=None, classifier_dropout=0.5):\n", "\n", @@ -123,9 +125,9 @@ " labels=None,\n", " ):\n", " \n", - " input_ids = content['input_ids'].squeeze(0)\n", - " attention_mask = content['attention_mask'].squeeze(0)\n", - " token_type_ids = content['token_type_ids'].squeeze(0) \n", + " input_ids = content['input_ids']\n", + " attention_mask = content['attention_mask']\n", + " token_type_ids = content['token_type_ids'] \n", " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", " logits = self.sigmoid(self.classifier(item_embed))\n", " loss = F.mse_loss(logits.squeeze(0), labels)\n", diff --git a/examples/downstream/utils.py b/examples/downstream/utils.py index 935dff6b..d55cf48a 100644 --- a/examples/downstream/utils.py +++ b/examples/downstream/utils.py @@ -1,56 +1,9 @@ from torch.utils.data import Dataset -from EduNLP.ModelZoo.utils import pad_sequence import torch -import numpy as np -from EduNLP.I2V import W2V, Bert -from EduNLP.Pretrain import BertTokenizer from tqdm import tqdm -from gensim.models import KeyedVectors import json -import os -import re -import warnings -import jieba import pandas as pd - -def check2mkdir(file_path): - dir_path = os.path.dirname(file_path) - if not os.path.exists(dir_path): - os.makedirs(dir_path) - -def save_json(save_data, output_path): - print("[save_json] start : {}".format(output_path)) - check2mkdir(output_path) - with open(output_path,'w+',encoding="utf-8") as f: - for row_dic in save_data: - try: - jsondata=json.dumps(row_dic, ensure_ascii=False) - f.write(jsondata + "\n") - except Exception as e: - print("[Exception] at {}:\n{}\n".format(row_dic, e)) - raise Exception("[save_json] 出现错误") - print("[save_json] num = {}, open_path = {}".format(len(save_data), output_path)) - - -def get_json(open_path, error_handler="raise"): - print("[get_json] start : {}".format(open_path)) - load_data = [] - i = 0 - with open(open_path, 'r', encoding="utf-8") as f: - try: - for line in f: - load_data.append(json.loads(line)) - i += 1 - except Exception as e: - if error_handler == "ignore": - warnings.warn("[Warning] at line {}:\n{}\n".format(i, e)) - else: - print("[Exception] at line {}:\n{}\n".format(i, e)) - raise Exception("[get_json] 出现错误") - print("[get_json] num = {}, open_path = {}".format(len(load_data), open_path)) - return load_data - def load_json(open_path): print("[load_json] start : {}".format(open_path)) with open(open_path, "r", encoding='utf-8') as f: @@ -93,279 +46,33 @@ def get_val(val): test_gap.append([start, end]) start = end return test_data, test_gap - -class BaseDataset(Dataset): - def __init__(self, items, tokenizer, mode="train", labal_key="difficulty"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - self.labal_key = labal_key - - - def __getitem__(self, index): - item = 
self.items[index] - ret = self.tokenizer(item["content"]) - if self.mode in ["train", "val"]: - ret["labels"] = item[self.labal_key] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch = {key: torch.as_tensor(val) for key, val in batch.items()} - # print("[debug] batch final: ", batch) - return batch class Dataset_bert(Dataset): def __init__(self, items, tokenizer): self.tokenizer = tokenizer self.items = items - self.preprocess() - def preprocess(self): - for item in tqdm(self.items): - content_vector = self.tokenizer(item['content'], return_tensors="np") - item["content"] = content_vector - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = self.tokenizer.bert_tokenizer.pad(batch["content"], return_tensors='pt') - batch["labels"] = torch.as_tensor(batch["labels"]) - return batch - -class Dataset_bert_jiuzhang(Dataset): - def __init__(self, items, tokenizer): - self.tokenizer = tokenizer - self.items = items - - self.preprocess() def preprocess(self): for item in tqdm(self.items): - content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) + content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) + #content_vector = str() item["content"] = content_vector def __getitem__(self, index): item = self.items[index] return item - - def __len__(self): - return len(self.items) - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) #jiuzhang - batch["labels"] = torch.as_tensor(batch["labels"]) - return batch - -class DiffiultyDataset_w2v(Dataset): - def __init__(self, items, pretrained_path, mode="train"): - self.pretrained_path = pretrained_path - self.items = items - self.mode = mode - self.i2v = W2V("pure_text", "w2v", pretrained_path) - self.preprocess() - - def preprocess(self): - for item in tqdm(self.items): - item['content'] = torch.FloatTensor(np.array(self.i2v.infer_item_vector([item["content"]]))).squeeze(0) - - def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = torch.stack(batch["content"]) - batch["labels"] = torch.as_tensor(batch["labels"]) - batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} - return batch - -class PubWord2Vector(object): - def __init__(self, pretrained_path, language=""): - self.language = language - if language == "mix": - assert os.path.isdir(pretrained_path) - - self.eng_w2v_path = f"{pretrained_path}/glove.6B.300d.word2vec" - self.chs_w2v_path = f"{pretrained_path}/sgns.baidubaike.bigram-char.bz2" - - self.eng_w2v = 
KeyedVectors.load_word2vec_format(self.eng_w2v_path, binary=False) - self.chs_w2v = KeyedVectors.load_word2vec_format(self.chs_w2v_path, binary=False) - else: - assert os.path.isfile(pretrained_path) - try: - self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=False) - except Exception as e: - print(e) - self.w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=True) - def word_to_embedding(self, word): - if self.language: - if re.search(u'[\u4E00-\u9FA5]', word): - if word in self.chs_w2v: - return self.chs_w2v[word] - else: - count = 0 - temp_array = np.zeros([300]) - for i in word: - if i in self.chs_w2v: - temp_array += self.chs_w2v[i] - count += 1 - if count != 0: - temp_array /= count - return temp_array - else: - if word in self.eng_w2v: - return self.eng_w2v[word] - elif word.lower() in self.eng_w2v: - return self.eng_w2v[word.lower()] - else: - temp_array = np.zeros([300]) - return temp_array - else: - if word in self.w2v: - return self.w2v[word] - elif word.lower() in self.w2v: - return self.w2v[word.lower()] - else: - temp_array = np.zeros([300]) - return temp_array - - @property - def vector_size(self): - return 300 - -class DiffiultyDataset_w2v_pub(Dataset): - def __init__(self, items, pretrained_path, mode="train"): - self.pretrained_path = pretrained_path - self.items = items - self.mode = mode - self.i2v = PubWord2Vector(pretrained_path) - self.preprocess() - - def preprocess(self): - for item in tqdm(self.items): - words = jieba.lcut(item["content"]) - item_vector = [] - for word in words: - temp_emb = self.i2v.word_to_embedding(word) - item_vector.append(temp_emb) - item_vector = torch.FloatTensor(np.mean(item_vector, axis=0)) if item_vector else torch.FloatTensor(np.zeros([300])) - item['content'] = item_vector - - def __getitem__(self, index): - item = self.items[index] - return item - + def __len__(self): return len(self.items) - - + def collate_fn(self, batch_data): first = batch_data[0] batch = { k: [item[k] for item in batch_data] for k in first.keys() } - - batch["content"] = torch.stack(batch["content"]) + batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) batch["labels"] = torch.as_tensor(batch["labels"]) - batch = {key: torch.as_tensor(val).squeeze(0) for key, val in batch.items()} return batch -class DiscriminationDataset(Dataset): - def __init__(self, items, tokenizer, mode="train"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - - - def __getitem__(self, index): - item = self.items[index] - ret = self.tokenizer(item["content"]) - if self.mode in ["train", "val"]: - ret["labels"] = item["labels"] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.bert_tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch["content"] = torch.tensor( [it.cpu().detach().numpy() for it in batch["content"]] ) - batch["labels"] = torch.tensor([it.cpu().detach().numpy() for it in batch["labels"]]) - return batch - - -class ReliabilityDataset(Dataset): - def __init__(self, items, tokenizer, mode="train"): - self.tokenizer = tokenizer - self.items = items - self.mode = mode - - - def __getitem__(self, index): - item = self.items[index] - ret = self.tokenizer(item["content"]) - 
if self.mode in ["train", "val"]: - ret["labels"] = item["reliability"] - return ret - - def __len__(self): - return len(self.items) - - - def collate_fn(self, batch_data): - bert_tokenizer=self.tokenizer.bert_tokenizer - pad_idx = bert_tokenizer.vocab.get(bert_tokenizer.unk_token) - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - for k,v in batch.items(): - if k != "labels": - batch[k] = pad_sequence(v, pad_val=pad_idx) - - batch = {key: torch.as_tensor(val) for key, val in batch.items()} - return batch \ No newline at end of file From 024d997d6df954c5ef90c7897dbf2a8979c71da1 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Wed, 5 Jul 2023 18:02:14 +0000 Subject: [PATCH 36/58] update examples --- examples/downstream/knowledge/konw_pred.ipynb | 17 ++++++++++------- examples/downstream/knowledge/utils.py | 13 +------------ examples/downstream/paper_seg/load_data.py | 1 + examples/downstream/paper_seg/paper_seg.ipynb | 6 +++--- tests/test_pretrain/test_pretrained_quesnet.py | 4 ---- 5 files changed, 15 insertions(+), 26 deletions(-) diff --git a/examples/downstream/knowledge/konw_pred.ipynb b/examples/downstream/knowledge/konw_pred.ipynb index b79bec6f..d1e82238 100644 --- a/examples/downstream/knowledge/konw_pred.ipynb +++ b/examples/downstream/knowledge/konw_pred.ipynb @@ -104,6 +104,7 @@ "metadata": {}, "outputs": [], "source": [ + "# 针对多标签任务处理标签\n", "class EvalDataset(torch.utils.data.Dataset):\n", " def __init__(self, data) -> None:\n", " self.data = data\n", @@ -112,11 +113,11 @@ "\n", " def __getitem__(self, idx):\n", " text, labels = self.data[idx][\"ques_content\"], self.data[idx][\"know_list\"]\n", - " token_dicts = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')\n", - " for k, v in token_dicts.items():\n", - " token_dicts[k] = torch.squeeze(v, dim=0)\n", + " encodings = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')\n", + " for k, v in encodings.items():\n", + " encodings[k] = torch.squeeze(v, dim=0)\n", " one_hot_labels = [1. if idx in labels else 0. 
for idx in range(self.num_classes)]\n", - " return token_dicts, torch.FloatTensor(one_hot_labels)\n", + " return encodings, torch.FloatTensor(one_hot_labels)\n", "\n", " def __len__(self):\n", " return len(self.data)\n", @@ -185,9 +186,12 @@ " k_total_perfs = compute_perfs(flat_pred_topk, eval_batch_labels.cpu().detach().numpy().tolist())\n", " k_total_perfs_list[i_k] += k_total_perfs\n", "\n", + "# metric for overall\n", "micro_precision, micro_recall, micro_f1, total_acc = metric(*total_perfs)\n", + "print(f\"Eval Results: Micro-Precision: {micro_precision:.4f}, \"\n", + " + f\"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}\")\n", "\n", - "# metrics for per layer top_k\n", + "# metrics for per top_k\n", "for i_k, k_total_perfs in enumerate(k_total_perfs_list):\n", " k = top_k_list[i_k]\n", " precision, recall, f1, acc = metric(*k_total_perfs)\n", @@ -197,8 +201,7 @@ "for layer_idx, perfs in enumerate(perfs_per_layer):\n", " precision, recall, f1, acc = metric(*perfs)\n", " print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")\n", - "print(f\"Eval Results: Micro-Precision: {micro_precision:.4f}, \"\n", - " + f\"Micro-Recall: {micro_recall:.4f}, Micro-F1: {micro_f1:.4f}, Acc: {total_acc:.4f}\")\n", + "\n", "\n", "eval(eval_dataloader, )" ] diff --git a/examples/downstream/knowledge/utils.py b/examples/downstream/knowledge/utils.py index 581d4513..03cac7be 100644 --- a/examples/downstream/knowledge/utils.py +++ b/examples/downstream/knowledge/utils.py @@ -117,15 +117,4 @@ def _f1_score(precision, recall): recall = TP / (TP + FN) micro_f1 = _f1_score(precision, recall) acc = (TP + TN) / (TP + FP + FN + TN) - return precision, recall, micro_f1, acc - - -def metric_ood(Ex_in_joint: torch.FloatTensor, Ex_out_joint: torch.FloatTensor, quantile_value: float): - assert len(Ex_in_joint.shape) == 1 and len(Ex_out_joint.shape) == 1 - pred_ood_in = torch.sum((Ex_in_joint <= quantile_value).float()).item() - pred_ood_out = torch.sum((Ex_out_joint <= quantile_value).float()).item() - true_ood_num = Ex_out_joint.shape[0] - TP = pred_ood_out - FP = pred_ood_in - FN = true_ood_num - pred_ood_out - return metric(TP, FP, FN, 0) \ No newline at end of file + return precision, recall, micro_f1, acc \ No newline at end of file diff --git a/examples/downstream/paper_seg/load_data.py b/examples/downstream/paper_seg/load_data.py index 24d49265..6422ea00 100644 --- a/examples/downstream/paper_seg/load_data.py +++ b/examples/downstream/paper_seg/load_data.py @@ -22,6 +22,7 @@ def __init__(self, pretrained_model_type, pretrained_model_dir, device="cpu", la self.pretrained_model_type = pretrained_model_type self.pretrained_model_dir = pretrained_model_dir self.device = device + self.language = language tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir} diff --git a/examples/downstream/paper_seg/paper_seg.ipynb b/examples/downstream/paper_seg/paper_seg.ipynb index dfede4d6..5d885695 100644 --- a/examples/downstream/paper_seg/paper_seg.ipynb +++ b/examples/downstream/paper_seg/paper_seg.ipynb @@ -95,7 +95,7 @@ "source": [ "train_set = VecDataset(\n", " text_data_dir=args.train_data_path,\n", - " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\",\n", + " emb_data_path=args.train_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\"),\n", " mode=\"train\",\n", " pretrained_model_type=args.pretrained_model_type,\n", " pretrained_model_dir=args.pretrained_model_dir,\n", @@ -103,7 
+103,7 @@ " )\n", "valid_set = VecDataset(\n", " text_data_dir=args.valid_data_path,\n", - " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.train.pt\",\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.valid.pt\"),\n", " mode=\"valid\",\n", " pretrained_model_type=args.pretrained_model_type,\n", " pretrained_model_dir=args.pretrained_model_dir,\n", @@ -112,7 +112,7 @@ " )\n", "test_set = VecDataset(\n", " text_data_dir=args.test_data_path,\n", - " emb_data_path=args.test_data_path.replace(\"paper_txt_tagged\", f\"paper_emb_{args.pretrained_model_type}{i2v_postfix}\") + \".test.pt\",\n", + " emb_data_path=args.valid_data_path.replace(\"paper_txt_tagged\", \"emb.valid.pt\"),\n", " mode=\"test\",\n", " pretrained_model_type=args.pretrained_model_type,\n", " pretrained_model_dir=args.pretrained_model_dir,\n", diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index cb95bbfe..95a700ef 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -1,10 +1,7 @@ -from lib2to3.pgen2 import token import os - os.environ["CUDA_VISIBLE_DEVICES"] = "" os.environ["WANDB_DISABLED"] = "true" import pytest -import torch from EduNLP.ModelZoo.quesnet import QuesNet from EduNLP.Pretrain import QuesNetTokenizer, Question, pretrain_quesnet # from EduNLP.Pretrain import train_quesnet_for_property_prediction, train_quesnet_for_knowledge_prediction @@ -12,7 +9,6 @@ from EduNLP.Vector.quesnet import QuesNetModel from EduNLP.I2V import QuesNet as QuesNetI2V, get_pretrained_i2v from EduNLP.utils import abs_current_dir, path_append - from conftest import TEST_GPU From 01d97ecc39206d5f9feea5eed2695253bc534bab Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Tue, 11 Jul 2023 12:59:43 +0000 Subject: [PATCH 37/58] update demos --- .../downstream/difficulty_prediction.ipynb | 50 ++++++++++++------ .../discrimination_prediction.ipynb | 52 ++++++++++++------- 2 files changed, 68 insertions(+), 34 deletions(-) diff --git a/examples/downstream/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction.ipynb index 9ac47ca5..0d4b10a9 100644 --- a/examples/downstream/difficulty_prediction.ipynb +++ b/examples/downstream/difficulty_prediction.ipynb @@ -18,7 +18,7 @@ "from torch import nn\n", "import numpy as np\n", "from transformers.modeling_outputs import ModelOutput\n", - "from transformers import BertModel, TrainingArguments, Trainer\n", + "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig\n", "import torch.nn.functional as F\n", "from sklearn.metrics import ndcg_score, mean_squared_error\n", "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", @@ -78,11 +78,11 @@ } ], "source": [ - "output_dir = \"output/difficulty\" #模型保存路径\n", - "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", - "train_data = load_json(os.path.join(DATA_DIR, \"train\", \"高中数学.json\")) #训练集\n", + "output_dir = \"output/difficulty\" #设置模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #预训练的bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "train_data = load_json(os.path.join(DATA_DIR, \"train\", \"高中数学.json\")) #加载训练集\n", "train_items = get_train(train_data)\n", - "val_data = load_json(os.path.join(DATA_DIR, \"test\", \"高中数学paper.json\")) #测试集\n", + "val_data = load_json(os.path.join(DATA_DIR, \"test\", 
\"高中数学paper.json\")) #加载测试集\n", "val_items, val_gap = get_val(val_data)" ] }, @@ -105,18 +105,21 @@ " logits: torch.FloatTensor = None\n", " labels: torch.FloatTensor = None\n", "\n", - "class BertPrediction(BaseModel):\n", + "class BertForDifficultyPrediction(BaseModel):\n", " \n", " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", - " super(BertPrediction, self).__init__() \n", - " self.bert = BertModel.from_pretrained(pretrained_model_dir)\n", + " super(BertForDifficultyPrediction, self).__init__() \n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", " hidden_size = self.bert.config.hidden_size\n", - " # print(hidden_size)\n", " self.classifier_dropout = classifier_dropout\n", " self.dropout = nn.Dropout(classifier_dropout)\n", " self.classifier = nn.Linear(hidden_size, 1)\n", " self.sigmoid = nn.Sigmoid()\n", "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", + " self.config['architecture'] = 'BertForDifficultyPrediction'\n", + " self.config = PretrainedConfig.from_dict(self.config)\n", + "\n", " def forward(self,\n", " input_ids=None,\n", " attention_mask=None,\n", @@ -134,7 +137,17 @@ " loss = loss,\n", " logits = logits,\n", " labels = labels\n", - " )" + " )\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, *args, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config['pretrained_model_dir'],\n", + " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", + " )" ] }, { @@ -203,10 +216,10 @@ " pretrained_model_dir,\n", " train_items=None,\n", " val_items=None,\n", - " test_items=None,\n", " train_params=None):\n", - " model = BertPrediction(pretrained_model_dir=pretrained_model_dir)\n", - " tokenizer = BertTokenizer(add_special_tokens=True)\n", + " tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + " model = BertForDifficultyPrediction(pretrained_model_dir=pretrained_model_dir)\n", + " model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))\n", " # training parameters\n", " if train_params is not None:\n", " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", @@ -238,7 +251,7 @@ " num_train_epochs=epochs,\n", " per_device_train_batch_size=batch_size,\n", " per_device_eval_batch_size=batch_size,\n", - " evaluation_strategy = \"steps\", # epoch\n", + " evaluation_strategy = \"steps\", \n", " eval_steps=logging_steps*5,\n", " \n", " save_steps=save_steps,\n", @@ -260,9 +273,14 @@ " compute_metrics=compute_metrics\n", " )\n", "\n", - " trainer.train()\n", + " trainer.train() #训练模型\n", " trainer.save_model(output_dir)\n", - " tokenizer.save_pretrained(output_dir)" + " trainer.model.save_config(output_dir)\n", + " tokenizer.save_pretrained(output_dir) #保存训练后的模型\n", + " '''\n", + " 如果希望直接加载训练好的模型用于测试,可以将\"pretrained_model_dir\"设为训练好的模型的路径,然后直接使用trainer.evaluate()在测试集上评估\n", + " '''\n", + " trainer.evaluate() #在测试集上评估" ] }, { diff --git a/examples/downstream/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction.ipynb index 914d5e17..a71955d4 100644 --- a/examples/downstream/discrimination_prediction.ipynb +++ b/examples/downstream/discrimination_prediction.ipynb @@ -29,7 +29,7 @@ "from torch import nn\n", "import numpy as np\n", "from transformers.modeling_outputs import ModelOutput\n", - "from 
transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig\n",
     "import torch.nn.functional as F\n",
     "from sklearn.metrics import ndcg_score, mean_squared_error\n",
     "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n",
@@ -74,14 +74,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "output_dir = os.path.join(ROOT, \"output/discrimination\") #模型保存路径\n",
-    "pretrained_model_dir = \"../data/bert_math_768\" #bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n",
-    "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #训练集\n",
-    "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #测试集"
+    "output_dir = os.path.join(ROOT, \"output/discrimination\") #设置模型保存路径\n",
+    "pretrained_model_dir = \"../data/bert_math_768\" #预训练的bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n",
+    "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #加载训练集\n",
+    "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #加载测试集"
    ]
   },
@@ -103,20 +103,21 @@
     "    logits: torch.FloatTensor = None\n",
     "    labels: torch.FloatTensor = None\n",
     "\n",
-    "class BertPrediction(BaseModel):\n",
+    "class BertForDiscriminationPrediction(BaseModel):\n",
     "    \n",
     "    def __init__(self, pretrained_model_dir=None, config=None, classifier_dropout=0.5):\n",
-    "\n",
-    "        super(BertPrediction, self).__init__()\n",
-    "\n",
-    "        self.bert = BertModel.from_pretrained(pretrained_model_dir) \n",
-    "        \n",
+    "        super(BertForDiscriminationPrediction, self).__init__()\n",
+    "        self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True) \n",
     "        hidden_size = self.bert.config.hidden_size\n",
     "        self.classifier_dropout = classifier_dropout\n",
     "        self.dropout = nn.Dropout(classifier_dropout)\n",
     "        self.classifier = nn.Linear(hidden_size, 1)\n",
     "        self.sigmoid = nn.Sigmoid()\n",
     "\n",
+    "        self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n",
+    "        self.config['architecture'] = 'BertForDiscriminationPrediction'\n",
+    "        self.config = PretrainedConfig.from_dict(self.config)\n",
+    "\n",
     "    def forward(self,\n",
     "                input_ids=None,\n",
     "                attention_mask=None,\n",
@@ -135,7 +136,17 @@
     "            loss = loss,\n",
     "            logits = logits,\n",
     "            labels = labels\n",
-    "        )"
+    "        )\n",
+    "    \n",
+    "    @classmethod\n",
+    "    def from_config(cls, config_path, *args, **kwargs):\n",
+    "        with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n",
+    "            model_config = json.load(rf)\n",
+    "        model_config.update(kwargs)\n",
+    "        return cls(\n",
+    "            pretrained_model_dir=model_config['pretrained_model_dir'],\n",
+    "            classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n",
+    "        )\n"
    ]
   },
@@ -199,8 +210,9 @@
     "    train_items=None,\n",
     "    val_items=None,\n",
     "    train_params=None):\n",
-    "    model = BertPrediction(pretrained_model_dir=pretrained_model_dir) \n",
-    "    tokenizer = BertTokenizer(add_special_tokens=True)\n",
+    "    tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n",
+    "    model = BertForDiscriminationPrediction(pretrained_model_dir=pretrained_model_dir) \n",
+    "    model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))\n",
     "    \n",
     "    if train_params is not None:\n",
     "        epochs = train_params['epochs'] if 'epochs' in train_params else 1\n",
@@ -253,10 +265,14 @@
     "    compute_metrics=compute_metrics,\n",
     "    )\n",
     "\n",
-    "    trainer.train()\n",
+    "    trainer.train() #训练模型\n",
     "    
trainer.save_model(output_dir)\n", - " tokenizer.save_pretrained(output_dir)\n", - " #trainer.evaluate()" + " trainer.model.save_config(output_dir)\n", + " tokenizer.save_pretrained(output_dir) #保存训练后的模型\n", + " '''\n", + " 如果希望直接加载训练好的模型用于测试,可以将\"pretrained_model_dir\"设为训练好的模型的路径,然后直接使用trainer.evaluate()在测试集上评估\n", + " '''\n", + " trainer.evaluate() #在测试集上评估" ] }, { From a3c281cbc697435318e9f6748b403a598160e306 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Wed, 12 Jul 2023 05:36:04 +0000 Subject: [PATCH 38/58] clean the extra code --- examples/downstream/knowledge/konw_pred.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/downstream/knowledge/konw_pred.ipynb b/examples/downstream/knowledge/konw_pred.ipynb index d1e82238..c1a70468 100644 --- a/examples/downstream/knowledge/konw_pred.ipynb +++ b/examples/downstream/knowledge/konw_pred.ipynb @@ -200,10 +200,7 @@ "# metrics for per layer\n", "for layer_idx, perfs in enumerate(perfs_per_layer):\n", " precision, recall, f1, acc = metric(*perfs)\n", - " print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")\n", - "\n", - "\n", - "eval(eval_dataloader, )" + " print(f\"Layer {layer_idx + 1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Acc: {acc:.4f}\")" ] } ], From 0629ec36feb187cba81540a3cc7b488e5decda21 Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Wed, 12 Jul 2023 10:02:59 +0000 Subject: [PATCH 39/58] update demos --- .../downstream/difficulty_prediction.ipynb | 160 ++++++++++++++++-- .../discrimination_prediction.ipynb | 153 ++++++++++++++++- 2 files changed, 292 insertions(+), 21 deletions(-) diff --git a/examples/downstream/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction.ipynb index 0d4b10a9..add1f6c5 100644 --- a/examples/downstream/difficulty_prediction.ipynb +++ b/examples/downstream/difficulty_prediction.ipynb @@ -23,6 +23,7 @@ "from sklearn.metrics import ndcg_score, mean_squared_error\n", "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", "import os\n", + "import tqdm\n", "from EduNLP.Pretrain import BertTokenizer \n", "from EduNLP.ModelZoo.base_model import BaseModel\n", "import json\n", @@ -58,7 +59,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 加载数据,定义预训练模型路径" + "### 加载数据,定义路径" ] }, { @@ -78,20 +79,28 @@ } ], "source": [ - "output_dir = \"output/difficulty\" #设置模型保存路径\n", - "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #预训练的bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "output_dir = \"path/to/output_dir\" #设置模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #以预训练的bert路径为例,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "checkpoint_dir = \"path/to/difficulty_prediction_checkpoint\"\n", "train_data = load_json(os.path.join(DATA_DIR, \"train\", \"高中数学.json\")) #加载训练集\n", "train_items = get_train(train_data)\n", "val_data = load_json(os.path.join(DATA_DIR, \"test\", \"高中数学paper.json\")) #加载测试集\n", "val_items, val_gap = get_val(val_data)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义网络结构" + "#### 定义网络结构" ] }, { @@ -155,7 +164,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义评价指标" + "#### 定义评价指标" ] }, { @@ -199,7 +208,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义训练和测试相关参数" + "#### 定义训练和测试相关参数" ] }, { @@ 
-276,11 +285,7 @@ " trainer.train() #训练模型\n", " trainer.save_model(output_dir)\n", " trainer.model.save_config(output_dir)\n", - " tokenizer.save_pretrained(output_dir) #保存训练后的模型\n", - " '''\n", - " 如果希望直接加载训练好的模型用于测试,可以将\"pretrained_model_dir\"设为训练好的模型的路径,然后直接使用trainer.evaluate()在测试集上评估\n", - " '''\n", - " trainer.evaluate() #在测试集上评估" + " tokenizer.save_pretrained(output_dir) #保存训练后的模型" ] }, { @@ -288,7 +293,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 训练和测试" + "#### 训练模型" ] }, { @@ -305,6 +310,137 @@ " train_params= None\n", " )" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用于测试的网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DifficultyPredictionEval(BaseModel): \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", + " super(DifficultyPredictionEval, self).__init__()\n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", + " hidden_size = self.bert.config.hidden_size\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", + " self.config['architecture'] = 'BertForDifficultyPrediction'\n", + " self.config = PretrainedConfig.from_dict(self.config)\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " ):\n", + "\n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + " logits = self.sigmoid(self.classifier(item_embed))\n", + " return logits\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config['pretrained_model_dir'],\n", + " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 加载测试集和模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class EvalDataset(torch.utils.data.Dataset):\n", + " def __init__(self, items, tokenizer):\n", + " self.tokenizer = tokenizer\n", + " self.items = items\n", + " \n", + " def __getitem__(self, index):\n", + " content, labels = self.items[index][\"content\"], self.items[index][\"labels\"]\n", + " encodings = self.tokenizer(str(content), max_length=512, truncation=True, return_tensors=\"pt\")\n", + " for k, v in encodings.items():\n", + " encodings[k] = v\n", + " return encodings, torch.as_tensor([labels])\n", + " \n", + " def __len__(self):\n", + " return len(self.items)\n", + " \n", + "tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + "eval_dataloader = EvalDataset(\n", + " items=val_items,\n", + " tokenizer=tokenizer,\n", + " )\n", + "model = DifficultyPredictionEval.from_pretrained(checkpoint_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 在测试集上评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "def get_eval_results(pres, golds):\n", + " logits = torch.as_tensor(pres)\n", + " labels = torch.as_tensor(golds)\n", + " ret = {\n", + " \"mae\": MAE(logits, labels).tolist(),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels).tolist(),\n", + " \"scc\": SCC(logits, labels).tolist(),\n", + " 'ndcg @all, @10, @20, @30': testdata_metrics(val_gap, golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "\n", + "model.eval()\n", + "pred_list = []\n", + "label_list = []\n", + "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", + " input_data, eval_batch_labels = eval_batch\n", + " output_logits = model(**input_data)\n", + " pred_list.append(output_logits.squeeze(0).tolist()[0])\n", + " label_list.append(eval_batch_labels.tolist()[0])\n", + "\n", + "results = get_eval_results(pred_list, label_list)\n", + "print(f\"Test results: {results}\")\n", + "\n" + ] } ], "metadata": { diff --git a/examples/downstream/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction.ipynb index a71955d4..0b1c4f18 100644 --- a/examples/downstream/discrimination_prediction.ipynb +++ b/examples/downstream/discrimination_prediction.ipynb @@ -34,6 +34,7 @@ "from sklearn.metrics import ndcg_score, mean_squared_error\n", "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", "import os\n", + "import tqdm\n", "from EduNLP.Pretrain import BertTokenizer \n", "from EduNLP.ModelZoo.base_model import BaseModel\n", "import json\n", @@ -80,16 +81,24 @@ "source": [ "output_dir = os.path.join(ROOT, \"output/discrimination\") #设置模型保存路径\n", "pretrained_model_dir = \"../data/bert_math_768\" #预训练的bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "checkpoint_dir = \"path/to/difficulty_prediction_checkpoint\"\n", "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #加载训练集\n", "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #加载测试集" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 训练" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义网络结构" + "#### 定义网络结构" ] }, { @@ -154,7 +163,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义评价指标" + "#### 定义评价指标" ] }, { @@ -192,7 +201,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 定义训练和测试相关参数" + "#### 定义训练和测试相关参数" ] }, { @@ -268,11 +277,7 @@ " trainer.train() #训练模型\n", " trainer.save_model(output_dir)\n", " trainer.model.save_config(output_dir)\n", - " tokenizer.save_pretrained(output_dir) #保存训练后的模型\n", - " '''\n", - " 如果希望直接加载训练好的模型用于测试,可以将\"pretrained_model_dir\"设为训练好的模型的路径,然后直接使用trainer.evaluate()在测试集上评估\n", - " '''\n", - " trainer.evaluate() #在测试集上评估" + " tokenizer.save_pretrained(output_dir) #保存训练后的模型" ] }, { @@ -280,7 +285,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 训练和测试" + "#### 训练模型" ] }, { @@ -297,6 +302,136 @@ " train_params= None\n", ")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用于测试的网络结构" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class DiscriminationPredictionEval(BaseModel): \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", + " super(DiscriminationPredictionEval, self).__init__()\n", + " 
self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", + " hidden_size = self.bert.config.hidden_size\n", + " self.classifier_dropout = classifier_dropout\n", + " self.dropout = nn.Dropout(classifier_dropout)\n", + " self.classifier = nn.Linear(hidden_size, 1)\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", + " self.config['architecture'] = 'BertForDifficultyPrediction'\n", + " self.config = PretrainedConfig.from_dict(self.config)\n", + "\n", + " def forward(self,\n", + " input_ids=None,\n", + " attention_mask=None,\n", + " token_type_ids=None,\n", + " ):\n", + "\n", + " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", + " logits = self.sigmoid(self.classifier(item_embed))\n", + " return logits\n", + " \n", + " @classmethod\n", + " def from_config(cls, config_path, **kwargs):\n", + " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", + " model_config = json.load(rf)\n", + " model_config.update(kwargs)\n", + " return cls(\n", + " pretrained_model_dir=model_config['pretrained_model_dir'],\n", + " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 加载测试集和模型" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class EvalDataset(torch.utils.data.Dataset):\n", + " def __init__(self, items, tokenizer):\n", + " self.tokenizer = tokenizer\n", + " self.items = items\n", + " \n", + " def __getitem__(self, index):\n", + " content, labels = self.items[index][\"content\"], self.items[index][\"labels\"]\n", + " encodings = self.tokenizer(str(content), max_length=512, truncation=True, return_tensors=\"pt\")\n", + " for k, v in encodings.items():\n", + " encodings[k] = v\n", + " return encodings, torch.as_tensor([labels])\n", + " \n", + " def __len__(self):\n", + " return len(self.items)\n", + " \n", + "tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", + "eval_dataloader = EvalDataset(\n", + " items=val_items,\n", + " tokenizer=tokenizer,\n", + " )\n", + "model = DiscriminationPredictionEval.from_pretrained(checkpoint_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 在测试集上评估" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_eval_results(pres, golds):\n", + " logits = torch.as_tensor(pres)\n", + " labels = torch.as_tensor(golds)\n", + " ret = {\n", + " \"mae\": MAE(logits, labels).tolist(),\n", + " \"mse\": mean_squared_error(golds, pres),\n", + " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", + " \"pcc\": PCC(logits, labels).tolist(),\n", + " \"scc\": SCC(logits, labels).tolist(),\n", + " 'ndcg @all, @10, @20, @30': testdata_metrics(golds, pres).tolist(),\n", + " }\n", + " return ret\n", + "\n", + "model.eval()\n", + "pred_list = []\n", + "label_list = []\n", + "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", + " input_data, eval_batch_labels = eval_batch\n", + " output_logits = model(**input_data)\n", + " pred_list.append(output_logits.squeeze(0).tolist()[0])\n", + " label_list.append(eval_batch_labels.tolist()[0])\n", + "\n", + "results = get_eval_results(pred_list, label_list)\n", + "print(f\"Test results: {results}\")" + ] } ], "metadata": { From 
932d602d8728299e4a9e1214be3939974fb1b70d Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Fri, 14 Jul 2023 07:30:24 +0000 Subject: [PATCH 40/58] Fixed some bugs --- .../downstream/difficulty_prediction.ipynb | 195 ++++------------- .../discrimination_prediction.ipynb | 199 ++++-------------- examples/downstream/utils.py | 29 --- 3 files changed, 89 insertions(+), 334 deletions(-) diff --git a/examples/downstream/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction.ipynb index add1f6c5..57001873 100644 --- a/examples/downstream/difficulty_prediction.ipynb +++ b/examples/downstream/difficulty_prediction.ipynb @@ -10,15 +10,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "import numpy as np\n", - "from transformers.modeling_outputs import ModelOutput\n", - "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig\n", + "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig, DataCollatorWithPadding\n", "import torch.nn.functional as F\n", "from sklearn.metrics import ndcg_score, mean_squared_error\n", "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", @@ -26,8 +25,9 @@ "import tqdm\n", "from EduNLP.Pretrain import BertTokenizer \n", "from EduNLP.ModelZoo.base_model import BaseModel\n", + "from EduNLP.Pretrain import EduDataset\n", "import json\n", - "from utils import Dataset_bert, load_json, get_val, get_train\n", + "from utils import load_json, get_val, get_train\n", "\n", "ROOT = os.path.dirname(os.path.dirname(__file__))\n", "DATA_DIR = os.path.join(ROOT, \"data\")\n", @@ -36,18 +36,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/shangzi/anaconda3/envs/tgen/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Metric `SpearmanCorrcoef` will save all targets and predictions in the buffer. 
For large datasets, this may lead to large memory footprint.\n", - " warnings.warn(*args, **kwargs)\n" - ] - } - ], + "outputs": [], "source": [ "MAE = MeanAbsoluteError()\n", "PCC = PearsonCorrCoef()\n", @@ -64,20 +55,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[load_json] start : /data/shangzi/edunlp/gaokao-prediction/data/train/高中数学.json\n", - "[load_json] num = 3600, open_path = /data/shangzi/edunlp/gaokao-prediction/data/train/高中数学.json\n", - "[load_json] start : /data/shangzi/edunlp/gaokao-prediction/data/test/高中数学paper.json\n", - "[load_json] num = 7, open_path = /data/shangzi/edunlp/gaokao-prediction/data/test/高中数学paper.json\n" - ] - } - ], + "outputs": [], "source": [ "output_dir = \"path/to/output_dir\" #设置模型保存路径\n", "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #以预训练的bert路径为例,也可以更换为其他模型的路径,如disenqnet, roberta等\n", @@ -109,15 +89,9 @@ "metadata": {}, "outputs": [], "source": [ - "class DifficultyPredictionOutput(ModelOutput):\n", - " loss: torch.FloatTensor = None\n", - " logits: torch.FloatTensor = None\n", - " labels: torch.FloatTensor = None\n", - "\n", - "class BertForDifficultyPrediction(BaseModel):\n", - " \n", + "class BertForDifficultyPrediction(BaseModel): \n", " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", - " super(BertForDifficultyPrediction, self).__init__() \n", + " super(BertForDifficultyPrediction, self).__init__()\n", " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", " hidden_size = self.bert.config.hidden_size\n", " self.classifier_dropout = classifier_dropout\n", @@ -133,20 +107,16 @@ " input_ids=None,\n", " attention_mask=None,\n", " token_type_ids=None,\n", - " content=None,\n", " labels=None,\n", " ):\n", - " input_ids = content['input_ids']\n", - " attention_mask = content['attention_mask']\n", - " token_type_ids = content['token_type_ids']\n", + " \n", " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", - " logits = self.sigmoid(self.classifier(item_embed))\n", - " loss = F.mse_loss(logits.squeeze(0), labels)\n", - " return DifficultyPredictionOutput(\n", - " loss = loss,\n", - " logits = logits,\n", - " labels = labels\n", - " )\n", + "\n", + " logits = self.sigmoid(self.classifier(item_embed)).squeeze(0)\n", + " loss = None\n", + " if labels is not None:\n", + " loss = F.mse_loss(logits, labels) if labels is not None else None\n", + " return loss, logits\n", " \n", " @classmethod\n", " def from_config(cls, config_path, *args, **kwargs):\n", @@ -164,51 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 定义评价指标" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_metrics(pred):\n", - " logits = torch.as_tensor(pred.predictions[0]).squeeze(0)\n", - " logits = logits.view([logits.size()[0]],-1)\n", - " labels = torch.as_tensor(pred.label_ids)\n", - " print(\"logits\", logits)\n", - " print(\"labels\", labels)\n", - " pres = logits.numpy().tolist()\n", - " golds = labels.numpy().tolist()\n", - " ret = {\n", - " \"mae\": MAE(logits, labels),\n", - " \"mse\": mean_squared_error(golds, pres),\n", - " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", - " \"pcc\": PCC(logits, labels),\n", - " \"scc\": SCC(logits, labels),\n", - " 'ndcg': 
testdata_metrics(val_gap, golds, pres).tolist(),\n", - " }\n", - " return ret\n", - "def testdata_metrics(val_gap, diff, pred):\n", - " diff, pred = np.array(diff), np.array(pred)\n", - " idx = np.where(diff>0)[0]\n", - " ndcg = []\n", - " for s, e in val_gap:\n", - " _diff, _pred = diff[s:e], pred[s:e]\n", - " if _diff[0]==-1:\n", - " _diff = [i+1 for i in range(len(_diff))]\n", - " ndcg.append([ndcg_score([_diff], [_pred]), ndcg_score([_diff], [_pred], k=10), ndcg_score([_diff], [_pred], k=20), ndcg_score([_diff], [_pred], k=30)])\n", - " ndcg = np.mean(ndcg, axis=0)\n", - " return ndcg" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 定义训练和测试相关参数" + "#### 定义训练相关参数" ] }, { @@ -217,6 +143,9 @@ "metadata": {}, "outputs": [], "source": [ + "class BertDataset(EduDataset):\n", + " pass\n", + "\n", "class MyTrainer(Trainer):\n", " pass\n", "\n", @@ -250,8 +179,8 @@ " logging_dir = f\"{ROOT}/log\"\n", "\n", "\n", - " train_dataset = Dataset_bert(train_items, tokenizer)\n", - " eval_dataset = Dataset_bert(val_items, tokenizer)\n", + " train_dataset = BertDataset(tokenizer=tokenizer, items=train_items, stem_key=\"content\", label_key=\"labels\")\n", + " eval_dataset = BertDataset(tokenizer=tokenizer, items=val_items, stem_key=\"content\", label_key=\"labels\")\n", "\n", " training_args = TrainingArguments(\n", " output_dir=output_dir,\n", @@ -272,14 +201,13 @@ " gradient_accumulation_steps=gradient_accumulation_steps,\n", " learning_rate=5e-5,\n", " )\n", - "\n", + " data_collator = DataCollatorWithPadding(tokenizer.bert_tokenizer)\n", " trainer = MyTrainer(\n", " model=model,\n", " args=training_args,\n", - " data_collator=train_dataset.collate_fn,\n", + " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics\n", " )\n", "\n", " trainer.train() #训练模型\n", @@ -318,54 +246,6 @@ "### 测试" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用于测试的网络结构" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DifficultyPredictionEval(BaseModel): \n", - " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", - " super(DifficultyPredictionEval, self).__init__()\n", - " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", - " hidden_size = self.bert.config.hidden_size\n", - " self.classifier_dropout = classifier_dropout\n", - " self.dropout = nn.Dropout(classifier_dropout)\n", - " self.classifier = nn.Linear(hidden_size, 1)\n", - " self.sigmoid = nn.Sigmoid()\n", - "\n", - " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", - " self.config['architecture'] = 'BertForDifficultyPrediction'\n", - " self.config = PretrainedConfig.from_dict(self.config)\n", - "\n", - " def forward(self,\n", - " input_ids=None,\n", - " attention_mask=None,\n", - " token_type_ids=None,\n", - " ):\n", - "\n", - " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", - " logits = self.sigmoid(self.classifier(item_embed))\n", - " return logits\n", - " \n", - " @classmethod\n", - " def from_config(cls, config_path, **kwargs):\n", - " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", - " model_config = json.load(rf)\n", - " model_config.update(kwargs)\n", - " return cls(\n", - " 
pretrained_model_dir=model_config['pretrained_model_dir'],\n", - " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", - " )" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -399,7 +279,7 @@ " items=val_items,\n", " tokenizer=tokenizer,\n", " )\n", - "model = DifficultyPredictionEval.from_pretrained(checkpoint_dir)" + "model = BertForDifficultyPrediction.from_pretrained(checkpoint_dir)" ] }, { @@ -415,7 +295,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_eval_results(pres, golds):\n", + "def compute_metrics(pres, golds):\n", " logits = torch.as_tensor(pres)\n", " labels = torch.as_tensor(golds)\n", " ret = {\n", @@ -428,18 +308,29 @@ " }\n", " return ret\n", "\n", + "def testdata_metrics(val_gap, diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " idx = np.where(diff>0)[0]\n", + " ndcg = []\n", + " for s, e in val_gap:\n", + " _diff, _pred = diff[s:e], pred[s:e]\n", + " if _diff[0]==-1:\n", + " _diff = [i+1 for i in range(len(_diff))]\n", + " ndcg.append([ndcg_score([_diff], [_pred]), ndcg_score([_diff], [_pred], k=10), ndcg_score([_diff], [_pred], k=20), ndcg_score([_diff], [_pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg\n", + "\n", "model.eval()\n", "pred_list = []\n", "label_list = []\n", "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", " input_data, eval_batch_labels = eval_batch\n", - " output_logits = model(**input_data)\n", - " pred_list.append(output_logits.squeeze(0).tolist()[0])\n", + " _, output_logits = model(**input_data)\n", + " pred_list.append(output_logits.tolist()[0])\n", " label_list.append(eval_batch_labels.tolist()[0])\n", "\n", - "results = get_eval_results(pred_list, label_list)\n", - "print(f\"Test results: {results}\")\n", - "\n" + "results = compute_metrics(pred_list, label_list)\n", + "print(f\"Test results: {results}\")\n" ] } ], diff --git a/examples/downstream/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction.ipynb index 0b1c4f18..61afaa19 100644 --- a/examples/downstream/discrimination_prediction.ipynb +++ b/examples/downstream/discrimination_prediction.ipynb @@ -10,26 +10,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-06-28 03:30:52.333651: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-06-28 03:30:52.381526: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-06-28 03:30:54.202449: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" - ] - } - ], + "outputs": [], "source": [ "import torch\n", "from torch import nn\n", "import numpy as np\n", - "from transformers.modeling_outputs import ModelOutput\n", - "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig\n", + "from transformers import BertModel, TrainingArguments, Trainer, PretrainedConfig, DataCollatorWithPadding\n", "import torch.nn.functional as F\n", "from sklearn.metrics import ndcg_score, mean_squared_error\n", "from torchmetrics import MeanAbsoluteError, PearsonCorrCoef, SpearmanCorrCoef\n", @@ -37,8 +25,9 @@ "import tqdm\n", "from EduNLP.Pretrain import BertTokenizer \n", "from EduNLP.ModelZoo.base_model import BaseModel\n", + "from EduNLP.Pretrain import EduDataset\n", "import json\n", - "from utils import Dataset_bert, pre_disc\n", + "from utils import pre_disc\n", "\n", "ROOT = os.path.dirname(os.path.dirname(__file__))\n", "DATA_DIR = os.path.join(ROOT, \"data\")\n", @@ -47,18 +36,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/data/shangzi/anaconda3/envs/tgen/lib/python3.8/site-packages/torchmetrics/utilities/prints.py:36: UserWarning: Metric `SpearmanCorrcoef` will save all targets and predictions in the buffer. 
For large datasets, this may lead to large memory footprint.\n", - " warnings.warn(*args, **kwargs)\n" - ] - } - ], + "outputs": [], "source": [ "MAE = MeanAbsoluteError()\n", "PCC = PearsonCorrCoef()\n", @@ -70,7 +50,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 加载数据,定义预训练模型路径" + "### 加载数据,定义路径" ] }, { @@ -79,9 +59,9 @@ "metadata": {}, "outputs": [], "source": [ - "output_dir = os.path.join(ROOT, \"output/discrimination\") #设置模型保存路径\n", - "pretrained_model_dir = \"../data/bert_math_768\" #预训练的bert路径,也可以更换为其他模型的路径,如disenqnet, roberta等\n", - "checkpoint_dir = \"path/to/difficulty_prediction_checkpoint\"\n", + "output_dir = \"path/to/output_dir\" #设置模型保存路径\n", + "pretrained_model_dir = os.path.join(DATA_DIR, \"bert_math_768\") #以预训练的bert路径为例,也可以更换为其他模型的路径,如disenqnet, roberta等\n", + "checkpoint_dir = \"path/to/discrimination_prediction_checkpoint\"\n", "train_items = pre_disc(os.path.join(DATA_DIR, \"train\", \"ctt_train.csv\")) #加载训练集\n", "val_items = pre_disc(os.path.join(DATA_DIR, \"test\", \"ctt_test.csv\")) #加载测试集" ] @@ -107,16 +87,10 @@ "metadata": {}, "outputs": [], "source": [ - "class DiscriminationPredictionOutput(ModelOutput):\n", - " loss: torch.FloatTensor = None\n", - " logits: torch.FloatTensor = None\n", - " labels: torch.FloatTensor = None\n", - "\n", - "class BertForDiscriminationPrediction(BaseModel):\n", - " \n", - " def __init__(self, pretrained_model_dir=None, config=None, classifier_dropout=0.5):\n", + "class BertForDiscriminationPrediction(BaseModel): \n", + " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", " super(BertForDiscriminationPrediction, self).__init__()\n", - " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True) \n", + " self.bert = BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", " hidden_size = self.bert.config.hidden_size\n", " self.classifier_dropout = classifier_dropout\n", " self.dropout = nn.Dropout(classifier_dropout)\n", @@ -131,21 +105,16 @@ " input_ids=None,\n", " attention_mask=None,\n", " token_type_ids=None,\n", - " content=None,\n", " labels=None,\n", " ):\n", - " \n", - " input_ids = content['input_ids']\n", - " attention_mask = content['attention_mask']\n", - " token_type_ids = content['token_type_ids'] \n", + " \n", " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", - " logits = self.sigmoid(self.classifier(item_embed))\n", - " loss = F.mse_loss(logits.squeeze(0), labels)\n", - " return DiscriminationPredictionOutput(\n", - " loss = loss,\n", - " logits = logits,\n", - " labels = labels\n", - " )\n", + "\n", + " logits = self.sigmoid(self.classifier(item_embed)).squeeze(0)\n", + " loss = None\n", + " if labels is not None:\n", + " loss = F.mse_loss(logits, labels) if labels is not None else None\n", + " return loss, logits\n", " \n", " @classmethod\n", " def from_config(cls, config_path, *args, **kwargs):\n", @@ -155,45 +124,7 @@ " return cls(\n", " pretrained_model_dir=model_config['pretrained_model_dir'],\n", " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", - " )\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 定义评价指标" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_metrics(pred):\n", - " logits = torch.as_tensor(pred.predictions[0]).squeeze(0)\n", - " logits = 
logits.view([logits.size()[0]],-1)\n", - " labels = torch.as_tensor(pred.label_ids)\n", - " pres = logits.numpy().tolist()\n", - " golds = labels.numpy().tolist()\n", - " ret = {\n", - " \"mae\": MAE(logits, labels),\n", - " \"mse\": mean_squared_error(golds, pres),\n", - " \"rmse\": np.sqrt(mean_squared_error(golds, pres)),\n", - " \"pcc\": PCC(logits, labels),\n", - " \"scc\": SCC(logits, labels),\n", - " 'ndcg': testdata_metrics(golds, pres).tolist(),\n", - " }\n", - " return ret\n", - "\n", - "def testdata_metrics(diff, pred):\n", - " diff, pred = np.array(diff), np.array(pred)\n", - " ndcg = []\n", - " ndcg.append([ndcg_score([diff], [pred]), ndcg_score([diff], [pred], k=10), ndcg_score([diff], [pred], k=20), ndcg_score([diff], [pred], k=30)])\n", - " ndcg = np.mean(ndcg, axis=0)\n", - " return ndcg" + " )" ] }, { @@ -201,7 +132,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 定义训练和测试相关参数" + "#### 定义训练相关参数" ] }, { @@ -210,6 +141,9 @@ "metadata": {}, "outputs": [], "source": [ + "class BertDataset(EduDataset):\n", + " pass\n", + "\n", "class MyTrainer(Trainer):\n", " pass\n", "\n", @@ -220,9 +154,9 @@ " val_items=None,\n", " train_params=None):\n", " tokenizer = BertTokenizer.from_pretrained(pretrained_model_dir)\n", - " model = DiscriminationPredictionOutput(pretrained_model_dir=pretrained_model_dir) \n", + " model = BertForDiscriminationPrediction(pretrained_model_dir=pretrained_model_dir)\n", " model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))\n", - " \n", + " # training parameters\n", " if train_params is not None:\n", " epochs = train_params['epochs'] if 'epochs' in train_params else 1\n", " batch_size = train_params['batch_size'] if 'batch_size' in train_params else 64\n", @@ -242,8 +176,9 @@ " gradient_accumulation_steps = 1\n", " logging_dir = f\"{ROOT}/log\"\n", "\n", - " train_dataset = Dataset_bert(train_items, tokenizer)\n", - " eval_dataset = Dataset_bert(val_items, tokenizer)\n", + "\n", + " train_dataset = BertDataset(tokenizer=tokenizer, items=train_items, stem_key=\"content\", label_key=\"labels\")\n", + " eval_dataset = BertDataset(tokenizer=tokenizer, items=val_items, stem_key=\"content\", label_key=\"labels\")\n", "\n", " training_args = TrainingArguments(\n", " output_dir=output_dir,\n", @@ -252,7 +187,7 @@ " num_train_epochs=epochs,\n", " per_device_train_batch_size=batch_size,\n", " per_device_eval_batch_size=batch_size,\n", - " evaluation_strategy = \"steps\", # epoch\n", + " evaluation_strategy = \"steps\", \n", " eval_steps=logging_steps*5,\n", " \n", " save_steps=save_steps,\n", @@ -264,14 +199,13 @@ " gradient_accumulation_steps=gradient_accumulation_steps,\n", " learning_rate=5e-5,\n", " )\n", - "\n", + " data_collator = DataCollatorWithPadding(tokenizer.bert_tokenizer)\n", " trainer = MyTrainer(\n", " model=model,\n", " args=training_args,\n", - " data_collator=train_dataset.collate_fn,\n", + " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics,\n", " )\n", "\n", " trainer.train() #训练模型\n", @@ -310,54 +244,6 @@ "### 测试" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用于测试的网络结构" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DiscriminationPredictionEval(BaseModel): \n", - " def __init__(self, pretrained_model_dir=None, classifier_dropout=0.5):\n", - " super(DiscriminationPredictionEval, self).__init__()\n", - " self.bert = 
BertModel.from_pretrained(pretrained_model_dir, ignore_mismatched_sizes=True)\n", - " hidden_size = self.bert.config.hidden_size\n", - " self.classifier_dropout = classifier_dropout\n", - " self.dropout = nn.Dropout(classifier_dropout)\n", - " self.classifier = nn.Linear(hidden_size, 1)\n", - " self.sigmoid = nn.Sigmoid()\n", - "\n", - " self.config = {k: v for k, v in locals().items() if k not in [\"self\", \"__class__\"]}\n", - " self.config['architecture'] = 'BertForDifficultyPrediction'\n", - " self.config = PretrainedConfig.from_dict(self.config)\n", - "\n", - " def forward(self,\n", - " input_ids=None,\n", - " attention_mask=None,\n", - " token_type_ids=None,\n", - " ):\n", - "\n", - " item_embed = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)['last_hidden_state'][:, 0, :]\n", - " logits = self.sigmoid(self.classifier(item_embed))\n", - " return logits\n", - " \n", - " @classmethod\n", - " def from_config(cls, config_path, **kwargs):\n", - " with open(config_path, \"r\", encoding=\"utf-8\") as rf:\n", - " model_config = json.load(rf)\n", - " model_config.update(kwargs)\n", - " return cls(\n", - " pretrained_model_dir=model_config['pretrained_model_dir'],\n", - " classifier_dropout=model_config.get(\"classifier_dropout\", 0.5), \n", - " )" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -391,7 +277,7 @@ " items=val_items,\n", " tokenizer=tokenizer,\n", " )\n", - "model = DiscriminationPredictionEval.from_pretrained(checkpoint_dir)" + "model = BertForDiscriminationPrediction.from_pretrained(checkpoint_dir)" ] }, { @@ -407,7 +293,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_eval_results(pres, golds):\n", + "def compute_metrics(pres, golds):\n", " logits = torch.as_tensor(pres)\n", " labels = torch.as_tensor(golds)\n", " ret = {\n", @@ -420,16 +306,23 @@ " }\n", " return ret\n", "\n", + "def testdata_metrics(diff, pred):\n", + " diff, pred = np.array(diff), np.array(pred)\n", + " ndcg = []\n", + " ndcg.append([ndcg_score([diff], [pred]), ndcg_score([diff], [pred], k=10), ndcg_score([diff], [pred], k=20), ndcg_score([diff], [pred], k=30)])\n", + " ndcg = np.mean(ndcg, axis=0)\n", + " return ndcg\n", + "\n", "model.eval()\n", "pred_list = []\n", "label_list = []\n", "for i, eval_batch in tqdm.tqdm(enumerate(eval_dataloader)):\n", " input_data, eval_batch_labels = eval_batch\n", - " output_logits = model(**input_data)\n", - " pred_list.append(output_logits.squeeze(0).tolist()[0])\n", + " _, output_logits = model(**input_data)\n", + " pred_list.append(output_logits.tolist()[0])\n", " label_list.append(eval_batch_labels.tolist()[0])\n", "\n", - "results = get_eval_results(pred_list, label_list)\n", + "results = compute_metrics(pred_list, label_list)\n", "print(f\"Test results: {results}\")" ] } diff --git a/examples/downstream/utils.py b/examples/downstream/utils.py index d55cf48a..52746160 100644 --- a/examples/downstream/utils.py +++ b/examples/downstream/utils.py @@ -1,6 +1,3 @@ -from torch.utils.data import Dataset -import torch -from tqdm import tqdm import json import pandas as pd @@ -47,32 +44,6 @@ def get_val(val): start = end return test_data, test_gap -class Dataset_bert(Dataset): - def __init__(self, items, tokenizer): - self.tokenizer = tokenizer - self.items = items - self.preprocess() - def preprocess(self): - for item in tqdm(self.items): - content_vector = self.tokenizer(str(item['content']), return_tensors="pt", max_length=512, truncation=True) - #content_vector = str() - item["content"] = content_vector - - 
def __getitem__(self, index): - item = self.items[index] - return item - - def __len__(self): - return len(self.items) - - def collate_fn(self, batch_data): - first = batch_data[0] - batch = { - k: [item[k] for item in batch_data] for k in first.keys() - } - batch["content"] = self.tokenizer(str(batch["content"]), return_tensors='pt', max_length=512, truncation=True) - batch["labels"] = torch.as_tensor(batch["labels"]) - return batch From d3ca9628c9967d50b6659c954894a51507c8e598 Mon Sep 17 00:00:00 2001 From: nnnyt <793313994@qq.com> Date: Fri, 14 Jul 2023 16:06:50 +0800 Subject: [PATCH 41/58] [doc] reorganize downstream tasks --- .../difficulty_prediction.ipynb | 0 .../{ => difficulty_prediction}/utils.py | 21 ++++--------------- .../discrimination_prediction.ipynb | 0 .../discrimination_prediction/utils.py | 15 +++++++++++++ .../konwledge_prediction.ipynb} | 0 .../utils.py | 0 .../load_data.py | 0 .../model.py | 0 .../paper_segmentation.ipynb} | 0 .../samples/train/math/paper_1.txt | 0 .../trainer.py | 0 .../utils.py | 0 .../quality_evaluation.ipynb} | 0 .../{ => quality_evaluation}/train.py | 0 .../similarity_prediction.ipynb | 0 15 files changed, 19 insertions(+), 17 deletions(-) rename examples/downstream/{ => difficulty_prediction}/difficulty_prediction.ipynb (100%) rename examples/downstream/{ => difficulty_prediction}/utils.py (69%) rename examples/downstream/{ => discrimination_prediction}/discrimination_prediction.ipynb (100%) create mode 100644 examples/downstream/discrimination_prediction/utils.py rename examples/downstream/{knowledge/konw_pred.ipynb => knowledge_prediction/konwledge_prediction.ipynb} (100%) rename examples/downstream/{knowledge => knowledge_prediction}/utils.py (100%) rename examples/downstream/{paper_seg => paper_segmentation}/load_data.py (100%) rename examples/downstream/{paper_seg => paper_segmentation}/model.py (100%) rename examples/downstream/{paper_seg/paper_seg.ipynb => paper_segmentation/paper_segmentation.ipynb} (100%) rename examples/downstream/{paper_seg => paper_segmentation}/samples/train/math/paper_1.txt (100%) rename examples/downstream/{paper_seg => paper_segmentation}/trainer.py (100%) rename examples/downstream/{paper_seg => paper_segmentation}/utils.py (100%) rename examples/downstream/{quality_evaluate.ipynb => quality_evaluation/quality_evaluation.ipynb} (100%) rename examples/downstream/{ => quality_evaluation}/train.py (100%) rename examples/downstream/{ => similarity_prediction}/similarity_prediction.ipynb (100%) diff --git a/examples/downstream/difficulty_prediction.ipynb b/examples/downstream/difficulty_prediction/difficulty_prediction.ipynb similarity index 100% rename from examples/downstream/difficulty_prediction.ipynb rename to examples/downstream/difficulty_prediction/difficulty_prediction.ipynb diff --git a/examples/downstream/utils.py b/examples/downstream/difficulty_prediction/utils.py similarity index 69% rename from examples/downstream/utils.py rename to examples/downstream/difficulty_prediction/utils.py index 52746160..fe503a26 100644 --- a/examples/downstream/utils.py +++ b/examples/downstream/difficulty_prediction/utils.py @@ -1,6 +1,7 @@ import json import pandas as pd + def load_json(open_path): print("[load_json] start : {}".format(open_path)) with open(open_path, "r", encoding='utf-8') as f: @@ -8,17 +9,6 @@ def load_json(open_path): print("[load_json] num = {}, open_path = {}".format(len(load_q), open_path)) return load_q -def pre_disc(csv_path): - items = pd.read_csv(csv_path) - stem = items["stem"].tolist() - 
disc = items["disc"].tolist() - data = [] - for i in range(len(stem)): - dic = {} - dic["content"] = stem[i] - dic["labels"] = disc[i] - data.append(dic) - return data def get_train(train): train_data = [] @@ -29,6 +19,7 @@ def get_train(train): train_data.append(dic) return train_data + def get_val(val): test_data, test_gap = [], [] start, end = 0, 0 @@ -38,12 +29,8 @@ def get_val(val): dic = {} dic['content'] = item["stem"] dic['labels'] = item['diff'] - #dic["labels"] = dic.pop("difficulty") + # dic["labels"] = dic.pop("difficulty") test_data.append(dic) - test_gap.append([start, end]) + test_gap.append([start, end]) start = end return test_data, test_gap - - - - diff --git a/examples/downstream/discrimination_prediction.ipynb b/examples/downstream/discrimination_prediction/discrimination_prediction.ipynb similarity index 100% rename from examples/downstream/discrimination_prediction.ipynb rename to examples/downstream/discrimination_prediction/discrimination_prediction.ipynb diff --git a/examples/downstream/discrimination_prediction/utils.py b/examples/downstream/discrimination_prediction/utils.py new file mode 100644 index 00000000..f2bbe167 --- /dev/null +++ b/examples/downstream/discrimination_prediction/utils.py @@ -0,0 +1,15 @@ +import json +import pandas as pd + + +def pre_disc(csv_path): + items = pd.read_csv(csv_path) + stem = items["stem"].tolist() + disc = items["disc"].tolist() + data = [] + for i in range(len(stem)): + dic = {} + dic["content"] = stem[i] + dic["labels"] = disc[i] + data.append(dic) + return data diff --git a/examples/downstream/knowledge/konw_pred.ipynb b/examples/downstream/knowledge_prediction/konwledge_prediction.ipynb similarity index 100% rename from examples/downstream/knowledge/konw_pred.ipynb rename to examples/downstream/knowledge_prediction/konwledge_prediction.ipynb diff --git a/examples/downstream/knowledge/utils.py b/examples/downstream/knowledge_prediction/utils.py similarity index 100% rename from examples/downstream/knowledge/utils.py rename to examples/downstream/knowledge_prediction/utils.py diff --git a/examples/downstream/paper_seg/load_data.py b/examples/downstream/paper_segmentation/load_data.py similarity index 100% rename from examples/downstream/paper_seg/load_data.py rename to examples/downstream/paper_segmentation/load_data.py diff --git a/examples/downstream/paper_seg/model.py b/examples/downstream/paper_segmentation/model.py similarity index 100% rename from examples/downstream/paper_seg/model.py rename to examples/downstream/paper_segmentation/model.py diff --git a/examples/downstream/paper_seg/paper_seg.ipynb b/examples/downstream/paper_segmentation/paper_segmentation.ipynb similarity index 100% rename from examples/downstream/paper_seg/paper_seg.ipynb rename to examples/downstream/paper_segmentation/paper_segmentation.ipynb diff --git a/examples/downstream/paper_seg/samples/train/math/paper_1.txt b/examples/downstream/paper_segmentation/samples/train/math/paper_1.txt similarity index 100% rename from examples/downstream/paper_seg/samples/train/math/paper_1.txt rename to examples/downstream/paper_segmentation/samples/train/math/paper_1.txt diff --git a/examples/downstream/paper_seg/trainer.py b/examples/downstream/paper_segmentation/trainer.py similarity index 100% rename from examples/downstream/paper_seg/trainer.py rename to examples/downstream/paper_segmentation/trainer.py diff --git a/examples/downstream/paper_seg/utils.py b/examples/downstream/paper_segmentation/utils.py similarity index 100% rename from 
examples/downstream/paper_seg/utils.py rename to examples/downstream/paper_segmentation/utils.py diff --git a/examples/downstream/quality_evaluate.ipynb b/examples/downstream/quality_evaluation/quality_evaluation.ipynb similarity index 100% rename from examples/downstream/quality_evaluate.ipynb rename to examples/downstream/quality_evaluation/quality_evaluation.ipynb diff --git a/examples/downstream/train.py b/examples/downstream/quality_evaluation/train.py similarity index 100% rename from examples/downstream/train.py rename to examples/downstream/quality_evaluation/train.py diff --git a/examples/downstream/similarity_prediction.ipynb b/examples/downstream/similarity_prediction/similarity_prediction.ipynb similarity index 100% rename from examples/downstream/similarity_prediction.ipynb rename to examples/downstream/similarity_prediction/similarity_prediction.ipynb From 68918b319b63779fd8b197838a1fe790dc2b5928 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Fri, 14 Jul 2023 22:15:36 +0800 Subject: [PATCH 42/58] rewrite the Dataset --- EduNLP/Pretrain/quesnet_vec_new.py | 142 ++++++++++++----------------- 1 file changed, 60 insertions(+), 82 deletions(-) diff --git a/EduNLP/Pretrain/quesnet_vec_new.py b/EduNLP/Pretrain/quesnet_vec_new.py index 78082fea..3dec4dc1 100644 --- a/EduNLP/Pretrain/quesnet_vec_new.py +++ b/EduNLP/Pretrain/quesnet_vec_new.py @@ -20,6 +20,7 @@ import signal import os import json +import copy import linecache import numpy as np from PIL import Image @@ -307,104 +308,85 @@ def padding(self, idx, max_length, type='word'): def set_img_dir(self, path): self.img_dir = path - - -class Lines: - ''' - 原始数据行读取。这玩意删不掉,因为要先读取数据喂给 tokenizer, - 再用 tokenizer 过一遍数据得到 Dataset,就很难绷 - ''' - def __init__(self, filename, skip=0, preserve_newline=False): - self.filename = filename - with open(filename, "r", encoding="utf-8") as f: - self.length = len(f.readlines()) - skip - assert self.length > 0, f'{filename} is empty. Or file length is less than skip length.' 
- self.skip = skip - self.preserve_newline = preserve_newline - - def __len__(self): - return self.length - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __getitem__(self, item): - d = self.skip + 1 - if isinstance(item, int): - if item < self.length: - line = linecache.getline(self.filename, - item % len(self) + d) - if self.preserve_newline: - return json.loads(line) - else: - return json.loads(line.strip('\r\n')) - - elif isinstance(item, slice): - low = 0 if item.start is None else item.start - low = clip(low, -len(self), len(self) - 1) - if low < 0: - low += len(self) - high = len(self) if item.stop is None else item.stop - high = clip(high, -len(self), len(self)) - if high < 0: - high += len(self) - ls = [] - for i in range(low, high): - line = linecache.getline(self.filename, i + d) - if not self.preserve_newline: - line = line.strip('\r\n') - ls.append(json.loads(line)) - - return ls - - raise IndexError('index must be int or slice') - - class QuesnetDataset(Dataset): ''' - 对 Question Loader 的重构,删除了一些不太用得到的接口 - TODO: 这里需要重写逻辑,不然和原来的区别不大,还是想要把 Lines 给合并过来 + 对 Question Loader 与 Lines 的重构,删除了一些不太用得到的接口 ''' - def __init__(self, questions: Lines, - tokenizer: QuesNetTokenizer, - content_key=lambda x: x['ques_content'], - meta_key=lambda x: x['know_name'], - answer_key=lambda x: x['ques_answer'], - option_key=lambda x: x['ques_options'], - pipeline = None, - skip=0): - self.questions = questions + def __init__(self, filename: str, + tokenizer: QuesNetTokenizer = None, + img_dir: str = "", + meta: Optional[list] = None, + content_key=lambda x: x['ques_content'], + meta_key=lambda x: x['know_name'], + answer_key=lambda x: x['ques_answer'], + option_key=lambda x: x['ques_options'], + pipeline=None, + skip=0 + ): + self.filename = filename self.skip = skip - self.tokenizer = tokenizer + self.img_dir = img_dir self.content_key = content_key self.meta_key = meta_key self.answer_key = answer_key self.option_key = option_key self.pipeline = pipeline + + if tokenizer == None: + tokenizer = QuesNetTokenizer(meta=['know_name'], + img_dir=img_dir) + self.tokenizer = tokenizer + self.meta = meta if meta else tokenizer.meta + self.load_data_lines() + tokenizer.set_vocab(self.lines, key=lambda x: x['ques_content'], + trim_min_count=2, silent=False) + tokenizer.set_meta_vocab(self.lines, silent=False) + + def load_data_lines(self): + '''从 Json 文件中按行读取数据''' + # TODO: 暂时全部读取到内存中,不考虑分块 + data_dir = self.filename + skip = self.skip # 从第 skip + 1 行开始读 + self.lines = [] + self.length = 0 + + with open(data_dir, "r", encoding="utf-8") as f: + row = 0 + while True: + row += 1 + line = f.readline() + if row <= skip: + continue + if not line: + break + self.lines.append(json.loads(line.strip())) + + self.length = row - skip - 1 + assert self.length > 0, f'{data_dir} is empty. Or file length is less than skip length.' 
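# --- Editor's sketch (commentary, not part of the patch) ------------------
# The TODO above concedes that load_data_lines() keeps every row of the
# JSON-lines file in memory.  A minimal lazy alternative in the spirit of
# the removed `Lines` class resolves rows on demand via linecache; the name
# LazyJsonLines is illustrative only and not part of EduNLP.
import json
import linecache


class LazyJsonLines:
    """Index a JSON-lines file without holding its rows in memory."""

    def __init__(self, filename: str, skip: int = 0):
        self.filename = filename
        self.skip = skip
        with open(filename, "r", encoding="utf-8") as f:
            # One counting pass; the rows themselves stay on disk.
            self.length = sum(1 for _ in f) - skip
        assert self.length > 0, f"{filename} is empty or shorter than skip."

    def __len__(self):
        return self.length

    def __getitem__(self, index: int):
        # linecache is 1-based and caches file pages between calls.
        line = linecache.getline(self.filename, index + self.skip + 1)
        return json.loads(line.strip("\r\n"))
# ---------------------------------------------------------------------------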
+ def __len__(self): - return len(self.questions) + return len(self.lines) def __getitem__(self, index): if isinstance(index, int): - line = self.questions[index] + line = self.lines[index] # 进行后续处理 qid = line['ques_id'] - token = self.tokenizer(line, key=self.content_key, meta=self.meta_key) + token = self.tokenizer(line, key=self.content_key, meta=self.meta) content = token['seq_idx'] meta = token['meta_idx'] if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: answer_idx = ord(self.answer_key(line).upper()) - ord('A') options = self.option_key(line) - answer = self.tokenizer(options.pop(answer_idx), meta=self.meta_key)['seq_idx'] - false_options = [(self.tokenizer(option, meta=self.meta_key))['seq_idx'] for option in options] + answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] + false_options = [(self.tokenizer(option, meta=self.meta))['seq_idx'] for option in options] else: - answer = (self.tokenizer(self.answer_key(line), meta=self.meta_key))['seq_idx'] + answer = (self.tokenizer(self.answer_key(line), meta=self.meta))['seq_idx'] false_options = [[0], [0], [0]] qs = Question(id=qid, content=content, answer=answer, @@ -569,7 +551,7 @@ def optimizer(*models, **kwargs): return _cur_optim -def train_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): +def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): """ pretrain quesnet Parameters @@ -634,13 +616,9 @@ def train_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_em train_params = default_train_params # 参数设定完成 - tokenizer = QuesNetTokenizer(meta=['know_name'], img_dir=img_dir) - lines = Lines(path) - tokenizer.set_vocab(lines, key=lambda x: x['ques_content'], - trim_min_count=2, silent=False) - tokenizer.set_meta_vocab(lines, silent=False) + dataset = QuesnetDataset(path) + tokenizer = dataset.tokenizer tokenizer.save_pretrained(output_dir) - data = QuesnetDataset(lines, tokenizer) model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'], emb_size=train_params['emb_size']).to(device) @@ -652,7 +630,7 @@ def train_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_em w2v_corpus = [] img_corpus = [] meta_corpus = [] - for _, qs in enumerate(tqdm(data)): + for _, qs in enumerate(tqdm(dataset)): text_content = [] for c in qs.content: if isinstance(c, int): @@ -727,12 +705,12 @@ def train_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_em # 下面的代码还没有完整测试 # HLM and DOO training - data.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) + dataset.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) model.train() optim = optimizer(model, lr=train_params['lr']) n_batches = 0 for epoch in range(0, train_params['n_epochs']): - train_iter = PrefetchIter(data, train_params['batch_size']) + train_iter = PrefetchIter(dataset, train_params['batch_size']) bar = enumerate(tqdm(train_iter, initial=train_iter.pos), train_iter.pos) for i, batch in critical(bar): From bb5b3f0c25abb51c04c18cad0a3048cfbcd53a32 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Fri, 21 Jul 2023 12:49:16 +0800 Subject: [PATCH 43/58] Update quesnet_vec --- EduNLP/Pretrain/quesnet_vec.py | 454 ++++++++---------- EduNLP/Pretrain/quesnet_vec_new.py | 739 ----------------------------- 2 files changed, 201 
insertions(+), 992 deletions(-) delete mode 100644 EduNLP/Pretrain/quesnet_vec_new.py diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index c46def72..80649ca6 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -1,39 +1,31 @@ -"""Pre-process input text, tokenizing, building vocabs, and pre-train word -level vectors.""" +from .pretrian_utils import PretrainedEduTokenizer +from ..SIF.segment.segment import FigureSegment +from ..ModelZoo.quesnet import QuesNetForPreTraining, AE +from EduNLP import logger -import os -import logging -from pickle import NONE -import warnings -import numpy as np -import torch from torch.utils.data import DataLoader, Dataset -import signal -import threading -from tqdm import tqdm -from functools import partial -from collections import namedtuple -from copy import copy -import json -import math -import queue -import random -from PIL import Image from torchvision.transforms.functional import to_grayscale from torchvision.transforms.functional import to_tensor from gensim.models import Word2Vec -from ..SIF.segment.segment import FigureSegment -from ..SIF.segment import seg -from ..SIF.tokenization import tokenize -from ..SIF import EDU_SPYMBOLS -from ..ModelZoo.quesnet import QuesNetForPreTraining, AE -from .pretrian_utils import PretrainedEduTokenizer -from EduNLP import logger +import torch + +import warnings +import queue +import random +import math +import threading +import logging +import signal +import os +import json +import copy import linecache +import numpy as np +from PIL import Image from typing import List, Union, Optional - -Question = namedtuple('Question', - ['id', 'content', 'answer', 'false_options', 'labels']) +from collections import namedtuple +from functools import partial +from tqdm import tqdm def save_list(item2index, path): @@ -44,6 +36,14 @@ def save_list(item2index, path): return +def clip(v, low, high): + return max(low, min(v, high)) + +# Basic unit of Dataset +Question = namedtuple('Question', + ['id', 'content', 'answer', 'false_options', 'labels']) + + class QuesNetTokenizer(PretrainedEduTokenizer): """ Examples @@ -157,16 +157,12 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_idx = [] for _, w in enumerate(token_item): if isinstance(w, FigureSegment): - # image + # image try: - if self.img_dir != '': - # Warnings: you should make sure the figure exists! 
- im = Image.open(os.path.join(self.img_dir, f'{w.src[10:-1]}.png')) - else: - fig_id = f"{w.src[10:-1]}" - fig_index = item['ques_figure_ids'].index(fig_id) - fig_src = item['ques_figure_paths'][fig_index] - im = Image.open(fig_src) + fig_id = f"{w.src[10:-1]}" + fig_index = item['ques_figure_ids'].index(fig_id) + fig_src = item['ques_figure_paths'][fig_index] + im = Image.open(fig_src) im = im.resize((56, 56)) token_idx.append(to_grayscale(im)) except Exception: @@ -308,168 +304,124 @@ def padding(self, idx, max_length, type='word'): def set_img_dir(self, path): self.img_dir = path - - -def clip(v, low, high): - if v < low: - v = low - if v > high: - v = high - return v - - -class Lines: - def __init__(self, filename, skip=0, preserve_newline=False): + + +class QuesnetDataset(Dataset): + ''' + Quesnet-specific datasets + ''' + def __init__(self, filename: str, + tokenizer: QuesNetTokenizer = None, + img_dir: str = "", + meta: Optional[list] = None, + content_key=lambda x: x['ques_content'], + meta_key=lambda x: x['know_name'], + answer_key=lambda x: x['ques_answer'], + option_key=lambda x: x['ques_options'], + pipeline=None, + skip=0 + ): self.filename = filename - with open(filename, "r", encoding="utf-8") as f: - self.length = len(f.readlines()) - skip - assert self.length > 0, f'{filename} is empty. Or file length is less than skip length.' self.skip = skip - self.preserve_newline = preserve_newline - - def __len__(self): - return self.length - - def __iter__(self): - for i in range(len(self)): - yield self[i] - - def __getitem__(self, item): - d = self.skip + 1 - if isinstance(item, int): - if item < self.length: - line = linecache.getline(self.filename, - item % len(self) + d) - if self.preserve_newline: - return json.loads(line) - else: - return json.loads(line.strip('\r\n')) - - elif isinstance(item, slice): - low = 0 if item.start is None else item.start - low = clip(low, -len(self), len(self) - 1) - if low < 0: - low += len(self) - high = len(self) if item.stop is None else item.stop - high = clip(high, -len(self), len(self)) - if high < 0: - high += len(self) - ls = [] - for i in range(low, high): - line = linecache.getline(self.filename, i + d) - if not self.preserve_newline: - line = line.strip('\r\n') - ls.append(json.loads(line)) - - return ls - - raise IndexError('index must be int or slice') - - -class QuestionLoader: - def __init__(self, ques: Lines, tokenizer: QuesNetTokenizer, - pipeline=None, range=None, meta: Optional[list] = None, - content_key=lambda x: x['ques_content'], - meta_key=lambda x: x['know_name'], - answer_key=lambda x: x['ques_answer'], - option_key=lambda x: x['ques_options'], - skip=0 - ): - """ Read question file as data list. Same behavior on same file. 
- - Parameters - ---------- - ques_file : str - path of question file - tokenizer : QuesNetTokenizer - pipeline : _type_, optional - _description_, by default None - range : _type_, optional - _description_, by default None - content_key : function, optional - by default lambda x:x['ques_content'] - meta_key : function, optional - by default lambda x:x['know_name'] - answer_key: function, optional - by default lambda x:x['ques_answer'] - option_key: function, optional - by default lambda x:x['ques_options'] - skip: int, optional - skip the first several lines, by default 0 - """ - self.range = None - self.ques = ques - self.range = range or slice(0, len(self), skip) - self.img_dir = tokenizer.img_dir - self.labels = [] - self.stoi = tokenizer.stoi - self.tokenizer = tokenizer - + self.img_dir = img_dir self.content_key = content_key - self.meta = meta if meta else tokenizer.meta self.meta_key = meta_key self.answer_key = answer_key self.option_key = option_key - self.pipeline = pipeline + + if tokenizer == None: + tokenizer = QuesNetTokenizer(meta=['know_name'], + img_dir=img_dir) + self.tokenizer = tokenizer + self.meta = meta if meta else tokenizer.meta + self.load_data_lines() + tokenizer.set_vocab(self.lines, key=lambda x: x['ques_content'], + trim_min_count=2, silent=False) + tokenizer.set_meta_vocab(self.lines, silent=False) + + + def load_data_lines(self): + '''Read data by row from a JSON file + + Important: the data file is loaded during initialization. + ''' + + # TODO: All data is read into memory without chunking. + # This may lead to low efficiency. + data_dir = self.filename + skip = self.skip # Read from Line skip + 1. + self.lines = [] + self.length = 0 + + with open(data_dir, "r", encoding="utf-8") as f: + row = 0 + while True: + row += 1 + line = f.readline() + if row <= skip: + continue + if not line: + break + self.lines.append(json.loads(line.strip())) + + self.length = row - skip - 1 + assert self.length > 0, f'{data_dir} is empty. Or file length is less than skip length.' 
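+        # NOTE: `row` overshoots by one on the final, empty readline() that
+        # signals EOF, so `row - skip - 1` is exactly the number of records
+        # read into `self.lines` after dropping the first `skip` lines.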
+ + + def __len__(self): + return len(self.lines) - def split_(self, split_ratio): - first_size = int(len(self) * (1 - split_ratio)) - other = copy(self) - self.range = slice(0, first_size, 1) - other.range = slice(first_size, len(other), 1) - return other - def __len__(self): - return len(self.ques) if self.range is None \ - else self.range.stop - self.range.start + def __getitem__(self, index): + if isinstance(index, int): + line = self.lines[index] - def __getitem__(self, x): - if isinstance(x, int): - x += self.range.start - item = slice(x, x + 1, 1) - else: - item = slice(x.start + self.range.start, - x.stop + self.range.start, 1) - qs = [] - if item.start > len(self): - raise IndexError - for line in self.ques[item]: - q = line - qid = q['ques_id'] - token = self.tokenizer(q, key=self.content_key, meta=self.meta) + qid = line['ques_id'] + token = self.tokenizer(line, key=self.content_key, meta=self.meta) content = token['seq_idx'] meta = token['meta_idx'] - if self.answer_key(q).isalpha() and len(self.answer_key(q)) == 1 and ord(self.answer_key(q)) < 128 and len( - self.option_key(q)) > 0: - answer_idx = ord(self.answer_key(q).upper()) - ord('A') - options = self.option_key(q) - answer = self.tokenizer(options.pop(answer_idx), meta=self.meta) - answer = answer['seq_idx'] + if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: + answer_idx = ord(self.answer_key(line).upper()) - ord('A') + options = self.option_key(line) + answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] false_options = [(self.tokenizer(option, meta=self.meta))['seq_idx'] for option in options] - qs.append(Question(qid, content, answer, false_options, meta)) else: - answer = (self.tokenizer(self.answer_key(q), meta=self.meta))['seq_idx'] - qs.append(Question(qid, content, answer, [[0], [0], [0]], meta)) - - if callable(self.pipeline): - qs = self.pipeline(qs) - if isinstance(x, int): - return qs[0] - else: + answer = (self.tokenizer(self.answer_key(line), meta=self.meta))['seq_idx'] + false_options = [[0], [0], [0]] + + qs = Question(id=qid, content=content, answer=answer, + false_options=false_options, labels=meta) + if callable(self.pipeline): + qs = self.pipeline(qs) + return qs + + + elif isinstance(index, slice): + results = [] + for i in range(*index.indices(len(self))): + results.append(self[i]) + return results + + else: + raise TypeError('Invalid argument type. 
Index type should be int or slice.') + +class EmbeddingDataset(Dataset): + def __init__(self, data, data_type='image'): + self.data = data + self.data_type = data_type + assert self.data_type in ['image', 'meta'] + def __len__(self): + return len(self.data) -def optimizer(*models, **kwargs): - _cur_optim = [m.optim_cls(m.parameters(), **kwargs) - if hasattr(m, 'optim_cls') - else torch.optim.Adam(m.parameters(), **kwargs) - for m in models] - if len(_cur_optim) == 1: - return _cur_optim[0] - else: - return _cur_optim - + def __getitem__(self, idx): + if self.data_type == 'image': + return to_tensor(self.data[idx]) + elif self.data_type == 'meta': + return self.data[idx] + class PrefetchIter: """Iterator on data and labels, with states for save and restore.""" @@ -532,32 +484,10 @@ def produce(self): except Exception as e: self.queue.put(e) return - - -class EmbeddingDataset(Dataset): - def __init__(self, data, data_type='image'): - self.data = data - self.data_type = data_type - assert self.data_type in ['image', 'meta'] - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - if self.data_type == 'image': - return to_tensor(self.data[idx]) - elif self.data_type == 'meta': - return self.data[idx] - - -def pretrain_iter(ques, batch_size): - _cur_iter = PrefetchIter(ques, batch_size=batch_size) - return _cur_iter - + sigint_handler = signal.getsignal(signal.SIGINT) - def critical(f): it = iter(f) signal_received = () @@ -576,7 +506,7 @@ def handler(sig, frame): sigint_handler(*signal_received) except StopIteration: break - + def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3, log_step: int = 1, epochs: int = 3, batch_size: int = 4, device=torch.device('cpu')): @@ -595,8 +525,18 @@ def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3 logger.info(f"[Epoch{i}][Batch{batch}]Training {train_type} Embedding layer, loss:{loss}") return ae - -def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_params=None): +def optimizer(*models, **kwargs): + _cur_optim = [m.optim_cls(m.parameters(), **kwargs) + if hasattr(m, 'optim_cls') + else torch.optim.Adam(m.parameters(), **kwargs) + for m in models] + if len(_cur_optim) == 1: + return _cur_optim[0] + else: + return _cur_optim + + +def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): """ pretrain quesnet Parameters @@ -638,9 +578,11 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para ... "know_name": "['代数', '集合', '集合的相等']" ... 
}] >>> tokenizer.set_vocab(items, key=lambda x: x['ques_content'], trim_min_count=1, silent=True) - >>> tokenizer.set_meta_vocab(items, silent=True) >>> pretrain_quesnet('./data/standard_luna_data.json', './testQuesNet', tokenizer) # doctest: +SKIP """ + os.makedirs(output_dir, exist_ok=True) + device = torch.device(train_params['device']) + default_train_params = { # train params "n_epochs": 1, @@ -657,21 +599,13 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para if train_params is not None: default_train_params.update(train_params) train_params = default_train_params - os.makedirs(output_dir, exist_ok=True) - device = torch.device(train_params['device']) - # set tokenizer - tokenizer = QuesNetTokenizer(meta=['know_name'], - img_dir=img_dir) - items = Lines(path, skip=1) - tokenizer.set_vocab(items, key=lambda x: x['ques_content'], - trim_min_count=2, silent=False) - tokenizer.set_meta_vocab(items, silent=False) + dataset = QuesnetDataset(path) + tokenizer = dataset.tokenizer tokenizer.save_pretrained(output_dir) - - ques_dl = QuestionLoader(items, tokenizer) model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'], emb_size=train_params['emb_size']).to(device) + emb_dict = tokenizer.stoi['word'] emb_dict_rev = tokenizer.itos['word'] emb_size = train_params['emb_size'] @@ -679,7 +613,7 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para w2v_corpus = [] img_corpus = [] meta_corpus = [] - for i, qs in enumerate(tqdm(ques_dl)): + for _, qs in enumerate(tqdm(dataset)): text_content = [] for c in qs.content: if isinstance(c, int): @@ -699,50 +633,61 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para meta_corpus.append(meta_vector) # train word2vec for text embedding - gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, - vector_size=emb_size) - gensim_w2v.init_weights() - gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) - w2v_emb = gensim_w2v.syn1neg - emb_weights = [] - for key, item in emb_dict.items(): - w2v_index = gensim_w2v.wv.key_to_index[key] - emb_weights.append(w2v_emb[w2v_index]) - emb_weights = np.array(emb_weights) - model.quesnet.load_emb(emb_weights) + if pretrain_dir != None: + model.quesnet.load_emb(np.load(os.path.join(output_dir, 'w2v_embs.npy'))) + else: + gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, + vector_size=emb_size) + gensim_w2v.init_weights() + gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) + w2v_emb = gensim_w2v.syn1neg + emb_weights = [] + for key, item in emb_dict.items(): + w2v_index = gensim_w2v.wv.key_to_index[key] + emb_weights.append(w2v_emb[w2v_index]) + emb_weights = np.array(emb_weights) + model.quesnet.load_emb(emb_weights) + if save_embs: + np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights) logger.info('quesnet Word Embedding loaded') - if save_embs: - np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights) # train auto-encoder loss for image embedding - img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') - trained_ie = pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - model.quesnet.load_img(trained_ie) + if pretrain_dir != None: + 
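+        # Reuse the image auto-encoder weights written by an earlier run with
+        # `save_embs=True`; assumes `trained_ie.pt` exists under `pretrain_dir`.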
model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt'))) + else: + img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') + trained_ie = pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'], + log_step=train_params['log_steps'], batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], device=device) + if save_embs: + torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) + model.quesnet.load_img(trained_ie) logger.info('quesnet Image Embedding loaded') - if save_embs: - torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) + # train auto-encoder loss for meta embedding - meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') - trained_me = pretrain_embedding_layer(dataset=meta_dateset, ae=model.quesnet.me, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - model.quesnet.load_meta(trained_me) + if pretrain_dir != None: + model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt'))) + else: + meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') + trained_me = pretrain_embedding_layer(dataset=meta_dateset, ae=model.quesnet.me, lr=train_params['lr'], + log_step=train_params['log_steps'], batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], device=device) + if save_embs: + torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) + model.quesnet.load_meta(trained_me) logger.info('quesnet Meta Embedding loaded') - if save_embs: - torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) - + logger.info("quesnet Word, Image and Meta Embeddings training is done") - + # DONE for datasets + # HLM and DOO training - ques_dl.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) + dataset.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) model.train() optim = optimizer(model, lr=train_params['lr']) n_batches = 0 for epoch in range(0, train_params['n_epochs']): - train_iter = pretrain_iter(ques_dl, train_params['batch_size']) + train_iter = PrefetchIter(dataset, train_params['batch_size']) bar = enumerate(tqdm(train_iter, initial=train_iter.pos), train_iter.pos) for i, batch in critical(bar): @@ -766,3 +711,6 @@ def pretrain_quesnet(path, output_dir, img_dir=None, save_embs=False, train_para model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) + + + diff --git a/EduNLP/Pretrain/quesnet_vec_new.py b/EduNLP/Pretrain/quesnet_vec_new.py deleted file mode 100644 index 3dec4dc1..00000000 --- a/EduNLP/Pretrain/quesnet_vec_new.py +++ /dev/null @@ -1,739 +0,0 @@ -# !!! 
本项目正在重构中 - -from .pretrian_utils import PretrainedEduTokenizer -from ..SIF.segment.segment import FigureSegment -from ..ModelZoo.quesnet import QuesNetForPreTraining, AE -from EduNLP import logger - -from torch.utils.data import DataLoader, Dataset -from torchvision.transforms.functional import to_grayscale -from torchvision.transforms.functional import to_tensor -from gensim.models import Word2Vec -import torch - -import warnings -import queue -import random -import math -import threading -import logging -import signal -import os -import json -import copy -import linecache -import numpy as np -from PIL import Image -from typing import List, Union, Optional -from collections import namedtuple -from functools import partial -from tqdm import tqdm - - -def save_list(item2index, path): - item2index = sorted(item2index.items(), key=lambda kv: kv[1]) - items = [item for item, _ in item2index] - with open(path, "wt", encoding="utf-8") as file: - file.write('\n'.join(items)) - return - - -def clip(v, low, high): - # 将 v 限制在 low~high 之间 - return max(low, min(v, high)) - -# 数据集的基本单元 -Question = namedtuple('Question', - ['id', 'content', 'answer', 'false_options', 'labels']) - - -class QuesNetTokenizer(PretrainedEduTokenizer): - """ - Examples - -------- - >>> tokenizer = QuesNetTokenizer(meta=['knowledge']) - >>> test_items = [{"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B, $\\FigureID{test_id}$", - ... "knowledge": "['*', '-', '/']"}, {"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B", - ... "knowledge": "['*', '-', '/']"}] - >>> tokenizer.set_vocab(test_items, - ... trim_min_count=1, key=lambda x: x["ques_content"], silent=True) - >>> tokenizer.set_meta_vocab(test_items, silent=True) - >>> token_items = [tokenizer(i, key=lambda x: x["ques_content"]) for i in test_items] - >>> print(token_items[0].keys()) - dict_keys(['seq_idx', 'meta_idx']) - >>> token_items = tokenizer(test_items, key=lambda x: x["ques_content"]) - >>> print(len(token_items["seq_idx"])) - 2 - """ - - def __init__(self, vocab_path=None, meta_vocab_dir=None, img_dir: str = None, - max_length=250, tokenize_method="custom", symbol="mas", add_specials: list = None, - meta: List[str] = None, img_token='', unk_token="", pad_token="", **kwargs): - """ - Parameters - ---------- - img_dir : str, optional - path of images, by default None - vocab_path : str, optional - path of vacab file, by default None - max_length : int, optional - by default 250 - meta : list, optional - the name of meta (side information), by default 'knowledge' - img_token : str, optional - by default '' - unk_token : str, optional - by default "" - pad_token : str, optional - by default "" - """ - if add_specials is None: - add_specials = [img_token] - else: - add_specials = [img_token] + add_specials - self.tokenization_params = { - "formula_params": { - "method": "linear", - "skip_figure_formula": True - } - } - kwargs.update(self.tokenization_params) - super().__init__(vocab_path=vocab_path, max_length=max_length, tokenize_method=tokenize_method, - add_specials=add_specials, unk_token=unk_token, pad_token=pad_token, symbol=symbol, **kwargs) - if meta is None: - meta = ['know_name'] - self.img_dir = img_dir - self.img_token = img_token - self.unk_token = unk_token - self.pad_token = pad_token - self.meta = meta - self.meta_vocab_dir = meta_vocab_dir - - self.stoi = dict() - self.itos = dict() - self.stoi['word'] = self.vocab.token_to_idx - self.itos['word'] = self.vocab.idx_to_token - if meta_vocab_dir is not None: - 
self.load_meta_vocab(meta_vocab_dir=meta_vocab_dir) - self.config = { - k: v for k, v in locals().items() if k not in [ - "self", "__class__", "vocab_path", 'img_dir', 'meta_vocab_dir'] - } - - def __call__(self, item: Union[str, dict, list], key=lambda x: x, - meta: Optional[list] = None, - padding=False, return_text=False, *args, **kwargs): - """ - item: str or dict - the question item - key: function - determine how to get the text of each item - padding: bool - whether to pad the seq_idx - return_text: bool - whether to return text tokens - """ - if isinstance(item, list): - ret = { - "seq_idx": [], - "meta_idx": [] - } - if return_text: - ret["seq_token"] = [] - ret["meta"] = [] - for i in item: - r = self._convert_to_ids(i, key, meta, padding, return_text, *args, **kwargs) - ret["seq_idx"].append(r["seq_idx"]) - ret["meta_idx"].append(r["meta_idx"]) - if return_text: - ret["seq_token"].append(r["seq_token"]) - ret["meta"].append(r["meta"]) - else: - ret = self._convert_to_ids(item, key, meta, padding, return_text, *args, **kwargs) - - return ret - - def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, - meta: Optional[list] = None, - padding=False, return_text=False, *args, **kwargs): - token_item = self.tokenize(item, key) - token_idx = [] - for _, w in enumerate(token_item): - if isinstance(w, FigureSegment): - # image - try: - fig_id = f"{w.src[10:-1]}" - fig_index = item['ques_figure_ids'].index(fig_id) - fig_src = item['ques_figure_paths'][fig_index] - im = Image.open(fig_src) - im = im.resize((56, 56)) - token_idx.append(to_grayscale(im)) - except Exception: - warnings.warn('Open image error!') - token_idx.append(self.stoi['word'][self.img_token]) - else: - # word - token_idx.append(self.stoi['word'].get(w) or self.stoi['word'][self.unk_token]) - - meta_idxs = {} - meta_items = {} - if meta is None: - meta = self.meta - for m in meta: - meta_idx = [] - if isinstance(item, dict) and m in item: - meta_item = item[m] - if isinstance(meta_item, str): - if meta_item.startswith('['): - # a list of labels ['+', '-', '/'] - meta_item = eval(meta_item) - elif isinstance(meta_item, list): - pass - else: - raise Exception("Side information must be a list!") - meta_items[m] = meta_item - for k in meta_item: - meta_idx.append(self.stoi[m].get(k) or self.stoi[m][self.unk_token]) - meta_idxs[m] = meta_idx - ret = { - "seq_idx": self.padding(token_idx, self.max_length) if padding else token_idx, - "meta_idx": meta_idxs - } - - if return_text: - ret["seq_token"] = token_item - ret["meta"] = meta_items - return ret - - def load_meta_vocab(self, meta_vocab_dir): - for m in self.meta: - try: - with open(os.path.join(meta_vocab_dir, f'meta_{m}.txt'), "rt", encoding="utf-8") as f: - meta = f.read().strip().split('\n') - self.stoi[m] = {word: index for index, word in enumerate(meta)} - self.itos[m] = {i: s for s, i in self.stoi[m].items()} - except Exception: - self.stoi[m] = None - self.itos[m] = None - - def set_meta_vocab(self, items: list, meta: List[str] = None, silent=True): - self.meta = meta if meta is not None else self.meta - for m in self.meta: - meta = set() - if m in items[0]: - for item in items: - meta_item = item[m] - if isinstance(meta_item, str): - if meta_item.startswith('['): - # a list of labels ['+', '-', '/'] - meta_item = eval(meta_item) - elif isinstance(meta_item, list): - pass - else: - raise Exception("Side information must be a list!") - meta = meta | set(meta_item) - meta = [self.unk_token] + sorted(meta) - if not silent: - print(f"save meta 
information {m}: {len(meta)}") - self.stoi[m] = {word: index for index, word in enumerate(meta)} - self.itos[m] = {i: s for s, i in self.stoi[m].items()} - - def set_vocab(self, items: list, key=lambda x: x, lower: bool = False, - trim_min_count: int = 1, do_tokenize: bool = True, silent=True): - """ - Parameters - ----------- - items: list - can be the list of str, or list of dict - key: function - determine how to get the text of each item - trim_min_count : int, optional - the lower bound number for adding a word into vocabulary, by default 1 - silent - """ - token_items = self.tokenize(items, key) if do_tokenize else [key(item) for item in items] - self.vocab.set_vocab(corpus_items=token_items, trim_min_count=trim_min_count, lower=lower, silent=silent) - self.stoi['word'] = self.vocab.token_to_idx - self.itos['word'] = self.vocab.idx_to_token - - @classmethod - def from_pretrained(cls, tokenizer_config_dir, img_dir=None, **kwargs): - """ - Parameters: - ----------- - tokenizer_config_dir: str - must contain tokenizer_config.json and vocab.txt and meta_{meta_name}.txt - img_dir: str - default None - the path of image directory - """ - tokenizer_config_path = os.path.join(tokenizer_config_dir, "tokenizer_config.json") - pretrained_vocab_path = os.path.join(tokenizer_config_dir, "vocab.txt") - with open(tokenizer_config_path, "r", encoding="utf-8") as rf: - tokenizer_config = json.load(rf) - tokenizer_config.update(kwargs) - return cls( - vocab_path=pretrained_vocab_path, - meta_vocab_dir=tokenizer_config_dir, - img_dir=img_dir, - max_length=tokenizer_config["max_length"], - tokenize_method=tokenizer_config["tokenize_method"], - meta=tokenizer_config["meta"], - img_token=tokenizer_config["img_token"], - unk_token=tokenizer_config["unk_token"], - pad_token=tokenizer_config["pad_token"]) - - def save_pretrained(self, tokenizer_config_dir): - """Save tokenizer into local files - - Parameters: - ----------- - tokenizer_config_dir: str - save tokenizer params in `/tokenizer_config.json` and save words in `vocab.txt` - and save metas in `meta_{meta_name}.txt` - """ - if not os.path.exists(tokenizer_config_dir): - os.makedirs(tokenizer_config_dir, exist_ok=True) - tokenizer_config_path = os.path.join(tokenizer_config_dir, "tokenizer_config.json") - self.vocab.save_vocab(os.path.join(tokenizer_config_dir, "vocab.txt")) - for m in self.meta: - save_list(self.stoi[m], os.path.join(tokenizer_config_dir, f'meta_{m}.txt')) - with open(tokenizer_config_path, "w", encoding="utf-8") as wf: - json.dump(self.config, wf, ensure_ascii=False, indent=2) - - def padding(self, idx, max_length, type='word'): - padding_idx = idx + [self.stoi[type][self.pad_token]] * (max_length - len(idx)) - return padding_idx - - def set_img_dir(self, path): - self.img_dir = path - - -class QuesnetDataset(Dataset): - ''' - 对 Question Loader 与 Lines 的重构,删除了一些不太用得到的接口 - ''' - def __init__(self, filename: str, - tokenizer: QuesNetTokenizer = None, - img_dir: str = "", - meta: Optional[list] = None, - content_key=lambda x: x['ques_content'], - meta_key=lambda x: x['know_name'], - answer_key=lambda x: x['ques_answer'], - option_key=lambda x: x['ques_options'], - pipeline=None, - skip=0 - ): - self.filename = filename - self.skip = skip - self.img_dir = img_dir - self.content_key = content_key - self.meta_key = meta_key - self.answer_key = answer_key - self.option_key = option_key - self.pipeline = pipeline - - if tokenizer == None: - tokenizer = QuesNetTokenizer(meta=['know_name'], - img_dir=img_dir) - self.tokenizer = tokenizer - 
self.meta = meta if meta else tokenizer.meta - self.load_data_lines() - tokenizer.set_vocab(self.lines, key=lambda x: x['ques_content'], - trim_min_count=2, silent=False) - tokenizer.set_meta_vocab(self.lines, silent=False) - - - def load_data_lines(self): - '''从 Json 文件中按行读取数据''' - # TODO: 暂时全部读取到内存中,不考虑分块 - data_dir = self.filename - skip = self.skip # 从第 skip + 1 行开始读 - self.lines = [] - self.length = 0 - - with open(data_dir, "r", encoding="utf-8") as f: - row = 0 - while True: - row += 1 - line = f.readline() - if row <= skip: - continue - if not line: - break - self.lines.append(json.loads(line.strip())) - - self.length = row - skip - 1 - assert self.length > 0, f'{data_dir} is empty. Or file length is less than skip length.' - - - def __len__(self): - return len(self.lines) - - - def __getitem__(self, index): - if isinstance(index, int): - line = self.lines[index] - - # 进行后续处理 - qid = line['ques_id'] - token = self.tokenizer(line, key=self.content_key, meta=self.meta) - content = token['seq_idx'] - meta = token['meta_idx'] - if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: - answer_idx = ord(self.answer_key(line).upper()) - ord('A') - options = self.option_key(line) - answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] - false_options = [(self.tokenizer(option, meta=self.meta))['seq_idx'] for option in options] - else: - answer = (self.tokenizer(self.answer_key(line), meta=self.meta))['seq_idx'] - false_options = [[0], [0], [0]] - - qs = Question(id=qid, content=content, answer=answer, - false_options=false_options, labels=meta) - if callable(self.pipeline): - qs = self.pipeline(qs) - - return qs - - - elif isinstance(index, slice): - results = [] - for i in range(*index.indices(len(self))): - results.append(self[i]) - return results - - else: - raise TypeError('Invalid argument type. 
Index type should be int or slice.') - - - - -class QuestionLoader(DataLoader): - def __init__(self, question, batch_size, shuffle=False, num_workers=0, pin_memory=False): - super(QuestionLoader, self).__init__( - dataset=question, - batch_size=batch_size, - shuffle=shuffle, - num_workers=num_workers, - pin_memory=pin_memory - ) - - - -class EmbeddingDataset(Dataset): - def __init__(self, data, data_type='image'): - self.data = data - self.data_type = data_type - assert self.data_type in ['image', 'meta'] - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - if self.data_type == 'image': - return to_tensor(self.data[idx]) - elif self.data_type == 'meta': - return self.data[idx] - - -class PrefetchIter: - """Iterator on data and labels, with states for save and restore.""" - # 这里还没有重构完 - - def __init__(self, data, *label, length=None, batch_size=1, shuffle=True): - self.data = data - self.label = label - self.batch_size = batch_size - self.queue = queue.Queue(maxsize=8) - self.length = length if length is not None else len(data) - - assert all(self.length == len(lab) for lab in label), \ - 'data and label must have same lengths' - - self.index = list(range(len(self))) - if shuffle: - random.shuffle(self.index) - self.thread = None - self.pos = 0 - - def __len__(self): - return math.ceil(self.length / self.batch_size) - - def __iter__(self): - return self - - def __next__(self): - if self.thread is None: - self.thread = threading.Thread(target=self.produce, daemon=True) - self.thread.start() - - if self.pos >= len(self.index): - raise StopIteration - - item = self.queue.get() - if isinstance(item, Exception): - raise item - else: - self.pos += 1 - return item - - def produce(self): - for i in range(self.pos, len(self.index)): - try: - index = self.index[i] - - bs = self.batch_size - - if callable(self.data): - data_batch = self.data(index * bs, (index + 1) * bs) - else: - data_batch = self.data[index * bs:(index + 1) * bs] - - label_batch = [label[index * bs:(index + 1) * bs] - for label in self.label] - if label_batch: - self.queue.put([data_batch] + label_batch) - else: - self.queue.put(data_batch) - except Exception as e: - self.queue.put(e) - return - - -sigint_handler = signal.getsignal(signal.SIGINT) - -def critical(f): - it = iter(f) - signal_received = () - - def handler(sig, frame): - nonlocal signal_received - signal_received = (sig, frame) - logging.debug('SIGINT received. 
Delaying KeyboardInterrupt.') - - while True: - try: - signal.signal(signal.SIGINT, handler) - yield next(it) - signal.signal(signal.SIGINT, sigint_handler) - if signal_received: - sigint_handler(*signal_received) - except StopIteration: - break - - -def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3, log_step: int = 1, epochs: int = 3, - batch_size: int = 4, device=torch.device('cpu')): - train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) - optim = torch.optim.Adam(ae.parameters(), lr=lr) - ae.train() - ae.to(device) - train_type = dataset.data_type - for i in range(epochs): - for batch, item in tqdm(enumerate(train_dataloader)): - loss = ae.loss(item.to(device)) - optim.zero_grad() - loss.backward() - optim.step() - if batch % log_step == 0: - logger.info(f"[Epoch{i}][Batch{batch}]Training {train_type} Embedding layer, loss:{loss}") - return ae - -def optimizer(*models, **kwargs): - _cur_optim = [m.optim_cls(m.parameters(), **kwargs) - if hasattr(m, 'optim_cls') - else torch.optim.Adam(m.parameters(), **kwargs) - for m in models] - if len(_cur_optim) == 1: - return _cur_optim[0] - else: - return _cur_optim - - -def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): - """ pretrain quesnet - - Parameters - ---------- - path : str - path of question file - output_dir : str - output path· - tokenizer : QuesNetTokenizer - quesnet tokenizer - save_embs : bool, optional - whether to save pretrained word/image/meta embeddings seperately - train_params : dict, optional - the training parameters and model parameters, by default None - - "n_epochs": int, default = 1 - train param, number of epochs - - "batch_size": int, default = 6 - train param, batch size - - "lr": float, default = 1e-3 - train param, learning rate - - "save_every": int, default = 0 - train param, save steps interval - - "log_steps": int, default = 10 - train param, log steps interval - - "device": str, default = 'cpu' - train param, 'cpu' or 'cuda' - - "max_steps": int, default = 0 - train param, stop training when reach max steps - - "emb_size": int, default = 256 - model param, the embedding size of word, figure, meta info - - "feat_size": int, default = 256 - model param, the size of question infer vector - - Examples - ---------- - >>> tokenizer = QuesNetTokenizer(meta=['know_name']) - >>> items = [{"ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$,$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$", - ... "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", - ... "know_name": "['代数', '集合', '集合的相等']" - ... 
}] - >>> tokenizer.set_vocab(items, key=lambda x: x['ques_content'], trim_min_count=1, silent=True) - >>> pretrain_quesnet('./data/standard_luna_data.json', './testQuesNet', tokenizer) # doctest: +SKIP - """ - os.makedirs(output_dir, exist_ok=True) - device = torch.device(train_params['device']) - - default_train_params = { - # train params - "n_epochs": 1, - "batch_size": 8, - "lr": 1e-3, - 'save_every': 0, - 'log_steps': 10, - 'device': 'cpu', - 'max_steps': 0, - # model params - 'emb_size': 256, - 'feat_size': 256, - } - if train_params is not None: - default_train_params.update(train_params) - train_params = default_train_params - # 参数设定完成 - - dataset = QuesnetDataset(path) - tokenizer = dataset.tokenizer - tokenizer.save_pretrained(output_dir) - model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'], - emb_size=train_params['emb_size']).to(device) - - # 开始处理数据集 - emb_dict = tokenizer.stoi['word'] - emb_dict_rev = tokenizer.itos['word'] - emb_size = train_params['emb_size'] - meta_size = model.quesnet.meta_size - w2v_corpus = [] - img_corpus = [] - meta_corpus = [] - for _, qs in enumerate(tqdm(dataset)): - text_content = [] - for c in qs.content: - if isinstance(c, int): - text_content.append(emb_dict_rev[c]) - else: - img_corpus.append(c) - for a in qs.answer: - if isinstance(a, int): - text_content.append(emb_dict_rev[a]) - else: - img_corpus.append(a) - w2v_corpus.append(text_content) - meta_vector = torch.zeros(meta_size, dtype=torch.float) - for m in qs.labels[model.quesnet.meta]: - meta_vector.add_(torch.nn.functional.one_hot(torch.tensor(m, dtype=torch.int64), - model.quesnet.meta_size).to(torch.float)) - meta_corpus.append(meta_vector) - - - # train word2vec for text embedding - if pretrain_dir != None: - model.quesnet.load_emb(np.load(os.path.join(output_dir, 'w2v_embs.npy'))) - else: - gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, - vector_size=emb_size) - gensim_w2v.init_weights() - gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) - w2v_emb = gensim_w2v.syn1neg - emb_weights = [] - for key, item in emb_dict.items(): - w2v_index = gensim_w2v.wv.key_to_index[key] - emb_weights.append(w2v_emb[w2v_index]) - emb_weights = np.array(emb_weights) - model.quesnet.load_emb(emb_weights) - if save_embs: - np.save(os.path.join(output_dir, 'w2v_embs.npy'), emb_weights) - logger.info('quesnet Word Embedding loaded') - - - # 这里先用原先的接口 - # train auto-encoder loss for image embedding - if pretrain_dir != None: - model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt'))) - else: - img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') - trained_ie = pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - if save_embs: - torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) - model.quesnet.load_img(trained_ie) - logger.info('quesnet Image Embedding loaded') - - - # train auto-encoder loss for meta embedding - if pretrain_dir != None: - model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt'))) - else: - meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') - trained_me = pretrain_embedding_layer(dataset=meta_dateset, ae=model.quesnet.me, lr=train_params['lr'], - log_step=train_params['log_steps'], 
batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) - if save_embs: - torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) - model.quesnet.load_meta(trained_me) - logger.info('quesnet Meta Embedding loaded') - - logger.info("quesnet Word, Image and Meta Embeddings training is done") - # DONE for datasets - - # TODO: 把 Quesnet 的接口移过来 - # 下面的代码还没有完整测试 - - # HLM and DOO training - dataset.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) - model.train() - optim = optimizer(model, lr=train_params['lr']) - n_batches = 0 - for epoch in range(0, train_params['n_epochs']): - train_iter = PrefetchIter(dataset, train_params['batch_size']) - bar = enumerate(tqdm(train_iter, initial=train_iter.pos), - train_iter.pos) - for i, batch in critical(bar): - n_batches += 1 - loss = model(batch).loss - optim.zero_grad() - loss.backward() - optim.step() - - if train_params['save_every'] > 0 and i % train_params['save_every'] == 0: - # model.save(os.path.join(output_dir, f'QuesNet_{epoch}.{i}')) - model.save_pretrained(output_dir) - - if train_params['log_steps'] > 0 and i % train_params['log_steps'] == 0: - logger.info(f"{epoch}.{i}---loss: {loss.item()}") - - if train_params['max_steps'] > 0 and n_batches % train_params['max_steps'] == 0: - break - - # model.save(os.path.join(output_dir, f'QuesNet_{epoch}')) - - model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - - From 21a7d6db665701089ea1c0db22e85191fd223507 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Mon, 31 Jul 2023 08:51:57 +0800 Subject: [PATCH 44/58] [fix] small bugs fixed --- EduNLP/ModelZoo/disenqnet/__init__.py | 3 +-- EduNLP/Pretrain/quesnet_vec.py | 13 +++++++++++-- tests/test_pretrain/__init__.py | 0 tests/test_pretrain/conftest.py | 2 +- tests/test_pretrain/test_pretrained_bert.py | 2 +- tests/test_pretrain/test_pretrained_disenqnet.py | 3 ++- tests/test_pretrain/test_pretrained_elmo.py | 2 +- tests/test_pretrain/test_pretrained_quesnet.py | 3 ++- 8 files changed, 19 insertions(+), 9 deletions(-) create mode 100644 tests/test_pretrain/__init__.py diff --git a/EduNLP/ModelZoo/disenqnet/__init__.py b/EduNLP/ModelZoo/disenqnet/__init__.py index 263c410b..e774edb1 100644 --- a/EduNLP/ModelZoo/disenqnet/__init__.py +++ b/EduNLP/ModelZoo/disenqnet/__init__.py @@ -1,4 +1,3 @@ # -*- coding: utf-8 -*- -from .disenqnet import DisenQNet, DisenQNetForPreTraining, \ - DisenQNetForPropertyPrediction, DisenQNetForKnowledgePrediction +from .disenqnet import * diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index 80649ca6..8c50a833 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -161,12 +161,21 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, try: fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) - fig_src = item['ques_figure_paths'][fig_index] + + if self.img_dir != "": + fig_src = os.path.join(self.img_dir, fig_id) + if '.png' in item['ques_figure_paths'][fig_index]: + fig_src += '.png' + elif '.jpg' in item['ques_figure_paths'][fig_index]: + fig_src += '.jpg' + else: + fig_src = item['ques_figure_paths'][fig_index] + im = Image.open(fig_src) im = im.resize((56, 56)) token_idx.append(to_grayscale(im)) except Exception: - warnings.warn('Open image error!') + warnings.warn('Open image error! 
path = ' + fig_src) token_idx.append(self.stoi['word'][self.img_token]) else: # word diff --git a/tests/test_pretrain/__init__.py b/tests/test_pretrain/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_pretrain/conftest.py b/tests/test_pretrain/conftest.py index e5e7d145..75e1bc94 100644 --- a/tests/test_pretrain/conftest.py +++ b/tests/test_pretrain/conftest.py @@ -7,7 +7,7 @@ from EduNLP.ModelZoo import load_items # TEST_GPU = torch.cuda.is_available() -TEST_GPU = False + @pytest.fixture(scope="module") diff --git a/tests/test_pretrain/test_pretrained_bert.py b/tests/test_pretrain/test_pretrained_bert.py index d8db3cff..146119a3 100644 --- a/tests/test_pretrain/test_pretrained_bert.py +++ b/tests/test_pretrain/test_pretrained_bert.py @@ -9,7 +9,7 @@ from EduNLP.Vector import T2V, BertModel from EduNLP.I2V import Bert, get_pretrained_i2v -from conftest import TEST_GPU +TEST_GPU = False class TestPretrainBert: diff --git a/tests/test_pretrain/test_pretrained_disenqnet.py b/tests/test_pretrain/test_pretrained_disenqnet.py index d57cede9..df7eb4a8 100644 --- a/tests/test_pretrain/test_pretrained_disenqnet.py +++ b/tests/test_pretrain/test_pretrained_disenqnet.py @@ -10,7 +10,8 @@ from EduNLP.Pretrain import DisenQTokenizer, train_disenqnet from EduNLP.Vector import T2V, DisenQModel from EduNLP.I2V import DisenQ, get_pretrained_i2v -from conftest import TEST_GPU + +TEST_GPU = False # TODO diff --git a/tests/test_pretrain/test_pretrained_elmo.py b/tests/test_pretrain/test_pretrained_elmo.py index 8dbec3f3..9f106d84 100644 --- a/tests/test_pretrain/test_pretrained_elmo.py +++ b/tests/test_pretrain/test_pretrained_elmo.py @@ -9,7 +9,7 @@ from EduNLP.Vector import T2V, ElmoModel from EduNLP.I2V import Elmo, get_pretrained_i2v -from conftest import TEST_GPU +TEST_GPU = False class TestPretrainEmlo: diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index 95a700ef..4e717aa9 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -9,7 +9,8 @@ from EduNLP.Vector.quesnet import QuesNetModel from EduNLP.I2V import QuesNet as QuesNetI2V, get_pretrained_i2v from EduNLP.utils import abs_current_dir, path_append -from conftest import TEST_GPU + +TEST_GPU = False class TestPretrainQuesNet: From 729e506ff2c0196d22c955782b34410e168efed4 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Mon, 31 Jul 2023 11:45:59 +0800 Subject: [PATCH 45/58] fix bugs for quesnet --- EduNLP/ModelZoo/quesnet/quesnet.py | 78 ++++++++++++------- EduNLP/ModelZoo/quesnet/util.py | 12 ++- EduNLP/Pretrain/quesnet_vec.py | 24 +++--- static/test_data/standard_luna_data.json | 18 ++--- .../test_pretrain/test_pretrained_quesnet.py | 1 + 5 files changed, 87 insertions(+), 46 deletions(-) diff --git a/EduNLP/ModelZoo/quesnet/quesnet.py b/EduNLP/ModelZoo/quesnet/quesnet.py index c7c730fb..fe1b0c84 100644 --- a/EduNLP/ModelZoo/quesnet/quesnet.py +++ b/EduNLP/ModelZoo/quesnet/quesnet.py @@ -114,6 +114,10 @@ def make_batch(self, data, device, pretrain=False): ans_input = [] ans_output = [] false_options = [[] for i in range(3)] + + if not isinstance(data, list): + data = [data] + for q in data: meta = torch.zeros(len(self.stoi[self.meta])).to(device) meta[q.labels.get(self.meta) or []] = 1 @@ -156,7 +160,7 @@ def make_batch(self, data, device, pretrain=False): for i, fo in enumerate(q.false_options): false_options[i].append([0] + fo) - + lembs = SeqBatch(lembs, device=device) 
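+        # SeqBatch (EduNLP/ModelZoo/quesnet/util.py) length-sorts each group of
+        # sequences and exposes them via `padded()` / `packed()` for the model.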
         rembs = SeqBatch(rembs, device=device)
         embs = SeqBatch(embs, device=device)
@@ -302,7 +306,7 @@ def __init__(self, _stoi=None, pretrained_embs: np.ndarray = None, pretrained_im
         self.config = PretrainedConfig.from_dict(self.config)
 
     def forward(self, batch):
-        left, right, words, ims, metas, wmask, imask, mmask, inputs, ans_input, ans_output, false_opt_input = batch
+        left, right, words, ims, metas, wmask, imask, mmask, inputs, ans_input, ans_output, false_opt_input = batch[0]
 
         # high-level loss
         outputs = self.quesnet(inputs)
@@ -310,7 +314,8 @@ def forward(self, batch):
         h = outputs.hidden
 
         x = ans_input.packed()
-        y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x.data), x.batch_sizes),
+
+        y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x[0].data), x.batch_sizes),
                                h.repeat(self.config.layers, 1, 1))
         floss = F.cross_entropy(self.ans_output(y.data),
                                 ans_output.packed().data)
@@ -318,51 +323,53 @@ def forward(self, batch):
                                                  torch.ones_like(self.ans_judge(y.data)))
         for false_opt in false_opt_input:
             x = false_opt.packed()
-            y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x.data), x.batch_sizes),
+            if x == (None, None):
+                continue
+            y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x[0].data), x.batch_sizes),
                                h.repeat(self.config.layers, 1, 1))
             floss = floss + F.binary_cross_entropy_with_logits(self.ans_judge(y.data),
                                                                torch.zeros_like(self.ans_judge(y.data)))
         loss = floss * self.lambda_loss[1]
         # low-level loss
-        left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size]
-        right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:]
+        left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size].clone()
+        right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:].clone()
         wloss = iloss = mloss = None
         if words is not None:
-            lwfea = torch.masked_select(left_hid, wmask.unsqueeze(1).bool()) \
-                .view(-1, self.rnn_size)
-            lout = self.lwoutput(lwfea)
-            rwfea = torch.masked_select(right_hid, wmask.unsqueeze(1).bool()) \
-                .view(-1, self.rnn_size)
-            rout = self.rwoutput(rwfea)
-            out = self.woutput(torch.cat([lwfea, rwfea], dim=1))
+            lwfea = torch.masked_select(left_hid.clone(), wmask.unsqueeze(1).bool()) \
+                .view(-1, self.rnn_size).clone()
+            lout = self.lwoutput(lwfea.clone())
+            rwfea = torch.masked_select(right_hid.clone(), wmask.unsqueeze(1).bool()) \
+                .view(-1, self.rnn_size).clone()
+            rout = self.rwoutput(rwfea.clone())
+            out = self.woutput(torch.cat([lwfea.clone(), rwfea.clone()], dim=1).clone())
             wloss = (F.cross_entropy(out, words) + F.cross_entropy(lout, words) + F.
cross_entropy(rout, words)) * self.quesnet.lambda_input[0] / 3 wloss *= self.lambda_loss[0] loss = loss + wloss if ims is not None: - lifea = torch.masked_select(left_hid, imask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - lout = self.lioutput(lifea) - rifea = torch.masked_select(right_hid, imask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - rout = self.rioutput(rifea) - out = self.ioutput(torch.cat([lifea, rifea], dim=1)) + lifea = torch.masked_select(left_hid.clone(), imask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + lout = self.lioutput(lifea.clone()) + rifea = torch.masked_select(right_hid.clone(), imask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + rout = self.rioutput(rifea.clone()) + out = self.ioutput(torch.cat([lifea.clone(), rifea.clone()], dim=1).clone()) iloss = (self.quesnet.ie.loss(ims, out) + self.quesnet.ie.loss(ims, lout) + self.quesnet.ie. loss(ims, rout)) * self.quesnet.lambda_input[1] / 3 iloss *= self.lambda_loss[0] loss = loss + iloss if metas is not None: - lmfea = torch.masked_select(left_hid, mmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - lout = self.lmoutput(lmfea) - rmfea = torch.masked_select(right_hid, mmask.unsqueeze(1).bool()) \ - .view(-1, self.rnn_size) - rout = self.rmoutput(rmfea) - out = self.moutput(torch.cat([lmfea, rmfea], dim=1)) + lmfea = torch.masked_select(left_hid.clone(), mmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + lout = self.lmoutput(lmfea.clone()) + rmfea = torch.masked_select(right_hid.clone(), mmask.unsqueeze(1).bool()) \ + .view(-1, self.rnn_size).clone() + rout = self.rmoutput(rmfea.clone()) + out = self.moutput(torch.cat([lmfea.clone(), rmfea.clone()], dim=1).clone()) mloss = (self.quesnet.me.loss(metas, out) + self.quesnet.me.loss(metas, lout) + self.quesnet.me. 
loss(metas, rout)) * self.quesnet.lambda_input[2] / 3 mloss *= self.lambda_loss[0] diff --git a/EduNLP/ModelZoo/quesnet/util.py b/EduNLP/ModelZoo/quesnet/util.py index 2f03910f..cc11f8b0 100644 --- a/EduNLP/ModelZoo/quesnet/util.py +++ b/EduNLP/ModelZoo/quesnet/util.py @@ -11,7 +11,11 @@ def __init__(self, seqs, dtype=None, device=None): self.dtype = dtype self.device = device self.seqs = seqs - self.lens = [len(x) for x in seqs] + + if not seqs: + self.lens = [0] + else: + self.lens = [len(x) for x in seqs] self.ind = argsort(self.lens)[::-1] self.inv = argsort(self.ind) @@ -19,6 +23,7 @@ def __init__(self, seqs, dtype=None, device=None): self._prefix = [0] self._index = {} c = 0 + for i in range(self.lens[0]): for j in range(len(self.lens)): if self.lens[j] <= i: @@ -28,10 +33,15 @@ def __init__(self, seqs, dtype=None, device=None): def packed(self): ind = torch.tensor(self.ind, dtype=torch.long, device=self.device) + if not ind.numel() or ind.max() >= self.padded()[0].size(1): + return None, None padded = self.padded()[0].index_select(1, ind) return pack_padded_sequence(padded, torch.tensor(self.lens)) def padded(self, max_len=None, batch_first=False): + if not self.seqs: + return torch.empty((0, 0), dtype=self.dtype, device=self.device), torch.empty((0, 0), dtype=torch.bool, device=self.device) + seqs = [torch.tensor(s, dtype=self.dtype, device=self.device) if not isinstance(s, torch.Tensor) else s for s in self.seqs] diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index 8c50a833..66ff0bbf 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -156,8 +156,9 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_item = self.tokenize(item, key) token_idx = [] for _, w in enumerate(token_item): - if isinstance(w, FigureSegment): + if isinstance(w, FigureSegment) and 'ques_figure_ids' in item.keys(): # image + try: fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) @@ -171,11 +172,13 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, else: fig_src = item['ques_figure_paths'][fig_index] + print(f"Open figure {fig_src}") im = Image.open(fig_src) im = im.resize((56, 56)) token_idx.append(to_grayscale(im)) + except Exception: - warnings.warn('Open image error! 
path = ' + fig_src) + warnings.warn('Open image error!') token_idx.append(self.stoi['word'][self.img_token]) else: # word @@ -390,6 +393,7 @@ def __getitem__(self, index): token = self.tokenizer(line, key=self.content_key, meta=self.meta) content = token['seq_idx'] meta = token['meta_idx'] + if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: answer_idx = ord(self.answer_key(line).upper()) - ord('A') options = self.option_key(line) @@ -441,7 +445,7 @@ def __init__(self, data, *label, length=None, batch_size=1, shuffle=True): self.batch_size = batch_size self.queue = queue.Queue(maxsize=8) self.length = length if length is not None else len(data) - + assert all(self.length == len(lab) for lab in label), \ 'data and label must have same lengths' @@ -545,7 +549,7 @@ def optimizer(*models, **kwargs): return _cur_optim -def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, train_params = None): +def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, load_embs = False, train_params = None): """ pretrain quesnet Parameters @@ -558,6 +562,8 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save quesnet tokenizer save_embs : bool, optional whether to save pretrained word/image/meta embeddings seperately + load_embs : bool, optional + whether to load pretrained word/image/meta embeddings seperately train_params : dict, optional the training parameters and model parameters, by default None - "n_epochs": int, default = 1 @@ -609,7 +615,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save default_train_params.update(train_params) train_params = default_train_params - dataset = QuesnetDataset(path) + dataset = QuesnetDataset(path, img_dir=img_dir) tokenizer = dataset.tokenizer tokenizer.save_pretrained(output_dir) model = QuesNetForPreTraining(_stoi=tokenizer.stoi, feat_size=train_params['feat_size'], @@ -642,7 +648,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save meta_corpus.append(meta_vector) # train word2vec for text embedding - if pretrain_dir != None: + if pretrain_dir != None and load_embs: model.quesnet.load_emb(np.load(os.path.join(output_dir, 'w2v_embs.npy'))) else: gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, @@ -661,7 +667,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save logger.info('quesnet Word Embedding loaded') # train auto-encoder loss for image embedding - if pretrain_dir != None: + if pretrain_dir != None and load_embs: model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt'))) else: img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') @@ -675,7 +681,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save # train auto-encoder loss for meta embedding - if pretrain_dir != None: + if pretrain_dir != None and load_embs: model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt'))) else: meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') @@ -696,7 +702,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save optim = optimizer(model, lr=train_params['lr']) n_batches = 0 for epoch in range(0, train_params['n_epochs']): - train_iter = PrefetchIter(dataset, train_params['batch_size']) + train_iter = PrefetchIter(dataset, 
batch_size=train_params['batch_size']) bar = enumerate(tqdm(train_iter, initial=train_iter.pos), train_iter.pos) for i, batch in critical(bar): diff --git a/static/test_data/standard_luna_data.json b/static/test_data/standard_luna_data.json index 9b8e3906..0ef4403a 100644 --- a/static/test_data/standard_luna_data.json +++ b/static/test_data/standard_luna_data.json @@ -1,13 +1,13 @@ -{"ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["\\\\{-4,1\\\\}", "\\\\{1,5\\\\}", "\\\\{3,5\\\\}", "\\\\{1,3\\\\}"], "ques_answer": "D", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.424379, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "已知集合$A=\\left\\{x \\mid x^{2}-3 x-4<0\\right\\}, \\quad B=\\{-4,1,3,5\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "726cdbec-33a9-11ec-909c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["\\\\{-4,1\\\\}", "\\\\{1,5\\\\}", "\\\\{3,5\\\\}", "\\\\{1,3\\\\}"], "ques_answer": "C", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.424379, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "若复数$z=1+2 i+i^{3}$,则$|z|=$", "ques_subject": 1, "ques_id": "726e139c-33a9-11ec-bd9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["0", "1", "$\\\\sqrt{2}$", "2"], "ques_answer": "C", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.566538, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "埃及胡夫金字塔是古代世界建筑奇迹之一,它的形状可视为一个正四棱锥。以该四棱锥的高为边长的正方形面积等于该四棱锥一个侧面三角形的面积,则其侧面三角形底边上的高与底面正方形的边长的比值为", "ques_subject": 1, "ques_id": "726e3a92-33a9-11ec-88d7-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{\\\\sqrt{5}-1}{4}$", "$\\\\frac{\\\\sqrt{5}-1}{2}$", "$\\\\frac{\\\\sqrt{5}+1}{4}$", "$\\\\frac{\\\\sqrt{5}+1}{2}$"], "ques_answer": "C", "know_list": [6, 30, 511], "know_name": ["立体几何", "空间几何体", "棱柱的结构特征"], "difficulty": 0.604718, "ques_figure_ids": ["726e3a92-33a9-11ec-88d7-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d66b18-33a9-11ec-a11a-98fa9b625adb.png"]} {"ques_content": "设$O$为正方形$ABCD$中心,在$O, A, B, C, D$中任取$3$点,则取到的$3$点共线的概率为", "ques_subject": 1, "ques_id": "726e3a93-33a9-11ec-9d16-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{1}{5}$", "$\\\\frac{2}{5}$", "$\\\\frac{1}{2}$", "$\\\\frac{4}{5}$"], "ques_answer": "A", "know_list": [1, 21, 309], "know_name": ["排列组合与概率统计", "概率", "古典概型及其概率计算公式"], "difficulty": 0.455177, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位:$^{\\circ} \\mathrm{C}$)的关系,在$20$个不同温度条件下进行种子发芽实验,由实验数据$\\left(x_{i}, y_{i}\\right)(i=1,2, \\cdots, 20)$得到下面的散点图:$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$由此散点图,在$10$$^{\\circ} \\mathrm{C}$至$40$$^{\\circ} \\mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率$y$和温度$x$的回归方程类型的是", "ques_subject": 1, "ques_id": "726e618c-33a9-11ec-9f9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$y=a+b x$", "$y=a+b x^{2}$", "$y=a+b e^{x}$", "$y=a+b \\\\ln x$"], "ques_answer": "D", "know_list": [1, 20, 286], "know_name": ["排列组合与概率统计", "统计与统计案例", "变量间的相关关系"], "difficulty": 0.530059, "ques_figure_ids": 
["726e618c-33a9-11ec-9f9e-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d69210-33a9-11ec-b2fe-98fa9b625adb.png"]} +{"ques_content": "某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位:$^{\\circ} \\mathrm{C}$)的关系,在$20$个不同温度条件下进行种子发芽实验,由实验数据$\\left(x_{i}, y_{i}\\right)(i=1,2, \\cdots, 20)$得到下面的散点图:$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$由此散点图,在$10$$^{\\circ} \\mathrm{C}$至$40$$^{\\circ} \\mathrm{C}$之间,下面四个回归方程类型中最适宜作为发芽率$y$和温度$x$的回归方程类型的是", "ques_subject": 1, "ques_id": "726e618c-33a9-11ec-9f9e-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$y=a+b x$", "$y=a+b x^{2}$", "$y=a+b e^{x}$", "$y=a+b \\\\ln x$"], "ques_answer": "C", "know_list": [1, 20, 286], "know_name": ["排列组合与概率统计", "统计与统计案例", "变量间的相关关系"], "difficulty": 0.530059, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d69210-33a9-11ec-b2fe-98fa9b625adb.png"]} {"ques_content": "已知圆$x^{2}+y^{2}-6 x=0$,过点($1,2)$的直线被该圆所截得的弦的长度的最小值为", "ques_subject": 1, "ques_id": "726e8886-33a9-11ec-b0d0-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["1", "2", "3", "4"], "ques_answer": "A", "know_list": [5, 27, 440], "know_name": ["平面解析几何", "直线与方程", "斜率的计算公式"], "difficulty": 0.58802, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "设函数$f(x)=\\cos \\left(\\omega x+\\frac{\\pi}{6}\\right)$ 在 $[-\\pi, \\pi]$的图像大致如下图,则$f(x)$的最小周期为", "ques_subject": 1, "ques_id": "726e8887-33a9-11ec-a994-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{10 \\\\pi}{9}$", "$\\\\frac{7 \\\\pi}{6}$", "$\\\\frac{4\\\\pi}{3}$", "$\\\\frac{3 \\\\pi}{2}$"], "ques_answer": "C", "know_list": [4, 26, 422], "know_name": ["三角函数", "三角函数", "函数y=Asin(ωx +ф)的图像变换"], "difficulty": 0.421256, "ques_figure_ids": ["726e8887-33a9-11ec-a994-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6b902-33a9-11ec-b912-98fa9b625adb.png"]} +{"ques_content": "设函数$f(x)=\\cos \\left(\\omega x+\\frac{\\pi}{6}\\right)$ 在 $[-\\pi, \\pi]$的图像大致如下图,则$f(x)$的最小周期为", "ques_subject": 1, "ques_id": "726e8887-33a9-11ec-a994-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{10 \\\\pi}{9}$", "$\\\\frac{7 \\\\pi}{6}$", "$\\\\frac{4\\\\pi}{3}$", "$\\\\frac{3 \\\\pi}{2}$"], "ques_answer": "C", "know_list": [4, 26, 422], "know_name": ["三角函数", "三角函数", "函数y=Asin(ωx +ф)的图像变换"], "difficulty": 0.421256, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6b902-33a9-11ec-b912-98fa9b625adb.png"]} {"ques_content": "设对数$a \\log _{3} 4=2,$ 则 $4^{-a}=$", "ques_subject": 1, "ques_id": "726eb0ac-33a9-11ec-b339-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{1}{16}$", "$\\\\frac{1}{9}$", "$\\\\frac{1}{8}$", "$\\\\frac{1}{6}$"], "ques_answer": "B", "know_list": [0, 13, 129], "know_name": ["代数", " 基本初等函数Ⅰ", "指数函数的实际应用"], "difficulty": 0.34596, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "执行右面的程序框图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,则输出的$n=$", "ques_subject": 1, "ques_id": "726ed764-33a9-11ec-9fd2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["17", "19", "21", "23"], "ques_answer": "C", "know_list": [2, 23, 351], "know_name": ["算法与框图", "算法初步与框图", "程序框图"], "difficulty": 0.78378, "ques_figure_ids": 
["726ed764-33a9-11ec-9fd2-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6dff8-33a9-11ec-8bfa-98fa9b625adb.png"]} -{"ques_content": "设$\\left\\{a_{n}\\right\\}$是等比数列,且$a_{1}+a_{2}+a_{3}=1, a_{2}+a_{3}+a_{4}=2,$ 则 $a_{6}+a_{7}+a_{8}=$", "ques_subject": 1, "ques_id": "726ed765-33a9-11ec-b319-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["12", "24", "30", "32"], "ques_answer": "D", "know_list": [0, 17, 213], "know_name": ["代数", "数列", "等比数列"], "difficulty": 0.455039, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "执行右面的程序框图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,则输出的$n=$", "ques_subject": 1, "ques_id": "726ed764-33a9-11ec-9fd2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["17", "19", "21", "23"], "ques_answer": "C", "know_list": [2, 23, 351], "know_name": ["算法与框图", "算法初步与框图", "程序框图"], "difficulty": 0.78378, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d6dff8-33a9-11ec-8bfa-98fa9b625adb.png"]} +{"ques_content": "设$\\left\\{a_{n}\\right\\}$是等比数列,且$a_{1}+a_{2}+a_{3}=1, a_{2}+a_{3}+a_{4}=2,$ 则 $a_{6}+a_{7}+a_{8}=$", "ques_subject": 1, "ques_id": "726ed765-33a9-11ec-b319-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["12", "24", "30", "32"], "ques_answer": "C", "know_list": [0, 17, 213], "know_name": ["代数", "数列", "等比数列"], "difficulty": 0.455039, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "设$F_{1}, F_{2}$是双曲线$C: x^{2}-\\frac{y^{2}}{3}=1$的两个焦点,$O$为坐标原点,点$P$在$C$上且$|O P|=2$,则$\\triangle P F_{1} F_{2}$的面积为", "ques_subject": 1, "ques_id": "726efed0-33a9-11ec-a18c-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\frac{7}{2}$", "3", "$\\\\frac{5}{2}$", "2"], "ques_answer": "B", "know_list": [5, 29, 494], "know_name": ["平面解析几何", "圆锥曲线与方程", "双曲线的定义"], "difficulty": 0.393424, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "已知$A, B, C$为球$O$的球面上的三个点,$\\odot O_{1}$ 为 $\\triangle A B C$ 的外接圆,若 $\\odot O_{1}$ 的面积为 $4 \\pi$,$A B=B C=A C=O O_{1},$ 则球 $O$ 的表面积为", "ques_subject": 1, "ques_id": "726f25f0-33a9-11ec-8501-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$64 \\\\pi$", "$48 \\\\pi$", "$36 \\\\pi$", "$32 \\\\pi$"], "ques_answer": "A", "know_list": [6, 31, 569], "know_name": ["立体几何", "空间向量与立体几何", "用空间向量求直线间的夹角、距离"], "difficulty": 0.365152, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "若$x,y$满足约束条件$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,则$z=x+7 y$的最大值为_______", "ques_subject": 1, "ques_id": "726f25f1-33a9-11ec-82f6-98fa9b625adb", "ques_type": 2, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "1", "know_list": [4, 25, 379], "know_name": ["三角函数", "三角函数及其恒等变换", "三角函数的定义域"], "difficulty": 0.371465, "ques_figure_ids": [], "ques_figure_paths": []} @@ -16,10 +16,10 @@ {"ques_content": "数列$\\left\\{a_{n}\\right\\}$满足$a_{n+2}+(-1)^{n} a_{n}=3 n-1$,前$16$项和为$540$,则$\\left\\{a_{1}\\right\\}=$_______", "ques_subject": 1, "ques_id": "726f737a-33a9-11ec-8c30-98fa9b625adb", "ques_type": 2, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "$\\left\\{a_{1}\\right\\}=7$", "know_list": [7, 33, 587], "know_name": ["高等数学", "矩阵与变换", "二阶矩阵"], "difficulty": 0.559559, "ques_figure_ids": [], "ques_figure_paths": []} 
{"ques_content": "某厂接受了一项加工业务,加工出来的产品(单位:件)按标准分为$A, B, C, D$四个等级。加工业务约定:对于$A$级品,$B$级品,$C$级品,厂家每件分别收取加工费$90$元,$50$元,$20$元;对于$D$级品,厂家每件要赔偿原料损失费$50$元。该厂有甲、乙两个分厂可承接加工业务,甲分厂加工成本费为$25$元/件,乙分厂加工成本费为$20$元/件。厂家为决定由哪个分厂承接加工业务,在两个分厂各试加工了$100$件这种产品,并统计了这些产品的等级,整理如下:分别估计甲、乙两分厂加工出来的一件产品为$A$级品的概率;分别求甲、乙两分厂加工出来的$100$件产品的平均利润,以平均利润为依据,厂家应选哪个分厂承接加工业务?", "ques_subject": 1, "ques_id": "726f9938-33a9-11ec-a236-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)0.4和0.28;(2)$x+y+z=1$", "know_list": [0, 14, 164], "know_name": ["代数", "函数的应用", "函数模型的选择与应用"], "difficulty": 0.29537, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "$\\triangle A B C$ 的内角为 $A, \\quad B, \\quad C$的对边分别为$a, b, c$,已知$B=150^{\\circ}$。若$a=\\sqrt{3} c, \\quad b=2 \\sqrt{7}, \\quad$ 求 $\\triangle A B C$ 的面积;若$\\sin A+\\sqrt{3} \\sin C=\\frac{\\sqrt{2}}{2}, \\quad$ 求 $C$", "ques_subject": 1, "ques_id": "726f9939-33a9-11ec-981c-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$S_{\\triangle A B C}=\\sqrt{3}$;(2)$C=\\frac{\\pi}{12}$。", "know_list": [4, 26, 436], "know_name": ["三角函数", "三角函数", "解三角形"], "difficulty": 0.286434, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,$D$ 为圆锥的顶点,$O$是圆锥底面的圆心,$\\triangle A B C$是底面的内接正三角形,$P$ 为$DO$上一点,$\\angle A P C=90^{\\circ}$。证明:平面$P A B \\perp$平面$P A C $设$D O=\\sqrt{2}$,圆锥的侧面积为$\\sqrt{3} \\pi$,求三棱锥$P-A B C$的体积。", "ques_subject": 1, "ques_id": "726fc02e-33a9-11ec-99e4-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "", "know_list": [6, 31, 555], "know_name": ["立体几何", "空间向量与立体几何", "空间点、线、面的位置"], "difficulty": 0.339271, "ques_figure_ids": ["726fc02e-33a9-11ec-99e4-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d72de2-33a9-11ec-84f7-98fa9b625adb.png"]} {"ques_content": "已知函数$f(x)=e^{x}-a(x+2)$当$a=1$时,讨论$f(x)$的单调性;若$f(x)$有两个零点,求$a$的取值范围", "ques_subject": 1, "ques_id": "726fc02f-33a9-11ec-8019-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$f(x)$在$(-\\infty, 0)$上单调递减,在$(0, +\\infty)$上单调递增\n(2)$\\left(e^{-1},+\\infty\\right)$", "know_list": [0, 15, 177], "know_name": ["代数", "导数及其应用", "利用导数研究函数的极值"], "difficulty": 0.270787, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "已知$A,B$分别为椭圆$E: \\frac{x^{2}}{a^{2}}+y^{2}=1 \\quad a>1$的左、右顶点,$G$ 为$E$的上顶点, $\\overrightarrow{A G} \\cdot \\overrightarrow{G B}=8$。$P$为直线$x=6$的动点,$P A $与$ E$的另一交点为 $C$, $PB$与$E$的另一交点为$D$求$E$的方程证明:直线$CD$过定点", "ques_subject": 1, "ques_id": "726fe850-33a9-11ec-9724-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)$\\frac{x^{2}}{9}+y^{2}=1$;(2)$\\left(\\frac{3}{2}, 0\\right)$", "know_list": [5, 29, 486], "know_name": ["平面解析几何", "圆锥曲线与方程", "椭圆的定义"], "difficulty": 0.358355, "ques_figure_ids": [], "ques_figure_paths": []} {"ques_content": "[选修$4-4$:坐标系与参数方程]\n在直角坐标系$xOy$中,曲线$C_1$的参数方程为$\\left\\{\\begin{array}{l}x=\\cos ^{k} t \\\\ y=\\sin ^{k} t\\end{array}\\right.$($t$为参数),以坐标原点为极点,$x$轴正半轴为极轴建立极坐标系,曲线$C_2$的极坐标方程为$4 \\rho \\cos \\theta-16 \\rho \\sin \\theta+3=0$当$k=1$时,$C_1$是什么曲线?当$k=4$时,求$C_1$与$C_2$的公共点的直角坐标", "ques_subject": 1, "ques_id": "726fe851-33a9-11ec-bf44-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)以原点为圆心,以1为半径的圆;\n(2)$\\left(\\frac{1}{4}, 
\\frac{1}{4}\\right)$", "know_list": [7, 35, 615], "know_name": ["高等数学", "坐标系与参数方程", "简单曲线的极坐标方程"], "difficulty": 0.331245, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "[选修$4-5$:不等式选讲]\n已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集", "ques_subject": 1, "ques_id": "72700f52-33a9-11ec-bdd0-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)如图;(2)$\\left(-\\infty,-\\frac{7}{6}\\right)$", "know_list": [7, 36, 635], "know_name": ["高等数学", "不等式选讲", "绝对值不等式"], "difficulty": 0.517198, "ques_figure_ids": ["72700f52-33a9-11ec-bdd0-98fa9b625adb"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d754d2-33a9-11ec-8dee-98fa9b625adb.png"]} -{"ques_content": "$1.$已知集合 $A=\\{x||x|<3, x \\in Z\\}, \\quad B=\\{x||x|>1, x \\in \\mathbf{Z}\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "72700f53-33a9-11ec-8395-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\varnothing$", "$\\\\{-3,-2,2,3\\\\}$", "$\\\\{-2,0,2\\\\}$", "$\\\\{-2,2\\\\}$"], "ques_answer": "D", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.349825, "ques_figure_ids": [], "ques_figure_paths": []} -{"ques_content": "复数 $(1-i)^{4}=$", "ques_subject": 1, "ques_id": "7270351c-33a9-11ec-82f2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["-4", "4", "$-4i$", "$4i$"], "ques_answer": "A", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.649847, "ques_figure_ids": [], "ques_figure_paths": []} \ No newline at end of file +{"ques_content": "[选修$4-5$:不等式选讲]\n已知函数$f(x)=|3 x+1|-2|x|$画出$y=f(x)$的图像求不等式$f(x)>f(x+1)$的解集", "ques_subject": 1, "ques_id": "72700f52-33a9-11ec-bdd0-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "(1)如图;(2)$\\left(-\\infty,-\\frac{7}{6}\\right)$", "know_list": [7, 36, 635], "know_name": ["高等数学", "不等式选讲", "绝对值不等式"], "difficulty": 0.517198, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d754d2-33a9-11ec-8dee-98fa9b625adb.png"]} +{"ques_content": "$1.$已知集合 $A=\\{x||x|<3, x \\in Z\\}, \\quad B=\\{x||x|>1, x \\in \\mathbf{Z}\\}, \\quad$ 则 $A \\cap B=$", "ques_subject": 1, "ques_id": "72700f53-33a9-11ec-8395-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["$\\\\varnothing$", "$\\\\{-3,-2,2,3\\\\}$", "$\\\\{-2,0,2\\\\}$", "$\\\\{-2,2\\\\}$"], "ques_answer": "C", "know_list": [0, 10, 57], "know_name": ["代数", "集合", "集合的相等"], "difficulty": 0.349825, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "复数 $(1-i)^{4}=$", "ques_subject": 1, "ques_id": "7270351c-33a9-11ec-82f2-98fa9b625adb", "ques_type": 7, "ques_system": 10, "ques_period": 2, "ques_options": ["-4", "4", "$-4i$", "$4i$"], "ques_answer": "A", "know_list": [0, 19, 269], "know_name": ["代数", "数系的扩充与复数", "复数代数形式的加减运算"], "difficulty": 0.649847, "ques_figure_ids": [], "ques_figure_paths": []} +{"ques_content": "如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,$D$ 为圆锥的顶点,$O$是圆锥底面的圆心,$\\triangle A B C$是底面的内接正三角形,$P$ 为$DO$上一点,$\\angle A P C=90^{\\circ}$。证明:平面$P A B \\perp$平面$P A C $设$D O=\\sqrt{2}$,圆锥的侧面积为$\\sqrt{3} \\pi$,求三棱锥$P-A B C$的体积。", "ques_subject": 1, "ques_id": "726fc02e-33a9-11ec-99e4-98fa9b625adb", "ques_type": 5, "ques_system": 10, "ques_period": 2, "ques_options": [], "ques_answer": "", "know_list": [6, 31, 
555], "know_name": ["立体几何", "空间向量与立体几何", "空间点、线、面的位置"], "difficulty": 0.339271, "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], "ques_figure_paths": ["/share/luna/database-data/figure/LUNA-figure/7/73/73d72de2-33a9-11ec-84f7-98fa9b625adb.png"]} \ No newline at end of file diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index 4e717aa9..2bf86c34 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -83,6 +83,7 @@ def test_train_quesnet(self, standard_luna_data, pretrained_model_dir): pretrained_model_dir, img_dir=img_dir, save_embs=True, + load_embs=False, # data_params={ # "stem_key": "ques_content" # }, From 35c1057e40cb77c8fd95d95dd7343c58949015a8 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Tue, 1 Aug 2023 17:42:57 +0800 Subject: [PATCH 46/58] update codes dor flake8 --- EduNLP/Pretrain/quesnet_vec.py | 162 +++++++++++++++++++------------- tests/test_pretrain/conftest.py | 1 - 2 files changed, 95 insertions(+), 68 deletions(-) diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index 66ff0bbf..b7d479d9 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -157,12 +157,12 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_idx = [] for _, w in enumerate(token_item): if isinstance(w, FigureSegment) and 'ques_figure_ids' in item.keys(): - # image + # image try: fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) - + if self.img_dir != "": fig_src = os.path.join(self.img_dir, fig_id) if '.png' in item['ques_figure_paths'][fig_index]: @@ -171,12 +171,12 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, fig_src += '.jpg' else: fig_src = item['ques_figure_paths'][fig_index] - + print(f"Open figure {fig_src}") im = Image.open(fig_src) im = im.resize((56, 56)) token_idx.append(to_grayscale(im)) - + except Exception: warnings.warn('Open image error!') token_idx.append(self.stoi['word'][self.img_token]) @@ -316,23 +316,26 @@ def padding(self, idx, max_length, type='word'): def set_img_dir(self, path): self.img_dir = path - - + + class QuesnetDataset(Dataset): ''' Quesnet-specific datasets ''' - def __init__(self, filename: str, - tokenizer: QuesNetTokenizer = None, - img_dir: str = "", - meta: Optional[list] = None, - content_key=lambda x: x['ques_content'], - meta_key=lambda x: x['know_name'], - answer_key=lambda x: x['ques_answer'], - option_key=lambda x: x['ques_options'], - pipeline=None, - skip=0 - ): + def __init__( + self, + filename: str, + tokenizer: QuesNetTokenizer = None, + img_dir: str = "", + meta: Optional[list] = None, + content_key=lambda x: x['ques_content'], + meta_key=lambda x: x['know_name'], + answer_key=lambda x: x['ques_answer'], + option_key=lambda x: x['ques_options'], + pipeline=None, + skip=0 + ): + self.filename = filename self.skip = skip self.img_dir = img_dir @@ -341,16 +344,22 @@ def __init__(self, filename: str, self.answer_key = answer_key self.option_key = option_key self.pipeline = pipeline - - if tokenizer == None: - tokenizer = QuesNetTokenizer(meta=['know_name'], - img_dir=img_dir) + + if tokenizer is None: + tokenizer = QuesNetTokenizer( + meta=['know_name'], + img_dir=img_dir + ) self.tokenizer = tokenizer self.meta = meta if meta else tokenizer.meta self.load_data_lines() - tokenizer.set_vocab(self.lines, key=lambda x: x['ques_content'], - trim_min_count=2, 
silent=False) - tokenizer.set_meta_vocab(self.lines, silent=False) + tokenizer.set_vocab( + self.lines, + key=lambda x: x['ques_content'], + trim_min_count=2, + silent=False + ) + tokenizer.set_meta_vocab(self.lines, silent=False) def load_data_lines(self): @@ -376,15 +385,13 @@ def load_data_lines(self): if not line: break self.lines.append(json.loads(line.strip())) - + self.length = row - skip - 1 assert self.length > 0, f'{data_dir} is empty. Or file length is less than skip length.' - def __len__(self): return len(self.lines) - def __getitem__(self, index): if isinstance(index, int): line = self.lines[index] @@ -393,8 +400,9 @@ def __getitem__(self, index): token = self.tokenizer(line, key=self.content_key, meta=self.meta) content = token['seq_idx'] meta = token['meta_idx'] - - if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: + + if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 \ + and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: answer_idx = ord(self.answer_key(line).upper()) - ord('A') options = self.option_key(line) answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] @@ -402,24 +410,30 @@ def __getitem__(self, index): else: answer = (self.tokenizer(self.answer_key(line), meta=self.meta))['seq_idx'] false_options = [[0], [0], [0]] - - qs = Question(id=qid, content=content, answer=answer, - false_options=false_options, labels=meta) + + qs = Question( + id=qid, + content=content, + answer=answer, + false_options=false_options, + labels=meta + ) + if callable(self.pipeline): - qs = self.pipeline(qs) - + qs = self.pipeline(qs) + return qs - - + elif isinstance(index, slice): results = [] for i in range(*index.indices(len(self))): results.append(self[i]) return results - + else: raise TypeError('Invalid argument type. 
Index type should be int or slice.') - + + class EmbeddingDataset(Dataset): def __init__(self, data, data_type='image'): self.data = data @@ -434,7 +448,7 @@ def __getitem__(self, idx): return to_tensor(self.data[idx]) elif self.data_type == 'meta': return self.data[idx] - + class PrefetchIter: """Iterator on data and labels, with states for save and restore.""" @@ -445,7 +459,7 @@ def __init__(self, data, *label, length=None, batch_size=1, shuffle=True): self.batch_size = batch_size self.queue = queue.Queue(maxsize=8) self.length = length if length is not None else len(data) - + assert all(self.length == len(lab) for lab in label), \ 'data and label must have same lengths' @@ -497,10 +511,11 @@ def produce(self): except Exception as e: self.queue.put(e) return - + sigint_handler = signal.getsignal(signal.SIGINT) + def critical(f): it = iter(f) signal_received = () @@ -519,7 +534,7 @@ def handler(sig, frame): sigint_handler(*signal_received) except StopIteration: break - + def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3, log_step: int = 1, epochs: int = 3, batch_size: int = 4, device=torch.device('cpu')): @@ -538,18 +553,20 @@ def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3 logger.info(f"[Epoch{i}][Batch{batch}]Training {train_type} Embedding layer, loss:{loss}") return ae + def optimizer(*models, **kwargs): - _cur_optim = [m.optim_cls(m.parameters(), **kwargs) - if hasattr(m, 'optim_cls') - else torch.optim.Adam(m.parameters(), **kwargs) - for m in models] + _cur_optim = [ + m.optim_cls(m.parameters(), **kwargs) + if hasattr(m, 'optim_cls') + else torch.optim.Adam(m.parameters(), **kwargs) for m in models + ] if len(_cur_optim) == 1: return _cur_optim[0] else: - return _cur_optim - + return _cur_optim + -def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save_embs = False, load_embs = False, train_params = None): +def pretrain_quesnet(path, output_dir, pretrain_dir=None, img_dir=None, save_embs=False, load_embs=False, train_params=None): """ pretrain quesnet Parameters @@ -597,7 +614,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save """ os.makedirs(output_dir, exist_ok=True) device = torch.device(train_params['device']) - + default_train_params = { # train params "n_epochs": 1, @@ -648,11 +665,14 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save meta_corpus.append(meta_vector) # train word2vec for text embedding - if pretrain_dir != None and load_embs: + if pretrain_dir is not None and load_embs: model.quesnet.load_emb(np.load(os.path.join(output_dir, 'w2v_embs.npy'))) else: - gensim_w2v = Word2Vec(sentences=[[item] for item in emb_dict.keys()], min_count=1, - vector_size=emb_size) + gensim_w2v = Word2Vec( + sentences=[[item] for item in emb_dict.keys()], + min_count=1, + vector_size=emb_size + ) gensim_w2v.init_weights() gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) w2v_emb = gensim_w2v.syn1neg @@ -667,35 +687,46 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save logger.info('quesnet Word Embedding loaded') # train auto-encoder loss for image embedding - if pretrain_dir != None and load_embs: + if pretrain_dir is not None and load_embs: model.quesnet.load_img(torch.load(os.path.join(pretrain_dir, 'trained_ie.pt'))) else: img_dataset = EmbeddingDataset(data=img_corpus, data_type='image') - trained_ie = 
pretrain_embedding_layer(dataset=img_dataset, ae=model.quesnet.ie, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) + trained_ie = pretrain_embedding_layer( + dataset=img_dataset, + ae=model.quesnet.ie, + lr=train_params['lr'], + log_step=train_params['log_steps'], + batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], + device=device + ) if save_embs: torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) model.quesnet.load_img(trained_ie) logger.info('quesnet Image Embedding loaded') - # train auto-encoder loss for meta embedding - if pretrain_dir != None and load_embs: + if pretrain_dir is not None and load_embs: model.quesnet.load_meta(torch.load(os.path.join(pretrain_dir, 'trained_me.pt'))) else: meta_dateset = EmbeddingDataset(data=meta_corpus, data_type='meta') - trained_me = pretrain_embedding_layer(dataset=meta_dateset, ae=model.quesnet.me, lr=train_params['lr'], - log_step=train_params['log_steps'], batch_size=train_params['batch_size'], - epochs=train_params['n_epochs'], device=device) + trained_me = pretrain_embedding_layer( + dataset=meta_dateset, + ae=model.quesnet.me, + lr=train_params['lr'], + log_step=train_params['log_steps'], + batch_size=train_params['batch_size'], + epochs=train_params['n_epochs'], + device=device + ) if save_embs: torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) model.quesnet.load_meta(trained_me) logger.info('quesnet Meta Embedding loaded') - + logger.info("quesnet Word, Image and Meta Embeddings training is done") # DONE for datasets - + # HLM and DOO training dataset.pipeline = partial(model.quesnet.make_batch, device=device, pretrain=True) model.train() @@ -726,6 +757,3 @@ def pretrain_quesnet(path, output_dir, pretrain_dir = None, img_dir = None, save model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) - - - diff --git a/tests/test_pretrain/conftest.py b/tests/test_pretrain/conftest.py index 75e1bc94..461f6fcb 100644 --- a/tests/test_pretrain/conftest.py +++ b/tests/test_pretrain/conftest.py @@ -9,7 +9,6 @@ # TEST_GPU = torch.cuda.is_available() - @pytest.fixture(scope="module") def standard_luna_data(): data_path = path_append(abs_current_dir(__file__), "../../static/test_data/standard_luna_data.json", to_str=True) From 9bd85dbe97753e8c17aad1da88c7d529c7a61ef3 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Tue, 1 Aug 2023 17:56:54 +0800 Subject: [PATCH 47/58] fix bugs for flake8 --- EduNLP/ModelZoo/quesnet/quesnet.py | 24 +++--------------- EduNLP/ModelZoo/quesnet/util.py | 7 +++--- EduNLP/Pretrain/quesnet_vec.py | 40 ++++++++++++++++++------------ 3 files changed, 32 insertions(+), 39 deletions(-) diff --git a/EduNLP/ModelZoo/quesnet/quesnet.py b/EduNLP/ModelZoo/quesnet/quesnet.py index fe1b0c84..c07aa3c0 100644 --- a/EduNLP/ModelZoo/quesnet/quesnet.py +++ b/EduNLP/ModelZoo/quesnet/quesnet.py @@ -160,7 +160,7 @@ def make_batch(self, data, device, pretrain=False): for i, fo in enumerate(q.false_options): false_options[i].append([0] + fo) - + lembs = SeqBatch(lembs, device=device) rembs = SeqBatch(rembs, device=device) embs = SeqBatch(embs, device=device) @@ -195,24 +195,8 @@ def make_batch(self, data, device, pretrain=False): words = torch.cat(words, dim=0) if words else None ims = torch.cat(ims, dim=0) if ims else None - metas = torch.cat(metas, dim=0) if metas else None - - - # print("debug1") - # print(lembs) - # print(rembs) 
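        # For reference (a summary of the return statement below, not new
        # behavior): in pretrain mode, make_batch hands back the tuple
        #   (lembs, rembs, words, ims, metas, wmask, imask, mmask,
        #    embs, ans_input, ans_output, false_opt_input)
        # i.e. the left/right context batches, the word/image/meta
        # reconstruction targets with their masks, the full input batch,
        # and the answer decoder inputs/outputs plus false options.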
- # print(words) - # print(ims) - # print(metas) - # print(wmask) - # print(imask) - # print(mmask) - # print(embs) - # print(ans_input) - # print(ans_output) - # print(false_opt_input) - - + metas = torch.cat(metas, dim=0) if metas else None + if pretrain: return ( lembs, rembs, words, ims, metas, wmask, imask, mmask, @@ -331,7 +315,7 @@ def forward(self, batch): h = outputs.hidden x = ans_input.packed() - + y, _ = self.ans_decode(PackedSequence(self.quesnet.we(x[0].data), x.batch_sizes), h.repeat(self.config.layers, 1, 1)) floss = F.cross_entropy(self.ans_output(y.data), diff --git a/EduNLP/ModelZoo/quesnet/util.py b/EduNLP/ModelZoo/quesnet/util.py index cc11f8b0..b840e37a 100644 --- a/EduNLP/ModelZoo/quesnet/util.py +++ b/EduNLP/ModelZoo/quesnet/util.py @@ -23,7 +23,7 @@ def __init__(self, seqs, dtype=None, device=None): self._prefix = [0] self._index = {} c = 0 - + for i in range(self.lens[0]): for j in range(len(self.lens)): if self.lens[j] <= i: @@ -40,8 +40,9 @@ def packed(self): def padded(self, max_len=None, batch_first=False): if not self.seqs: - return torch.empty((0, 0), dtype=self.dtype, device=self.device), torch.empty((0, 0), dtype=torch.bool, device=self.device) - + return torch.empty((0, 0), dtype=self.dtype, device=self.device), \ + torch.empty((0, 0), dtype=torch.bool, device=self.device) + seqs = [torch.tensor(s, dtype=self.dtype, device=self.device) if not isinstance(s, torch.Tensor) else s for s in self.seqs] diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index b7d479d9..d33976b7 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -39,6 +39,7 @@ def save_list(item2index, path): def clip(v, low, high): return max(low, min(v, high)) + # Basic unit of Dataset Question = namedtuple('Question', ['id', 'content', 'answer', 'false_options', 'labels']) @@ -334,7 +335,7 @@ def __init__( option_key=lambda x: x['ques_options'], pipeline=None, skip=0 - ): + ): self.filename = filename self.skip = skip @@ -349,7 +350,7 @@ def __init__( tokenizer = QuesNetTokenizer( meta=['know_name'], img_dir=img_dir - ) + ) self.tokenizer = tokenizer self.meta = meta if meta else tokenizer.meta self.load_data_lines() @@ -358,16 +359,15 @@ def __init__( key=lambda x: x['ques_content'], trim_min_count=2, silent=False - ) + ) tokenizer.set_meta_vocab(self.lines, silent=False) - def load_data_lines(self): '''Read data by row from a JSON file - + Important: the data file is loaded during initialization. ''' - + # TODO: All data is read into memory without chunking. # This may lead to low efficiency. 
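        # A lazy-reading sketch (illustrative only; iter_data_lines is a
        # hypothetical helper, not part of this module). Streaming the
        # JSON-lines file would avoid holding every record in memory, e.g.
        #
        #     def iter_data_lines(path, skip=0):
        #         with open(path, "r", encoding="utf-8") as f:
        #             for row, line in enumerate(f):
        #                 if row >= skip and line.strip():
        #                     yield json.loads(line.strip())
        #
        # The class currently indexes self.lines directly, so switching to a
        # generator would also require changes to __len__ and __getitem__.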
data_dir = self.filename @@ -402,7 +402,7 @@ def __getitem__(self, index): meta = token['meta_idx'] if self.answer_key(line).isalpha() and len(self.answer_key(line)) == 1 \ - and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: + and ord(self.answer_key(line)) < 128 and len(self.option_key(line)) > 0: answer_idx = ord(self.answer_key(line).upper()) - ord('A') options = self.option_key(line) answer = self.tokenizer(options.pop(answer_idx), meta=self.meta)['seq_idx'] @@ -417,7 +417,7 @@ def __getitem__(self, index): answer=answer, false_options=false_options, labels=meta - ) + ) if callable(self.pipeline): qs = self.pipeline(qs) @@ -556,17 +556,25 @@ def pretrain_embedding_layer(dataset: EmbeddingDataset, ae: AE, lr: float = 1e-3 def optimizer(*models, **kwargs): _cur_optim = [ - m.optim_cls(m.parameters(), **kwargs) - if hasattr(m, 'optim_cls') + m.optim_cls(m.parameters(), **kwargs) + if hasattr(m, 'optim_cls') else torch.optim.Adam(m.parameters(), **kwargs) for m in models - ] + ] if len(_cur_optim) == 1: return _cur_optim[0] else: return _cur_optim - -def pretrain_quesnet(path, output_dir, pretrain_dir=None, img_dir=None, save_embs=False, load_embs=False, train_params=None): + +def pretrain_quesnet( + path, + output_dir, + pretrain_dir=None, + img_dir=None, + save_embs=False, + load_embs=False, + train_params=None +): """ pretrain quesnet Parameters @@ -672,7 +680,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir=None, img_dir=None, save_emb sentences=[[item] for item in emb_dict.keys()], min_count=1, vector_size=emb_size - ) + ) gensim_w2v.init_weights() gensim_w2v.train(corpus_iterable=w2v_corpus, total_examples=len(w2v_corpus), epochs=train_params['n_epochs']) w2v_emb = gensim_w2v.syn1neg @@ -699,7 +707,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir=None, img_dir=None, save_emb batch_size=train_params['batch_size'], epochs=train_params['n_epochs'], device=device - ) + ) if save_embs: torch.save(trained_ie.state_dict(), os.path.join(output_dir, 'trained_ie.pt')) model.quesnet.load_img(trained_ie) @@ -718,7 +726,7 @@ def pretrain_quesnet(path, output_dir, pretrain_dir=None, img_dir=None, save_emb batch_size=train_params['batch_size'], epochs=train_params['n_epochs'], device=device - ) + ) if save_embs: torch.save(trained_me.state_dict(), os.path.join(output_dir, 'trained_me.pt')) model.quesnet.load_meta(trained_me) From 92776a6b45e4e7be0e303518f7b4996149dc2de8 Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Tue, 1 Aug 2023 18:03:26 +0800 Subject: [PATCH 48/58] fix bugs for flake8 --- EduNLP/ModelZoo/quesnet/quesnet.py | 2 +- EduNLP/Pretrain/quesnet_vec.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/EduNLP/ModelZoo/quesnet/quesnet.py b/EduNLP/ModelZoo/quesnet/quesnet.py index c07aa3c0..a103645e 100644 --- a/EduNLP/ModelZoo/quesnet/quesnet.py +++ b/EduNLP/ModelZoo/quesnet/quesnet.py @@ -195,7 +195,7 @@ def make_batch(self, data, device, pretrain=False): words = torch.cat(words, dim=0) if words else None ims = torch.cat(ims, dim=0) if ims else None - metas = torch.cat(metas, dim=0) if metas else None + metas = torch.cat(metas, dim=0) if metas else None if pretrain: return ( diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index d33976b7..d6b0ae78 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -361,7 +361,7 @@ def __init__( silent=False ) tokenizer.set_meta_vocab(self.lines, silent=False) - + def load_data_lines(self): '''Read data by row 
from a JSON file From 00e3726d0ed6a182ad2ad9ae1223dddacf71af4a Mon Sep 17 00:00:00 2001 From: shangzi <2716440431@qq.com> Date: Wed, 2 Aug 2023 12:48:19 +0000 Subject: [PATCH 49/58] Update tutorial docs and api docs --- docs/source/api/ModelZoo.rst | 36 +++ docs/source/api/tokenizer.rst | 82 ++++++ docs/source/tutorial/en/pretrain.rst | 314 +++++++++++++++++++++ docs/source/tutorial/en/tokenization.rst | 338 ++++++++++++++++++++++- docs/source/tutorial/zh/pretrain.rst | 316 +++++++++++++++++++++ docs/source/tutorial/zh/tokenization.rst | 335 ++++++++++++++++++++++ 6 files changed, 1418 insertions(+), 3 deletions(-) diff --git a/docs/source/api/ModelZoo.rst b/docs/source/api/ModelZoo.rst index 1b8ec9e4..1ee6569b 100644 --- a/docs/source/api/ModelZoo.rst +++ b/docs/source/api/ModelZoo.rst @@ -7,6 +7,28 @@ base_model .. automodule:: EduNLP.ModelZoo.base_model :members: +:: + 相关方法中的参数说明: + + save_pretrained(output_dir): + output_dir: str + The path you want to save your model + + classmethodfrom_pretrained(pretrained_model_path, *args, **kwargs): + pretrained_model_path: str + The path where you load your checkpoint from + + save_config(config_dir): + config_dir: str + The path you want to save the config file + + @classmethod + from_config(config_path, *args, **kwargs): + config_path: str + The path where you load the config file + + + rnn ----------- @@ -14,6 +36,14 @@ rnn :members: :imported-members: +:: + 参数补充说明: + @classmethod from_config(config_path, **kwargs): + config_path: str + The path where you load the config file + + + disenqnet ----------- @@ -21,6 +51,12 @@ disenqnet :members: :imported-members: +:: + 参数补充说明: + @classmethod from_config(config_path, **kwargs): + config_path: str + The path where you load the config file + quesnet ----------- diff --git a/docs/source/api/tokenizer.rst b/docs/source/api/tokenizer.rst index 63d27f48..e7cf330c 100644 --- a/docs/source/api/tokenizer.rst +++ b/docs/source/api/tokenizer.rst @@ -4,3 +4,85 @@ EduNLP.Tokenizer .. automodule:: EduNLP.Tokenizer :members: :imported-members: + +AstFormulaTokenizer参数定义 +####################################### + +:: + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + """ + +CharTokenizer参数定义 +####################################### + +:: + """Tokenize text char by char. eg. "题目内容" -> ["题", "目", "内", 容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + +CustomTokenizer参数定义 +####################################### + +:: + """Tokenize SIF items by customized configuration + + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + kwargs: addtional configuration for SIF items + including text_params, formula_params, figure_params, more details could be found in `EduNLP.SIF.sif4sci` + """ + +PureTextTokenizer参数定义 +####################################### + +:: + """ + Treat all elements in SIF item as prue text. Spectially, tokenize formulas as text. + + Parameters + ---------- + handle_figure_formula : str, optional + whether to skip or symbolize special formulas( $\\FormFigureID{…}$ and $\\FormFigureBase64{…}), + by default skip + +SpaceTokenizer参数定义 +####################################### + +:: + """ + Tokenize text by space. eg. 
"题目 内容" -> ["题目", "内容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + +EduNLP.Tokenizer.get_tokenizer参数定义 +####################################### + +:: + Parameters + ---------- + name: str + the name of tokenizer, e.g. text, pure_text. + args: + the parameters passed to tokenizer + kwargs: + the parameters passed to tokenizer + Returns + ------- + tokenizer: Tokenizer \ No newline at end of file diff --git a/docs/source/tutorial/en/pretrain.rst b/docs/source/tutorial/en/pretrain.rst index fd07ea27..f3345514 100644 --- a/docs/source/tutorial/en/pretrain.rst +++ b/docs/source/tutorial/en/pretrain.rst @@ -45,6 +45,171 @@ The corpus dictionary is a tool introduced in pre-training to facilitate user po >>> print(res) ['An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]'] +:: + EduVocab() related parameter descriptions and function definitions: + class EduVocab(object): + """The vocabulary container for a corpus. + + Parameters + ---------- + vocab_path : str, optional + vocabulary path to initialize this container, by default None + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + bos_token : str, optional + token representing for the start of a sentence, by default "[BOS]" + eos_token : str, optional + token representing for the end of a sentence, by default "[EOS]" + pad_token : str, optional + token representing for padding, by default "[PAD]" + unk_token : str, optional + token representing for unknown word, by default "[UNK]" + specials : List[str], optional + spacials tokens in vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + def __init__(self, vocab_path: str = None, corpus_items: List[str] = None, bos_token: str = "[BOS]", + eos_token: str = "[EOS]", pad_token: str = "[PAD]", unk_token: str = "[UNK]", + specials: List[str] = None, lower: bool = False, trim_min_count: int = 1, **kwargs): + super(EduVocab, self).__init__() + + self._tokens = [] + self.idx_to_token = dict() + self.token_to_idx = dict() + self.frequencies = dict() + # 定义特殊词 + self.bos_token = bos_token + self.eos_token = eos_token + self.pad_token = pad_token + self.unk_token = unk_token + self._special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token] + + if specials: + self._special_tokens += specials + for st in self._special_tokens: + self._add(st) + # 加载词典 + if vocab_path is not None: + self.load_vocab(vocab_path) + elif corpus_items is not None: + self.set_vocab(corpus_items, lower, trim_min_count) + + self.bos_idx = self.token_to_idx[self.bos_token] + self.eos_idx = self.token_to_idx[self.eos_token] + self.pad_idx = self.token_to_idx[self.pad_token] + self.unk_idx = self.token_to_idx[self.unk_token] + + def __len__(self): + return len(self._tokens) + + @property + def vocab_size(self): + return len(self._tokens) + + @property + def special_tokens(self): + return self._special_tokens + + @property + def tokens(self): + return self._tokens + + def to_idx(self, token): + """convert token to index""" + return self.token_to_idx.get(token, self.unk_idx) + + def to_token(self, idx): + """convert index to index""" + return self.idx_to_token.get(idx, self.unk_token) + + def convert_sequence_to_idx(self, tokens, bos=False, eos=False): + """convert sentence of tokens to sentence of indexs""" + res = [self.to_idx(t) for t in 
tokens] + if bos is True: + res = [self.bos_idx] + res + if eos is True: + res = res + [self.eos_idx] + return res + + def convert_sequence_to_token(self, idxs, **kwargs): + """convert sentence of indexs to sentence of tokens""" + return [self.to_token(i) for i in idxs] + + def set_vocab(self, corpus_items: List[str], lower: bool = False, trim_min_count: int = 1, silent=True): + """Update the vocabulary with the tokens in corpus items + + Parameters + ---------- + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + word2cnt = dict() + for item in corpus_items: + for word in item: + word = word.lower() if lower else word + word2cnt[word] = word2cnt.get(word, 0) + 1 + words = [w for w, c in word2cnt.items() if c >= trim_min_count and w not in self._special_tokens] + for token in words: + self._add(token) + if not silent: + keep_word_cnts = sum(word2cnt[w] for w in words) + all_word_cnts = sum(word2cnt.values()) + print(f"save words(trim_min_count={trim_min_count}): {len(words)}/{len(word2cnt)} = {len(words) / len(word2cnt):.4f}\ + with frequency {keep_word_cnts}/{all_word_cnts}={keep_word_cnts / all_word_cnts:.4f}") + + def load_vocab(self, vocab_path: str): + """Load the vocabulary from vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, "r", encoding="utf-8") as file: + self._tokens = file.read().strip().split('\n') + self.token_to_idx = {token: idx for idx, token in enumerate(self._tokens)} + self.idx_to_token = {idx: token for idx, token in enumerate(self._tokens)} + + def save_vocab(self, vocab_path: str): + """Save the vocabulary into vocab_file + + Parameters + ---------- + vocab_path : str + path to save vocabulary file + """ + with open(vocab_path, 'w', encoding='utf-8') as file: + for i in range(self.vocab_size): + token = self._tokens[i] + file.write(f"{token}\n") + + def _add(self, token: str): + if token not in self._tokens: + idx = len(self._tokens) + self._tokens.append(token) + self.idx_to_token[idx] = token + self.token_to_idx[token] = idx + + def add_specials(self, tokens: List[str]): + """Add special tokens into vocabulary""" + for token in tokens: + if token not in self._special_tokens: + self._special_tokens += [token] + self._add(token) + + def add_tokens(self, tokens: List[str]): + """Add tokens into vocabulary""" + for token in tokens: + self._add(token) + + Basic Steps @@ -67,6 +232,93 @@ Examples: # 10 dimension with fasstext method train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v") +:: + Definition of train_vector(): + def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None): + """ + + Parameters + ---------- + items:str + the text of question + w2v_prefix + embedding_dim:int + vector_size + method:str + the method of training, + e.g.: sg, cbow, fasttext, d2v, bow, tfidf + binary: model format + True:bin; + False:kv + train_params: dict + the training parameters passed to model + + Returns + ---------- + tokenizer: Tokenizer + + """ + monitor = MonitorCallback(["word", "I", "less"]) + _train_params = dict( + min_count=0, + vector_size=embedding_dim, + workers=multiprocessing.cpu_count(), + callbacks=[monitor] + ) + if method in {"sg", "cbow"}: + sg = 1 if method == "sg" else 0 + 
_train_params["sg"] = sg + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.Word2Vec( + items, **_train_params + ) + binary = binary if binary is not None else False + elif method == "fasttext": + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.FastText( + sentences=items, + **_train_params + ) + binary = binary if binary is not None else True + elif method == "d2v": + if train_params is not None: + _train_params.update(train_params) + docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(items)] + model = gensim.models.Doc2Vec( + docs, **_train_params + ) + binary = binary if binary is not None else True + elif method == "bow": + model = gensim.corpora.Dictionary(items) + binary = binary if binary is not None else True + elif method == "tfidf": + dictionary_path = train_vector(items, w2v_prefix, method="bow") + dictionary = BowLoader(dictionary_path) + corpus = [dictionary.infer_vector(item) for item in items] + model = gensim.models.TfidfModel(corpus) + binary = binary if binary is not None else True + else: + raise ValueError("Unknown method: %s" % method) + + filepath = w2v_prefix + method + if embedding_dim is not None: + filepath = filepath + "_" + str(embedding_dim) + + if binary is True: + filepath += ".bin" + logger.info("model is saved to %s" % filepath) + model.save(filepath) + else: + if method in {"fasttext", "d2v"}: # pragma: no cover + logger.warning("binary should be True for %s, otherwise all vectors for ngrams will be lost." % method) + filepath += ".kv" + logger.info("model is saved to %s" % filepath) + model.wv.save(filepath) + return filepath + + Load models ---------------- @@ -80,6 +332,68 @@ Examples: >>> model_path = "../test_model/d2v/test_gensim_luna_stem_tf_d2v_256.bin" >>> i2v = D2V("text","d2v",filepath=model_path, pretrained_t2v = False) +:: + + Taking D2V as an example, the specific definitions are as follows: (For other interfaces, please refer to the definitions under EduNLP/I2V) + + class D2V(I2V): + """ + The model aims to transfer item to vector directly. + + Bases + ------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Returns + ------- + i2v model: I2V + """ + + def infer_vector(self, items, tokenize=True, key=lambda x: x, *args, + **kwargs) -> tuple: + """ + It is a function to switch item to vector. And before using the function, it is necessary to load model. 
+ + Parameters + ----------- + items:str + the text of question + tokenize: bool + True: tokenize the item + key: function + determine how to get the text of each item + args: + the parameters passed to t2v + kwargs: + the parameters passed to t2v + + Returns + -------- + vector:list + """ + tokens = self.tokenize(items, key=key) if tokenize is True else items + tokens = [token for token in tokens] + return self.t2v(tokens, *args, **kwargs), None + + @classmethod + def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir) Examples of Model Training ------------------------------------ diff --git a/docs/source/tutorial/en/tokenization.rst b/docs/source/tutorial/en/tokenization.rst index e6935a92..fdac21d5 100644 --- a/docs/source/tutorial/en/tokenization.rst +++ b/docs/source/tutorial/en/tokenization.rst @@ -26,6 +26,32 @@ The corresponding instance is `EduNLP.SIF.tokenize`. >>> tokenize(seg(items), formula_params={"method": "ast"}) ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}] +:: + + The definition of EduNLP.SIF.tokenize: + + def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None): + """ + an actual api to tokenize item + + Parameters + ---------- + segment_list:list + segmented item + text_params:dict + the method to duel with text + formula_params:dict + the method to duel with formula + figure_params:dict + the method to duel with figure + + Returns + ---------- + list + tokenized item + + """ + return TokenList(segment_list, text_params, formula_params, figure_params) Standard interface ^^^^^^^^^^^^^^^^^^^^^^ @@ -88,6 +114,28 @@ In addition, we provide a key parameter to select the pending content in the inc ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本'] +:: + + The dataefinition of CharTokenizer: + class CharTokenizer(Tokenizer): + def __init__(self, stop_words="punctuations", **kwargs) -> None: + """Tokenize text char by char. eg. "题目内容" -> ["题", "目", "内", 容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + self.stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = tokenize_text(key(item).strip(), granularity="char", stopwords=self.stop_words) + return tokens SpaceTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -109,6 +157,30 @@ In addition, we provide a key parameter to select the pending content in the inc ['已知集合$A=\\left\\{x', '\\mid', 'x^{2}-3', 'x-4<0\\right\\},', '\\quad', 'B=\\{-4,1,3,5\\},', '\\quad$', '则', '$A', '\\cap', 'B=$'] +:: + + The definition of SpaceTokenizer: + class SpaceTokenizer(Tokenizer): + """Tokenize text by space. eg. 
"题目 内容" -> ["题目", "内容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + def __init__(self, stop_words="punctuations", **kwargs) -> None: + stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = key(item).strip().split(' ') + if self.stop_words: + tokens = [w for w in tokens if w != '' and w not in self.stop_words] + return tokens CustomTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -136,7 +208,55 @@ In addition, we provide a key parameter to select the pending content in the inc >>> print(next(tokens)) ['已知', '集合', '[FORMULA]', '[FORMULA]'] - +:: + + The definition of CustomTokenizer: + class CustomTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize SIF items by customized configuration + + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + kwargs: addtional configuration for SIF items + including text_params, formula_params, figure_params, more details could be found in `EduNLP.SIF.sif4sci` + """ + self.tokenization_params = { + "text_params": kwargs.get("text_params", None), + "formula_params": kwargs.get("formula_params", None), + "figure_params": kwargs.get("figure_params", None) + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + """Tokenize items, return iterator genetator + + Parameters + ---------- + item : Iterable + question items + key : function, optional + determine how to get the text of items, by default lambdax: x + """ + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + """Tokenize one item, return token list + + Parameters + ---------- + item : Union[str, dict] + question item + key : function, optional + determine how to get the text of item, by default lambdax: x + """ + return tokenize(seg(key(item), symbol=self.symbol, figures=self.figures), + **self.tokenization_params, **kwargs).tokens PureTextTokenizer @@ -157,6 +277,55 @@ In addition, we provide a key parameter to select the pending content in the inc >>> print(next(tokens)) ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + The definition of PureTextTokenizer: + class PureTextTokenizer(Tokenizer): + def __init__(self, handle_figure_formula="skip", **kwargs): + """ + Treat all elements in SIF item as prue text. Spectially, tokenize formulas as text. 
+ + Parameters + ---------- + handle_figure_formula : str, optional + whether to skip or symbolize special formulas( $\\FormFigureID{…}$ and $\\FormFigureBase64{…}), + by default skip + + """ + # Formula images are skipped by default + if handle_figure_formula == "skip": + skip_figure_formula = True + symbolize_figure_formula = False + elif handle_figure_formula == "symbolize": + skip_figure_formula = False + symbolize_figure_formula = True + elif handle_figure_formula is None: + skip_figure_formula, symbolize_figure_formula = False, False + else: + raise ValueError('handle_figure_formula should be one in ["skip", "symbolize", None]') + formula_params = { + "method": "linear", + "skip_figure_formula": skip_figure_formula, + "symbolize_figure_formula": symbolize_figure_formula + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.get("figure_params", None) + } + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens AstFormulaTokenizer @@ -176,7 +345,53 @@ In addition, we provide a key parameter to select the pending content in the inc ['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]'] - +:: + The definition of AstFormulaTokenizer: + class AstFormulaTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize formulas in SIF items by AST parser. 
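+
+        With var_numbering=True (the default here), variables are abstracted
+        into numbered tokens, as in the example above (illustrative):
+
+            >>> # "$z=x+7y$" -> ['mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1']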
+ + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + """ + formula_params = { + "method": "ast", + + "ord2token": True, + "return_type": "list", + "var_numbering": True, + + "skip_figure_formula": False, + "symbolize_figure_formula": True + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.pop("figure_params", None), + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + mode = kwargs.pop("mode", 0) + ret = sif4sci(key(item), figures=self.figures, symbol=self.symbol, mode=mode, + tokenization_params=self.tokenization_params, errors="ignore", **kwargs) + ret = [] if ret is None else ret.tokens + return ret GensimWordTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -201,7 +416,59 @@ By default, the pictures, blanks in the question text and other parts of the inc >>> print(token_item.tokens) ['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] - +:: + + The definition of GensimWordTokenizer: + class GensimWordTokenizer(object): + """ + + Parameters + ---------- + symbol: str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g.: gm, fgm, gmas, fgmas + general: bool + + True: when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + + False: when use 'ast' mothed to tokenize formulas instead of 'linear'. + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gm", general=False): + self.symbol = symbol + if general is True: + self.tokenization_params = { + "formula_params": { + "method": "linear", + "symbolize_figure_formula": True + } + } + else: + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + + def batch_process(self, *items): + pass + + def __call__(self, item): + return sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) GensimSegTokenizer @@ -236,6 +503,71 @@ Select segmentation level: print(len(token_item), token_item) # 2 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[TEXT_BEGIN]', '最大值', '[MARK]']] +:: + + The definition of GensimSegTokenizer: + class GensimSegTokenizer(object): # pragma: no cover + """ + + Parameters + ---------- + symbol:str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g. 
gms, fgm + + depth: int or None + + 0: only separate at \\SIFSep ; + 1: only separate at \\SIFTag ; + 2: separate at \\SIFTag and \\SIFSep ; + otherwise, separate all segments ; + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): + self.symbol = symbol + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + self.kwargs = dict( + add_seg_type=True if depth in {0, 1, 2} else False, + add_seg_mode="head", + depth=depth, + drop="s" if depth not in {0, 1, 2} else "" + ) + self.kwargs.update(kwargs) + self.flatten = flatten + + def __call__(self, item, flatten=None, **kwargs): + flatten = self.flatten if flatten is None else flatten + tl = sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) + if kwargs: + _kwargs = deepcopy(self.kwargs) + _kwargs.update(kwargs) + else: + _kwargs = self.kwargs + if tl: + ret = tl.get_segments(**_kwargs) + if flatten is True: + return it.chain(*ret) + return ret + return tl + More examples ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/tutorial/zh/pretrain.rst b/docs/source/tutorial/zh/pretrain.rst index 82092160..a4697133 100644 --- a/docs/source/tutorial/zh/pretrain.rst +++ b/docs/source/tutorial/zh/pretrain.rst @@ -42,6 +42,171 @@ >>> print(res) ['An', '[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', '[UNK]'] +:: + + EduVocab相关参数说明和函数定义: + class EduVocab(object): + """The vocabulary container for a corpus. + + Parameters + ---------- + vocab_path : str, optional + vocabulary path to initialize this container, by default None + corpus_items : List[str], optional + corpus items to update this vocabulary, by default None + bos_token : str, optional + token representing for the start of a sentence, by default "[BOS]" + eos_token : str, optional + token representing for the end of a sentence, by default "[EOS]" + pad_token : str, optional + token representing for padding, by default "[PAD]" + unk_token : str, optional + token representing for unknown word, by default "[UNK]" + specials : List[str], optional + spacials tokens in vocabulary, by default None + lower : bool, optional + wheather to lower the corpus items, by default False + trim_min_count : int, optional + the lower bound number for adding a word into vocabulary, by default 1 + """ + def __init__(self, vocab_path: str = None, corpus_items: List[str] = None, bos_token: str = "[BOS]", + eos_token: str = "[EOS]", pad_token: str = "[PAD]", unk_token: str = "[UNK]", + specials: List[str] = None, lower: bool = False, trim_min_count: int = 1, **kwargs): + super(EduVocab, self).__init__() + + self._tokens = [] + self.idx_to_token = dict() + self.token_to_idx = dict() + self.frequencies = dict() + # 定义特殊词 + self.bos_token = bos_token + self.eos_token = eos_token + self.pad_token = pad_token + self.unk_token = unk_token + self._special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token] + + if specials: + self._special_tokens += specials + for st in self._special_tokens: + self._add(st) + # 加载词典 + if vocab_path is not None: + self.load_vocab(vocab_path) + elif corpus_items is not None: + self.set_vocab(corpus_items, lower, trim_min_count) + + self.bos_idx = self.token_to_idx[self.bos_token] + self.eos_idx = self.token_to_idx[self.eos_token] + self.pad_idx = self.token_to_idx[self.pad_token] + self.unk_idx = self.token_to_idx[self.unk_token] + + def __len__(self): + return len(self._tokens) + + 
@property
+    def vocab_size(self):
+        return len(self._tokens)
+
+    @property
+    def special_tokens(self):
+        return self._special_tokens
+
+    @property
+    def tokens(self):
+        return self._tokens
+
+    def to_idx(self, token):
+        """convert token to index"""
+        return self.token_to_idx.get(token, self.unk_idx)
+
+    def to_token(self, idx):
+        """convert index to token"""
+        return self.idx_to_token.get(idx, self.unk_token)
+
+    def convert_sequence_to_idx(self, tokens, bos=False, eos=False):
+        """convert a sentence of tokens to a sentence of indices"""
+        res = [self.to_idx(t) for t in tokens]
+        if bos is True:
+            res = [self.bos_idx] + res
+        if eos is True:
+            res = res + [self.eos_idx]
+        return res
+
+    def convert_sequence_to_token(self, idxs, **kwargs):
+        """convert a sentence of indices to a sentence of tokens"""
+        return [self.to_token(i) for i in idxs]
+
+    def set_vocab(self, corpus_items: List[str], lower: bool = False, trim_min_count: int = 1, silent=True):
+        """Update the vocabulary with the tokens in corpus items
+
+        Parameters
+        ----------
+        corpus_items : List[str]
+            corpus items to update this vocabulary
+        lower : bool, optional
+            whether to lower-case the corpus items, by default False
+        trim_min_count : int, optional
+            the lower bound number for adding a word into vocabulary, by default 1
+        """
+        word2cnt = dict()
+        for item in corpus_items:
+            for word in item:
+                word = word.lower() if lower else word
+                word2cnt[word] = word2cnt.get(word, 0) + 1
+        words = [w for w, c in word2cnt.items() if c >= trim_min_count and w not in self._special_tokens]
+        for token in words:
+            self._add(token)
+        if not silent:
+            keep_word_cnts = sum(word2cnt[w] for w in words)
+            all_word_cnts = sum(word2cnt.values())
+            print(f"save words(trim_min_count={trim_min_count}): {len(words)}/{len(word2cnt)} = {len(words) / len(word2cnt):.4f}\
+                with frequency {keep_word_cnts}/{all_word_cnts}={keep_word_cnts / all_word_cnts:.4f}")
+
+    def load_vocab(self, vocab_path: str):
+        """Load the vocabulary from vocab_file
+
+        Parameters
+        ----------
+        vocab_path : str
+            path of the vocabulary file to load
+        """
+        with open(vocab_path, "r", encoding="utf-8") as file:
+            self._tokens = file.read().strip().split('\n')
+            self.token_to_idx = {token: idx for idx, token in enumerate(self._tokens)}
+            self.idx_to_token = {idx: token for idx, token in enumerate(self._tokens)}
+
+    def save_vocab(self, vocab_path: str):
+        """Save the vocabulary into vocab_file
+
+        Parameters
+        ----------
+        vocab_path : str
+            path to save the vocabulary file
+        """
+        with open(vocab_path, 'w', encoding='utf-8') as file:
+            for i in range(self.vocab_size):
+                token = self._tokens[i]
+                file.write(f"{token}\n")
+
+    def _add(self, token: str):
+        if token not in self._tokens:
+            idx = len(self._tokens)
+            self._tokens.append(token)
+            self.idx_to_token[idx] = token
+            self.token_to_idx[token] = idx
+
+    def add_specials(self, tokens: List[str]):
+        """Add special tokens into vocabulary"""
+        for token in tokens:
+            if token not in self._special_tokens:
+                self._special_tokens += [token]
+                self._add(token)
+
+    def add_tokens(self, tokens: List[str]):
+        """Add tokens into vocabulary"""
+        for token in tokens:
+            self._add(token)
+
 
 预训练令牌化容器
 >>>>>>>>>>>>>>>>>>>>>>>>
@@ -104,6 +269,94 @@ Examples:
     # 10 dimension with fasstext method
     train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")
 
+::
+
+    PureTextTokenizer在“令牌化”部分已有详细说明
+    train_vector定义如下:
+    def train_vector(items, w2v_prefix, embedding_dim=None, method="sg", binary=None, train_params=None):
+        """
+
Parameters + ---------- + items:str + the text of question + w2v_prefix + embedding_dim:int + vector_size + method:str + the method of training, + e.g.: sg, cbow, fasttext, d2v, bow, tfidf + binary: model format + True:bin; + False:kv + train_params: dict + the training parameters passed to model + + Returns + ---------- + tokenizer: Tokenizer + + """ + monitor = MonitorCallback(["word", "I", "less"]) + _train_params = dict( + min_count=0, + vector_size=embedding_dim, + workers=multiprocessing.cpu_count(), + callbacks=[monitor] + ) + if method in {"sg", "cbow"}: + sg = 1 if method == "sg" else 0 + _train_params["sg"] = sg + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.Word2Vec( + items, **_train_params + ) + binary = binary if binary is not None else False + elif method == "fasttext": + if train_params is not None: + _train_params.update(train_params) + model = gensim.models.FastText( + sentences=items, + **_train_params + ) + binary = binary if binary is not None else True + elif method == "d2v": + if train_params is not None: + _train_params.update(train_params) + docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(items)] + model = gensim.models.Doc2Vec( + docs, **_train_params + ) + binary = binary if binary is not None else True + elif method == "bow": + model = gensim.corpora.Dictionary(items) + binary = binary if binary is not None else True + elif method == "tfidf": + dictionary_path = train_vector(items, w2v_prefix, method="bow") + dictionary = BowLoader(dictionary_path) + corpus = [dictionary.infer_vector(item) for item in items] + model = gensim.models.TfidfModel(corpus) + binary = binary if binary is not None else True + else: + raise ValueError("Unknown method: %s" % method) + + filepath = w2v_prefix + method + if embedding_dim is not None: + filepath = filepath + "_" + str(embedding_dim) + + if binary is True: + filepath += ".bin" + logger.info("model is saved to %s" % filepath) + model.save(filepath) + else: + if method in {"fasttext", "d2v"}: # pragma: no cover + logger.warning("binary should be True for %s, otherwise all vectors for ngrams will be lost." % method) + filepath += ".kv" + logger.info("model is saved to %s" % filepath) + model.wv.save(filepath) + return filepath + 加载预训练模型 >>>>>>>>>>>>>>>>>>>>>>>> @@ -127,6 +380,69 @@ Examples: i2v = D2V("text", "d2v", filepath=model_path, pretrained_t2v=False) +:: + + 以D2V为例,具体定义如下:(其余接口可参考EduNLP/I2V下的各个定义) + class D2V(I2V): + """ + The model aims to transfer item to vector directly. + + Bases + ------- + I2V + + Parameters + ----------- + tokenizer: str + the tokenizer name + t2v: str + the name of token2vector model + args: + the parameters passed to t2v + tokenizer_kwargs: dict + the parameters passed to tokenizer + pretrained_t2v: bool + True: use pretrained t2v model + False: use your own t2v model + kwargs: + the parameters passed to t2v + + Returns + ------- + i2v model: I2V + """ + + def infer_vector(self, items, tokenize=True, key=lambda x: x, *args, + **kwargs) -> tuple: + """ + It is a function to switch item to vector. And before using the function, it is necessary to load model. 
+
+        Parameters
+        -----------
+        items:str
+            the text of the question
+        tokenize: bool
+            True: tokenize the item
+        key: function
+            determine how to get the text of each item
+        args:
+            the parameters passed to t2v
+        kwargs:
+            the parameters passed to t2v
+
+        Returns
+        --------
+        vector:list
+        """
+        tokens = self.tokenize(items, key=key) if tokenize is True else items
+        tokens = [token for token in tokens]
+        return self.t2v(tokens, *args, **kwargs), None
+
+    @classmethod
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+        return cls("pure_text", name, pretrained_t2v=True, model_dir=model_dir)
+
 
 更多模型训练案例
 -----------------------
diff --git a/docs/source/tutorial/zh/tokenization.rst b/docs/source/tutorial/zh/tokenization.rst
index dcc84c4e..93d154ce 100644
--- a/docs/source/tutorial/zh/tokenization.rst
+++ b/docs/source/tutorial/zh/tokenization.rst
@@ -30,6 +30,31 @@
     >>> tokenize(seg(items), formula_params={"method": "ast"})
     ['如图所示', '三角形', , '面积', '\\\\SIFBlank', \\FigureID{1}]
 
+::
+    EduNLP.SIF.tokenize的函数形式及参数定义如下:
+    def tokenize(segment_list: SegmentList, text_params=None, formula_params=None, figure_params=None):
+        """
+        the actual API used to tokenize an item
+
+        Parameters
+        ----------
+        segment_list:list
+            segmented item
+        text_params:dict
+            the method to deal with text
+        formula_params:dict
+            the method to deal with formula
+        figure_params:dict
+            the method to deal with figure
+
+        Returns
+        ----------
+        list
+            tokenized item
+
+        """
+        return TokenList(segment_list, text_params, formula_params, figure_params)
+
 
 标准接口
 ^^^^^^^^^^^^^^^^^^^^^^
@@ -97,6 +122,28 @@ CharTokenizer
     ['文', '具', '店', '有', '$', '600', '$', '本', '练', '习', '本', '卖', '出', '一', '些', '后', '还', '剩', '$', '4', '$', '包', '每', '包', '$', '25', '$', '本', '卖', '出', '多', '少', '本']
 
+::
+
+    CharTokenizer定义如下:
+    class CharTokenizer(Tokenizer):
+        def __init__(self, stop_words="punctuations", **kwargs) -> None:
+            """Tokenize text char by char. e.g. "题目内容" -> ["题", "目", "内", "容"]
+
+            Parameters
+            ----------
+            stop_words : str, optional
+                stop words to skip, by default "punctuations"
+            """
+            stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words
+            self.stop_words = stop_words if stop_words is not None else set()
+
+        def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
+            for item in items:
+                yield self._tokenize(item, key=key, **kwargs)
+
+        def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
+            tokens = tokenize_text(key(item).strip(), granularity="char", stopwords=self.stop_words)
+            return tokens
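+
+    # 下面给出一个最小用法示例(an editorial sketch, not part of the library docs;
+    # it assumes CharTokenizer is importable from EduNLP.Tokenizer, and that
+    # stop_words=None keeps every character, punctuation included):
+    from EduNLP.Tokenizer import CharTokenizer
+    tokenizer = CharTokenizer(stop_words=None)
+    tokens = tokenizer(["文具店有$600$本练习本"])
+    print(next(tokens))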
"题目 内容" -> ["题目", "内容"] + + Parameters + ---------- + stop_words : str, optional + stop_words to skip, by default "default" + """ + def __init__(self, stop_words="punctuations", **kwargs) -> None: + stop_words = set("\n\r\t .,;?\"\'。.,、;?“”‘’()") if stop_words == "punctuations" else stop_words + self.stop_words = stop_words if stop_words is not None else set() + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + tokens = key(item).strip().split(' ') + if self.stop_words: + tokens = [w for w in tokens if w != '' and w not in self.stop_words] + return tokens CustomTokenizer ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -145,6 +216,55 @@ CustomTokenizer >>> print(next(tokens)) ['已知', '集合', '[FORMULA]', '[FORMULA]'] +:: + + CustomTokenizer定义如下: + class CustomTokenizer(Tokenizer): + def __init__(self, symbol="gmas", figures=None, **kwargs): + """Tokenize SIF items by customized configuration + + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + kwargs: addtional configuration for SIF items + including text_params, formula_params, figure_params, more details could be found in `EduNLP.SIF.sif4sci` + """ + self.tokenization_params = { + "text_params": kwargs.get("text_params", None), + "formula_params": kwargs.get("formula_params", None), + "figure_params": kwargs.get("figure_params", None) + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + """Tokenize items, return iterator genetator + + Parameters + ---------- + item : Iterable + question items + key : function, optional + determine how to get the text of items, by default lambdax: x + """ + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + """Tokenize one item, return token list + + Parameters + ---------- + item : Union[str, dict] + question item + key : function, optional + determine how to get the text of item, by default lambdax: x + """ + return tokenize(seg(key(item), symbol=self.symbol, figures=self.figures), + **self.tokenization_params, **kwargs).tokens @@ -166,6 +286,55 @@ PureTextTokenizer >>> print(next(tokens)) ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + PureTextTokenizer定义如下: + class PureTextTokenizer(Tokenizer): + def __init__(self, handle_figure_formula="skip", **kwargs): + """ + Treat all elements in SIF item as prue text. Spectially, tokenize formulas as text. 
 
 
 
@@ -166,6 +286,55 @@ PureTextTokenizer
     >>> print(next(tokens))
     ['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
 
+::
+
+    PureTextTokenizer定义如下:
+    class PureTextTokenizer(Tokenizer):
+        def __init__(self, handle_figure_formula="skip", **kwargs):
+            """
+            Treat all elements in a SIF item as pure text. In particular, tokenize formulas as text.
+
+            Parameters
+            ----------
+            handle_figure_formula : str, optional
+                whether to skip or symbolize special formulas ($\\FormFigureID{…}$ and $\\FormFigureBase64{…}$),
+                by default skip
+
+            """
+            # Formula images are skipped by default
+            if handle_figure_formula == "skip":
+                skip_figure_formula = True
+                symbolize_figure_formula = False
+            elif handle_figure_formula == "symbolize":
+                skip_figure_formula = False
+                symbolize_figure_formula = True
+            elif handle_figure_formula is None:
+                skip_figure_formula, symbolize_figure_formula = False, False
+            else:
+                raise ValueError('handle_figure_formula should be one in ["skip", "symbolize", None]')
+            formula_params = {
+                "method": "linear",
+                "skip_figure_formula": skip_figure_formula,
+                "symbolize_figure_formula": symbolize_figure_formula
+            }
+            text_params = {
+                "granularity": "word",
+                "stopwords": "default",
+            }
+            formula_params.update(kwargs.pop("formula_params", {}))
+            text_params.update(kwargs.pop("text_params", {}))
+            self.tokenization_params = {
+                "formula_params": formula_params,
+                "text_params": text_params,
+                "figure_params": kwargs.get("figure_params", None)
+            }
+
+        def __call__(self, items: Iterable, key=lambda x: x, **kwargs):
+            for item in items:
+                yield self._tokenize(item, key=key, **kwargs)
+
+        def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs):
+            return tokenize(seg(key(item), symbol="gmas"), **self.tokenization_params, **kwargs).tokens
 
 
 AstFormulaTokenizer
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -185,6 +354,54 @@ AstFormulaTokenizer
     ['公式', '[FORMULA]', '如图', '[FIGURE]', 'mathord_0', ',', 'mathord_1', '约束条件', '公式', '[FORMULA]', '[SEP]', 'mathord_2', '=', 'mathord_0', '+', 'textord', 'mathord_1', '最大值', '[MARK]']
 
+::
+    AstFormulaTokenizer定义如下:
+    class AstFormulaTokenizer(Tokenizer):
+        def __init__(self, symbol="gmas", figures=None, **kwargs):
+            """Tokenize formulas in SIF items by AST parser.
+ + Parameters + ---------- + symbol : str, optional + Elements to symbolize before tokenization, by default "gmas" + figures : _type_, optional + Info for figures in items, by default None + """ + formula_params = { + "method": "ast", + + "ord2token": True, + "return_type": "list", + "var_numbering": True, + + "skip_figure_formula": False, + "symbolize_figure_formula": True + } + text_params = { + "granularity": "word", + "stopwords": "default", + } + formula_params.update(kwargs.pop("formula_params", {})) + text_params.update(kwargs.pop("text_params", {})) + self.tokenization_params = { + "formula_params": formula_params, + "text_params": text_params, + "figure_params": kwargs.pop("figure_params", None), + } + self.symbol = symbol + self.figures = figures + + def __call__(self, items: Iterable, key=lambda x: x, **kwargs): + for item in items: + yield self._tokenize(item, key=key, **kwargs) + + def _tokenize(self, item: Union[str, dict], key=lambda x: x, **kwargs): + mode = kwargs.pop("mode", 0) + ret = sif4sci(key(item), figures=self.figures, symbol=self.symbol, mode=mode, + tokenization_params=self.tokenization_params, errors="ignore", **kwargs) + ret = [] if ret is None else ret.tokens + return ret + @@ -213,6 +430,59 @@ GensimWordTokenizer >>> print(token_item.tokens) ['已知', '公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'] +:: + + GensimWordTokenizer定义如下: + class GensimWordTokenizer(object): + """ + + Parameters + ---------- + symbol: str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g.: gm, fgm, gmas, fgmas + general: bool + + True: when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly. + + False: when use 'ast' mothed to tokenize formulas instead of 'linear'. + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gm", general=False): + self.symbol = symbol + if general is True: + self.tokenization_params = { + "formula_params": { + "method": "linear", + "symbolize_figure_formula": True + } + } + else: + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + + def batch_process(self, *items): + pass + + def __call__(self, item): + return sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) @@ -245,6 +515,71 @@ GensimSegTokenizer print(len(token_item), token_item) # 2 [['[TEXT_BEGIN]', '已知', '公式', '[FORMULA_BEGIN]', \FormFigureID{1}, '[TEXT_BEGIN]', '如图', '[FIGURE]', '[FORMULA_BEGIN]', 'mathord', ',', 'mathord', '[TEXT_BEGIN]', '约束条件', '公式', '[FORMULA_BEGIN]', [FORMULA], '[SEP]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'mathord', '+', 'textord', 'mathord', '[TEXT_BEGIN]', '最大值', '[MARK]']] +:: + + GensimSegTokenizer定义如下: + class GensimSegTokenizer(object): # pragma: no cover + """ + + Parameters + ---------- + symbol:str + select the methods to symbolize: + "t": text, + "f": formula, + "g": figure, + "m": question mark, + "a": tag, + "s": sep, + e.g. 
gms, fgm + + depth: int or None + + 0: only separate at \\SIFSep ; + 1: only separate at \\SIFTag ; + 2: separate at \\SIFTag and \\SIFSep ; + otherwise, separate all segments ; + + Returns + ---------- + tokenizer: Tokenizer + + """ + def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs): + self.symbol = symbol + self.tokenization_params = { + "formula_params": { + "method": "ast", + "return_type": "list", + "ord2token": True + } + } + self.kwargs = dict( + add_seg_type=True if depth in {0, 1, 2} else False, + add_seg_mode="head", + depth=depth, + drop="s" if depth not in {0, 1, 2} else "" + ) + self.kwargs.update(kwargs) + self.flatten = flatten + + def __call__(self, item, flatten=None, **kwargs): + flatten = self.flatten if flatten is None else flatten + tl = sif4sci( + item, symbol=self.symbol, tokenization_params=self.tokenization_params, errors="ignore" + ) + if kwargs: + _kwargs = deepcopy(self.kwargs) + _kwargs.update(kwargs) + else: + _kwargs = self.kwargs + if tl: + ret = tl.get_segments(**_kwargs) + if flatten is True: + return it.chain(*ret) + return ret + return tl + 更多示例 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 8d0c3774392ba16b856ca6c358e5eda5db348e4c Mon Sep 17 00:00:00 2001 From: karin0018 Date: Wed, 2 Aug 2023 22:05:37 +0800 Subject: [PATCH 50/58] update tutorial and api references --- EduNLP/I2V/i2v.py | 27 +++- EduNLP/Vector/disenqnet/disenqnet.py | 10 ++ EduNLP/Vector/elmo_vec.py | 20 +++ EduNLP/Vector/gensim_vec.py | 69 +++++++++ EduNLP/Vector/t2v.py | 54 ++++++- docs/README.md | 7 +- docs/source/api/vector.rst | 4 +- docs/source/conf.py | 2 + docs/source/tutorial/zh/pipeline.rst | 8 +- docs/source/tutorial/zh/vectorization.rst | 55 ++----- examples/t2v/t2v_elmo.ipynb | 177 ++++++++++++++++++---- 11 files changed, 355 insertions(+), 78 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 0ac17859..b5316c2b 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -97,19 +97,43 @@ def __call__(self, items, *args, **kwargs): return self.infer_vector(items, *args, **kwargs) def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list: - # """tokenize item""" + """ + tokenize item + + Parameter + ---------- + items: a list of questions + + Return + ---------- + tokens: list + """ return self.tokenizer(items, *args, key=key, **kwargs) def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple: + """ + get question embedding + + NotImplemented + """ raise NotImplementedError def infer_item_vector(self, tokens, *args, **kwargs) -> ...: + """NotImplemented""" return self.infer_vector(tokens, *args, **kwargs)[0] def infer_token_vector(self, tokens, *args, **kwargs) -> ...: + """NotImplemented""" return self.infer_vector(tokens, *args, **kwargs)[1] def save(self, config_path): + """ + save model weights in config_path + + Parameter: + ---------- + config_path: str + """ with open(config_path, "w", encoding="utf-8") as wf: json.dump(self.params, wf, ensure_ascii=False, indent=2) @@ -126,6 +150,7 @@ def load(cls, config_path, *args, **kwargs): @classmethod def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs): + """NotImplemented""" raise NotImplementedError @property diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index 27f6d571..aa21393a 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -44,6 +44,16 @@ def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor: def infer_tokens(self, items: dict, **kwargs) -> 
torch.Tensor: embeded, _, _ = self(items) + """ + get tokens embedding with DisenQModel + + Parameters + ---------- + items: dict, {'content_idx': tensor(),'content_len': tensor()}, the tokens about question items after tokenizer processing + + Returns: + torch.Tensor: token embedding + """ return embeded @property diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index 36ea8254..bd0c2272 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -21,6 +21,16 @@ def __call__(self, items: dict): return outputs def infer_vector(self, items: dict, **kwargs) -> torch.Tensor: + """ + get sentence vector embedding with ElmoModel + + Parameters + ---------- + items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing + + Returns: + torch.Tensor: sentence embedding + """ outputs = self(items) item_embeds = torch.cat( (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1], @@ -29,6 +39,16 @@ def infer_vector(self, items: dict, **kwargs) -> torch.Tensor: return item_embeds def infer_tokens(self, items, **kwargs) -> torch.Tensor: + """ + get tokens embedding with ElmoModel + + Parameters + ---------- + items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing + + Returns: + torch.Tensor: token embedding + """ outputs = self(items) forward_hiddens = outputs.forward_output backward_hiddens = outputs.backward_output diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index 7d3f05de..cb40620a 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -66,11 +66,36 @@ def __getitem__(self, item): return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,)) def infer_vector(self, items, agg="mean", **kwargs) -> list: + """ + get sentence embedding with word2vec model + + Parameters + ---------- + item: list, the tokens after tokenizer processing + + Return + ------ + vector: list + [array(), ..., array()] + """ token_vectors = self.infer_tokens(items, **kwargs) # return [eval("np.%s" % agg)(item, axis=0) if item else np.array([]) for item in token_vectors] return [eval("np.%s" % agg)(item, axis=0) if item else np.zeros(self.vector_size,) for item in token_vectors] def infer_tokens(self, items, **kwargs) -> list: + """ + get token embedding with word2vec model + + Parameters + ---------- + item: list + the tokens after tokenizer processing + + Return + ------ + vector: list + [[array(), ..., array()], [...], [...]] + """ return [list(self(*item)) for item in items] @@ -95,6 +120,19 @@ def __init__(self, filepath): self.dictionary = corpora.Dictionary.load(filepath) def infer_vector(self, item, return_vec=False): + """ + get Bow vector + + Parameters + ---------- + item: list + the tokens after tokenizer processing + + Return + ------ + vector: list + [array(), ..., array()] + """ item = self.dictionary.doc2bow(item) if not return_vec: return item # return dic as default @@ -121,6 +159,19 @@ def __init__(self, filepath): self.dictionary = corpora.Dictionary.load(dictionary_path) def infer_vector(self, item, return_vec=False): + """ + get Tf-idf vector + + Parameters + ---------- + item: list + the tokens after tokenizer processing + + Return + ------ + vector: list + [array(), ..., array()] + """ dic_item = self.dictionary.doc2bow(item) tfidf_item = self.tfidf_model[dic_item] # return dic as default @@ -181,7 +232,25 @@ def vector_size(self): return self.d2v.vector_size def 
infer_vector(self, items, *args, **kwargs) -> list: + """ + get vector with D2V model + + Parameters + ---------- + item: list + the tokens after tokenizer processing + + Return + ------ + vector: list + [array(), ..., array()] + """ return [self(item) for item in items] def infer_tokens(self, item, *args, **kwargs) -> ...: + """ + get token embeddings with D2V + + NotImplemented + """ raise NotImplementedError diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index 5a386666..debbb803 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -49,8 +49,10 @@ class T2V(object): ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] >>> model_dir = "examples/test_model/d2v" >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256') - >>> (); path = get_data(url, model_dir); () # doctest: +ELLIPSIS - (...) + >>> path = get_data(url, model_dir); # doctest: +ELLIPSIS + downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/doc2vec_pub/1/d2v_test_256.zip is saved as examples\test_model\d2v\d2v_test_256.zip + Downloading examples\test_model\d2v\d2v_test_256.zip 100.00%: 4.73MB | 4.73MB + downloader, INFO examples\test_model\d2v\d2v_test_256.zip is unzip to examples\test_model\d2v\d2v_test_256 >>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True) >>> t2v = T2V('d2v',filepath=path) >>> print(t2v(item)) @@ -69,9 +71,28 @@ def __call__(self, items, *args, **kwargs): return self.i2v.infer_vector(items, *args, **kwargs) def infer_vector(self, items, *args, **kwargs): + """ + get question embedding with T2V + + Parameters + ---------- + items:list + a list of question + + Returns + ------- + vector:list + numpy.ndarray([dtype=float32)] + + """ return self.i2v.infer_vector(items, *args, **kwargs) def infer_tokens(self, items, *args, **kwargs): + """ + get token embeddings with T2V + + NotImplemented + """ return self.i2v.infer_tokens(items, *args, **kwargs) @property @@ -80,6 +101,27 @@ def vector_size(self) -> int: def get_pretrained_model_info(name): + """ + get the pretrained model information with the given name + + Parameters + ---------- + name:str + select the pretrained model + e.g.: + d2v_math_300 + w2v_math_300 + elmo_math_2048 + bert_math_768 + bert_taledu_768 + disenq_math_256 + quesnet_math_512 + + Returns + -------- + list: [model url (where to download), model name] + + """ url = MODELHUB_URL + 'getPretrainedModel' param = {'name': name} r = requests.get(url, params=param) @@ -89,6 +131,14 @@ def get_pretrained_model_info(name): def get_all_pretrained_models(): + """ + get all pretrained models' name + + Returns + ------- + the pretrained models' name:list + e.g.['bert_bio_ptc', 'bert_geo_ptc', 'bert_math_768', ... 
] + """ url = MODELHUB_URL + 'getPretrainedModelList' r = requests.get(url) assert r.status_code == 200, r.status_code diff --git a/docs/README.md b/docs/README.md index 8d4db8b8..33778258 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,26 +3,31 @@ EduNLP document and tutorial folder Requirements ------------ + See the requirements `docs_deps` in `setup.py`: + ```sh pip install -e .[doc] ``` - Build documents --------------- + First, clean up existing files: + ``` make clean ``` Then build: + ``` make html ``` Render locally -------------- + ``` cd build/html python3 -m http.server 8000 diff --git a/docs/source/api/vector.rst b/docs/source/api/vector.rst index f8bb56de..b960e97a 100644 --- a/docs/source/api/vector.rst +++ b/docs/source/api/vector.rst @@ -10,13 +10,13 @@ EduNLP.Vector.t2v EduNLP.Vector.disenqnet --------------------- +------------------------- .. automodule:: EduNLP.Vector.disenqnet.disenqnet :members: EduNLP.Vector.quesnet --------------------- +------------------------- .. automodule:: EduNLP.Vector.quesnet.quesnet :members: diff --git a/docs/source/conf.py b/docs/source/conf.py index de5bfe16..92e95a92 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -114,3 +114,5 @@ def copy_tree(src, tar): 'undoc-members': True, } autodoc_member_order = 'bysource' + +nbsphinx_allow_errors = True diff --git a/docs/source/tutorial/zh/pipeline.rst b/docs/source/tutorial/zh/pipeline.rst index 6dca611e..31253e11 100644 --- a/docs/source/tutorial/zh/pipeline.rst +++ b/docs/source/tutorial/zh/pipeline.rst @@ -2,9 +2,7 @@ 流水线 ======= -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: pipleine_gallery - :glob: +.. nbinfo:: + notebook: - 流水线 <../../build/blitz/pipeline/pipeline.ipynb> + `流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_ diff --git a/docs/source/tutorial/zh/vectorization.rst b/docs/source/tutorial/zh/vectorization.rst index 9349b91e..1a597e73 100644 --- a/docs/source/tutorial/zh/vectorization.rst +++ b/docs/source/tutorial/zh/vectorization.rst @@ -133,29 +133,18 @@ I2V 向量化容器 更多I2V容器使用示例 ------------------------------------ - -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: i2v_gallery1 - :glob: - - W2V向量化 <../../build/blitz/i2v/i2v_w2v.ipynb> - - D2V向量化 <../../build/blitz/i2v/i2v_d2v.ipynb> - Elmo向量化 <../../build/blitz/i2v/i2v_elmo.ipynb> +`W2V向量化 <../../build/blitz/i2v/i2v_w2v.ipynb>`_ +`D2V向量化 <../../build/blitz/i2v/i2v_d2v.ipynb>`_ -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: i2v_gallery2 - :glob: - - Bert向量化 <../../build/blitz/i2v/i2v_bert.ipynb> - - DisenQNet向量化 <../../build/blitz/i2v/i2v_disenq.ipynb> - - QuesNet向量化 <../../build/blitz/i2v/i2v_quesnet.ipynb> +`Elmo向量化 <../../build/blitz/i2v/i2v_elmo.ipynb>`_ + +`Bert向量化 <../../build/blitz/i2v/i2v_bert.ipynb>`_ + +`DisenQNet向量化 <../../build/blitz/i2v/i2v_disenq.ipynb>`_ + +`QuesNet向量化 <../../build/blitz/i2v/i2v_quesnet.ipynb>`_ @@ -246,11 +235,7 @@ T2V 向量化容器 # 或 # t2v = W2V(path) -<<<<<<< HEAD item_vector = t2v.infer_vector(token_items) -======= - tem_vector = t2v.infer_vector(token_items) ->>>>>>> upstream/dev # [array(), ..., array()] token_vector = t2v.infer_tokens(token_items) # [[array(), ..., array()], [...], [...]] @@ -264,25 +249,15 @@ T2V 向量化容器 更多T2V容器使用示例 ------------------------------------ -.. 
nbgallery:: - :caption: This is a thumbnail gallery: - :name: t2v_gallery1 - :glob: - - W2V向量化 <../../build/blitz/t2v/t2v_w2v.ipynb> - D2V向量化 <../../build/blitz/t2v/t2v_d2v.ipynb> +`W2V向量化 <../../build/blitz/t2v/t2v_w2v.ipynb>`_ - Elmo向量化 <../../build/blitz/t2v/t2v_elmo.ipynb> +`D2V向量化 <../../build/blitz/t2v/t2v_d2v.ipynb>`_ +`Elmo向量化 <../../build/blitz/t2v/t2v_elmo.ipynb>`_ -.. nbgallery:: - :caption: This is a thumbnail gallery: - :name: t2v_gallery2 - :glob: - - Bert向量化 <../../build/blitz/t2v/t2v_bert.ipynb> +`Bert向量化 <../../build/blitz/t2v/t2v_bert.ipynb>`_ - DisenQNet向量化 <../../build/blitz/t2v/t2v_disenq.ipynb> +`DisenQNet向量化 <../../build/blitz/t2v/t2v_disenq.ipynb>`_ - QuesNet向量化 <../../build/blitz/t2v/t2v_quesnet.ipynb> +`QuesNet向量化 <../../build/blitz/t2v/t2v_quesnet.ipynb>`_ diff --git a/examples/t2v/t2v_elmo.ipynb b/examples/t2v/t2v_elmo.ipynb index c754b4d1..561a7128 100644 --- a/examples/t2v/t2v_elmo.ipynb +++ b/examples/t2v/t2v_elmo.ipynb @@ -12,16 +12,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "d:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\gensim\\similarities\\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n", - " warnings.warn(msg)\n" - ] - } - ], + "outputs": [], "source": [ "from EduNLP.Pretrain import ElmoTokenizer\n", "from EduNLP.Vector import T2V, ElmoModel\n", @@ -38,7 +29,7 @@ "BASE_DIR = \"../..\"\n", "\n", "data_dir = f\"{BASE_DIR}/static/test_data\"\n", - "output_dir = f\"{BASE_DIR}/examples/test_model/elmo\"" + "output_dir = f\"{BASE_DIR}/examples/test_model/elmo/elmo_768\"" ] }, { @@ -50,23 +41,29 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "([527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], 17)\n", + "{'seq_idx': tensor([ 804, 19, 6, 69, 26, 66, 1381, 804, 9, 254, 27, 69,\n", + " 70, 246, 66, 239, 7]), 'seq_len': tensor(17)}\n", "\n", - "([[527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], [7, 104, 13, 15, 16, 17, 18, 34, 79, 15, 16, 17, 18, 19, 105, 13, 10, 23, 106, 107, 104, 108, 109, 110, 111]], [17, 25])\n", + "{'seq_idx': tensor([[ 804, 19, 6, 69, 26, 66, 1381, 804, 9, 254, 27, 69,\n", + " 70, 246, 66, 239, 7, 0, 0, 0, 0, 0, 0, 0,\n", + " 0],\n", + " [ 64, 477, 69, 96, 81, 55, 82, 70, 66, 96, 81, 55,\n", + " 82, 71, 467, 69, 27, 78, 844, 77, 477, 1312, 865, 519,\n", + " 118]]), 'seq_len': tensor([17, 25])}\n", "\n" ] } ], "source": [ "# 加载之前训练的模型tokenizer\n", - "tokenizer = ElmoTokenizer(os.path.join(output_dir, \"vocab.json\"))\n", + "tokenizer = ElmoTokenizer(os.path.join(output_dir, \"vocab.txt\"))\n", "\n", "# 对题目文本进行令牌化\n", "items = [\n", @@ -83,7 +80,8 @@ "print(tokenizer(items, freeze_vocab=True))\n", "print()\n", "\n", - "token_items, lengths = tokenizer(items, pad_to_max_length=True)" + "token_items = tokenizer(items, pad_to_max_length=True)\n", + "_, lengths = token_items" ] }, { @@ -95,17 +93,143 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at 
../../examples/test_model/elmo/elmo_768.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "torch.Size([2, 512])\n", + "ElmoLMOutput([('pred_forward', tensor([[[-307.3449, -307.3120, -307.3644, ..., -310.1035, -307.8653,\n", + " -305.8521],\n", + " [-278.2352, -278.3191, -278.3070, ..., -278.2227, -277.5887,\n", + " -277.1101],\n", + " [-363.3187, -363.3951, -363.4167, ..., -365.0335, -361.3292,\n", + " -363.0343],\n", + " ...,\n", + " [-283.5177, -283.5760, -283.6111, ..., -284.5733, -282.6731,\n", + " -283.2103],\n", + " [-248.1853, -248.3669, -248.3075, ..., -248.6257, -247.6015,\n", + " -247.8452],\n", + " [-241.4586, -241.4421, -241.4153, ..., -240.6708, -240.4943,\n", + " -240.6182]],\n", "\n", - "torch.Size([2, 512])\n", - "torch.Size([2, 25, 512])\n", + " [[-334.8899, -334.8294, -334.9643, ..., -334.1731, -335.4581,\n", + " -334.7304],\n", + " [-355.3142, -355.3451, -355.4356, ..., -356.5914, -352.9772,\n", + " -354.9101],\n", + " [-407.1169, -406.9889, -407.2259, ..., -411.4367, -405.8418,\n", + " -407.2929],\n", + " ...,\n", + " [-330.3282, -330.3368, -330.3389, ..., -332.7447, -331.5250,\n", + " -329.7366],\n", + " [-283.1005, -283.1692, -283.1745, ..., -283.0395, -283.1997,\n", + " -282.8568],\n", + " [-235.8705, -235.7804, -235.8927, ..., -235.1325, -235.2520,\n", + " -235.5008]]], grad_fn=)), ('pred_backward', tensor([[[-179.5577, -179.5323, -179.4542, ..., -180.1722, -178.3642,\n", + " -178.2475],\n", + " [-344.9785, -344.9221, -345.0316, ..., -349.3186, -344.6387,\n", + " -344.3976],\n", + " [-311.8071, -311.6877, -311.8076, ..., -315.7125, -312.6975,\n", + " -311.0973],\n", + " ...,\n", + " [-164.1926, -164.2107, -164.1506, ..., -164.0445, -162.9915,\n", + " -163.1473],\n", + " [-161.2745, -161.2983, -161.2401, ..., -161.1260, -160.0812,\n", + " -160.0892],\n", + " [-201.1299, -201.1749, -201.0949, ..., -200.5798, -200.0364,\n", + " -200.2121]],\n", + "\n", + " [[-173.3366, -173.3426, -173.2081, ..., -174.0968, -172.0103,\n", + " -171.8691],\n", + " [-310.7520, -310.6165, -310.7965, ..., -313.6465, -312.9819,\n", + " -309.7775],\n", + " [-297.4435, -297.3187, -297.5530, ..., -300.7402, -298.0805,\n", + " -296.7505],\n", + " ...,\n", + " [-365.5618, -365.3937, -365.5269, ..., -369.6968, -367.3105,\n", + " -365.9371],\n", + " [-360.7122, -360.6502, -360.6949, ..., -365.4681, -362.4482,\n", + " -358.9928],\n", + " [-328.2362, -328.2809, -328.2350, ..., -331.3160, -328.4486,\n", + " -327.7178]]], grad_fn=)), ('forward_output', tensor([[[ 3.6525e-03, 2.2160e-02, 6.8173e-05, ..., -4.7844e-03,\n", + " 9.3836e-03, -4.3491e-01],\n", + " [ 1.8052e-02, 8.7361e-04, 4.8162e-01, ..., -5.3586e-03,\n", + " 6.6052e-02, -1.4424e-02],\n", + " [ 9.5443e-02, 1.6271e-02, 7.0382e-01, ..., -8.3553e-03,\n", + " 1.2897e-02, -3.1771e-02],\n", + " ...,\n", + " [ 5.2170e-03, 1.0105e-02, 2.6849e-01, ..., -3.0296e-03,\n", + " 1.4682e-01, -4.0381e-01],\n", + " [ 8.6507e-03, 1.2562e-02, 9.4650e-01, ..., -1.0862e-03,\n", + " 8.5297e-01, -2.2572e-01],\n", + " [ 3.1620e-02, 1.4642e-01, 1.0650e-01, ..., -1.8507e-01,\n", + " 3.6865e-04, -2.8440e-01]],\n", + "\n", + " [[ 5.3534e-01, 2.1329e-02, 2.8222e-02, ..., -7.0496e-02,\n", + " 6.7711e-02, -3.1365e-03],\n", + " [ 5.6558e-02, 7.0860e-03, 4.0042e-01, ..., -1.3037e-02,\n", + " 2.2477e-02, -2.0711e-02],\n", + " [ 2.1927e-02, 4.2798e-01, 9.1026e-01, ..., -1.8426e-01,\n", + " 6.0737e-03, 
-1.7819e-01],\n", + " ...,\n", + " [ 3.7156e-02, 2.2477e-02, 6.9470e-01, ..., -1.1230e-02,\n", + " 1.1101e-02, -2.4664e-01],\n", + " [ 1.4176e-02, 1.6747e-02, 7.8785e-02, ..., -1.8862e-02,\n", + " 8.9409e-03, -6.1224e-01],\n", + " [ 7.9827e-02, 3.7614e-02, 2.8973e-01, ..., -9.7911e-02,\n", + " 2.5626e-04, -1.0576e-01]]], grad_fn=)), ('backward_output', tensor([[[ 1.6726e-02, 1.6917e-02, 1.2608e-02, ..., -1.1215e-02,\n", + " -7.0637e-01, -2.7572e-01],\n", + " [ 5.7128e-03, 4.2807e-02, 3.8698e-02, ..., -1.0857e-03,\n", + " -7.1516e-01, -9.0976e-02],\n", + " [ 3.5365e-03, 2.8559e-02, 2.2622e-05, ..., -3.7339e-03,\n", + " 7.1600e-01, -5.7635e-01],\n", + " ...,\n", + " [ 4.6398e-02, 4.9136e-02, 1.2801e-02, ..., -1.9671e-02,\n", + " -9.6720e-03, -9.6724e-02],\n", + " [ 4.0719e-02, 4.4131e-02, 1.2812e-02, ..., -1.8923e-02,\n", + " -3.2772e-03, -8.6033e-02],\n", + " [ 4.6603e-02, 6.7548e-02, 3.4405e-02, ..., -2.9001e-02,\n", + " -2.7431e-03, -1.3146e-01]],\n", + "\n", + " [[ 2.9590e-02, 3.2132e-02, 1.8678e-02, ..., -2.1314e-02,\n", + " -6.8087e-01, -6.1092e-02],\n", + " [ 1.8839e-03, 1.5880e-02, 4.3065e-03, ..., -3.6674e-03,\n", + " 7.4532e-01, -3.9937e-02],\n", + " [ 2.6377e-01, 6.6237e-03, 7.1997e-02, ..., -2.2735e-02,\n", + " 3.1916e-01, -1.1802e-02],\n", + " ...,\n", + " [ 6.5437e-01, 8.1936e-02, 8.6420e-01, ..., -4.1453e-02,\n", + " -6.4451e-01, -1.3480e-01],\n", + " [ 1.7608e-01, 2.3962e-01, 8.7436e-01, ..., -1.5172e-01,\n", + " -7.5390e-01, -2.2155e-01],\n", + " [ 8.7120e-02, 1.5658e-01, 7.4500e-01, ..., -1.3044e-02,\n", + " -7.5951e-01, -3.6064e-01]]], grad_fn=))])\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "f:\\bdaa\\edunlp\\EduNLP\\Vector\\elmo_vec.py:36: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", + " (outputs.forward_output[torch.arange(len(items[\"seq_len\"])), torch.tensor(items[\"seq_len\"]) - 1],\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 768])\n", + "torch.Size([2, 25, 768])\n", "\n" ] } @@ -115,21 +239,21 @@ "\n", "# # 获得句表征\n", "i_vec = t2v(token_items)\n", - "print(i_vec.shape)\n", + "print(i_vec)\n", "print()\n", "\n", "# 获得句表征和词表征\n", "i_vec = t2v.infer_vector(token_items, lengths=lengths)\n", "t_vec = t2v.infer_tokens(token_items, lengths=lengths)\n", - "print(i_vec.shape)\n", - "print(t_vec.shape)\n", + "print(i_vec.size())\n", + "print(t_vec.size())\n", "print()" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.13 ('nlp')", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -143,9 +267,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.12" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "cc3e3b0a667322a868bdd200d76d82ed50310f7037715f6f0bc4c373c1c03ce5" @@ -153,5 +276,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From fc760d7c2cf0784abe8aa92f95740ed7fd11e43e Mon Sep 17 00:00:00 2001 From: karin0018 Date: Wed, 2 Aug 2023 22:40:20 +0800 Subject: [PATCH 51/58] fix the flake8 bug --- EduNLP/I2V/i2v.py | 4 +--- EduNLP/Tokenizer/tokenizer.py | 1 - EduNLP/Vector/disenqnet/disenqnet.py | 3 ++- EduNLP/Vector/t2v.py | 18 +++++------------- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index b5316c2b..72dd5ce7 100644 --- 
a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -3,15 +3,13 @@ import torch import json -import os.path from typing import List, Tuple from EduNLP.constant import MODEL_DIR from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model from ..Vector import get_pretrained_model_info, get_all_pretrained_models from longling import path_append -from EduData import get_data from ..Tokenizer import Tokenizer, get_tokenizer -from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer, Question +from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer from EduNLP import logger __all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "DisenQ", "QuesNet", "get_pretrained_i2v"] diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py index d154fff4..bb95fa15 100644 --- a/EduNLP/Tokenizer/tokenizer.py +++ b/EduNLP/Tokenizer/tokenizer.py @@ -1,7 +1,6 @@ # coding: utf-8 # 2021/8/1 @ tongshiwei -from pickle import NONE from typing import Iterable, Union from ..SIF.segment import seg from ..SIF.tokenization import tokenize diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index aa21393a..de2f993c 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -49,7 +49,8 @@ def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor: Parameters ---------- - items: dict, {'content_idx': tensor(),'content_len': tensor()}, the tokens about question items after tokenizer processing + items: dict + {'content_idx': tensor(),'content_len': tensor()}, the tokens about question after tokenizer processing Returns: torch.Tensor: token embedding diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index debbb803..0018e2d6 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -45,14 +45,13 @@ class T2V(object): Examples -------- - >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ - ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] + >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$, \ + ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$ + \\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] >>> model_dir = "examples/test_model/d2v" >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256') >>> path = get_data(url, model_dir); # doctest: +ELLIPSIS - downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/doc2vec_pub/1/d2v_test_256.zip is saved as examples\test_model\d2v\d2v_test_256.zip - Downloading examples\test_model\d2v\d2v_test_256.zip 100.00%: 4.73MB | 4.73MB - downloader, INFO examples\test_model\d2v\d2v_test_256.zip is unzip to examples\test_model\d2v\d2v_test_256 + downloader, INFO ... 
>>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True) >>> t2v = T2V('d2v',filepath=path) >>> print(t2v(item)) @@ -73,24 +72,20 @@ def __call__(self, items, *args, **kwargs): def infer_vector(self, items, *args, **kwargs): """ get question embedding with T2V - Parameters ---------- items:list - a list of question - + a list of question Returns ------- vector:list numpy.ndarray([dtype=float32)] - """ return self.i2v.infer_vector(items, *args, **kwargs) def infer_tokens(self, items, *args, **kwargs): """ get token embeddings with T2V - NotImplemented """ return self.i2v.infer_tokens(items, *args, **kwargs) @@ -103,7 +98,6 @@ def vector_size(self) -> int: def get_pretrained_model_info(name): """ get the pretrained model information with the given name - Parameters ---------- name:str @@ -116,11 +110,9 @@ def get_pretrained_model_info(name): bert_taledu_768 disenq_math_256 quesnet_math_512 - Returns -------- list: [model url (where to download), model name] - """ url = MODELHUB_URL + 'getPretrainedModel' param = {'name': name} From 362df0bfa6050f7b889a94218218b3dd6c7548f2 Mon Sep 17 00:00:00 2001 From: karin0018 Date: Wed, 2 Aug 2023 22:52:07 +0800 Subject: [PATCH 52/58] fix the flake8 bug --- EduNLP/I2V/i2v.py | 4 ---- EduNLP/Vector/disenqnet/disenqnet.py | 1 - EduNLP/Vector/elmo_vec.py | 2 -- EduNLP/Vector/gensim_vec.py | 19 ++++--------------- 4 files changed, 4 insertions(+), 22 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 72dd5ce7..aa5086a0 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -97,11 +97,9 @@ def __call__(self, items, *args, **kwargs): def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list: """ tokenize item - Parameter ---------- items: a list of questions - Return ---------- tokens: list @@ -111,7 +109,6 @@ def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list: def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple: """ get question embedding - NotImplemented """ raise NotImplementedError @@ -127,7 +124,6 @@ def infer_token_vector(self, tokens, *args, **kwargs) -> ...: def save(self, config_path): """ save model weights in config_path - Parameter: ---------- config_path: str diff --git a/EduNLP/Vector/disenqnet/disenqnet.py b/EduNLP/Vector/disenqnet/disenqnet.py index de2f993c..61cd0d30 100644 --- a/EduNLP/Vector/disenqnet/disenqnet.py +++ b/EduNLP/Vector/disenqnet/disenqnet.py @@ -46,7 +46,6 @@ def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor: embeded, _, _ = self(items) """ get tokens embedding with DisenQModel - Parameters ---------- items: dict diff --git a/EduNLP/Vector/elmo_vec.py b/EduNLP/Vector/elmo_vec.py index bd0c2272..053ffad7 100644 --- a/EduNLP/Vector/elmo_vec.py +++ b/EduNLP/Vector/elmo_vec.py @@ -23,7 +23,6 @@ def __call__(self, items: dict): def infer_vector(self, items: dict, **kwargs) -> torch.Tensor: """ get sentence vector embedding with ElmoModel - Parameters ---------- items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing @@ -41,7 +40,6 @@ def infer_vector(self, items: dict, **kwargs) -> torch.Tensor: def infer_tokens(self, items, **kwargs) -> torch.Tensor: """ get tokens embedding with ElmoModel - Parameters ---------- items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing diff --git a/EduNLP/Vector/gensim_vec.py b/EduNLP/Vector/gensim_vec.py index cb40620a..cc8ae9df 100644 --- a/EduNLP/Vector/gensim_vec.py +++ b/EduNLP/Vector/gensim_vec.py @@ -68,11 +68,9 @@ 
def __getitem__(self, item): def infer_vector(self, items, agg="mean", **kwargs) -> list: """ get sentence embedding with word2vec model - Parameters ---------- item: list, the tokens after tokenizer processing - Return ------ vector: list @@ -85,15 +83,13 @@ def infer_vector(self, items, agg="mean", **kwargs) -> list: def infer_tokens(self, items, **kwargs) -> list: """ get token embedding with word2vec model - Parameters ---------- item: list the tokens after tokenizer processing - Return ------ - vector: list + vector: list [[array(), ..., array()], [...], [...]] """ return [list(self(*item)) for item in items] @@ -122,17 +118,15 @@ def __init__(self, filepath): def infer_vector(self, item, return_vec=False): """ get Bow vector - Parameters ---------- item: list the tokens after tokenizer processing - Return ------ vector: list [array(), ..., array()] - """ + """ item = self.dictionary.doc2bow(item) if not return_vec: return item # return dic as default @@ -161,15 +155,13 @@ def __init__(self, filepath): def infer_vector(self, item, return_vec=False): """ get Tf-idf vector - Parameters ---------- item: list the tokens after tokenizer processing - Return ------ - vector: list + vector: list [array(), ..., array()] """ dic_item = self.dictionary.doc2bow(item) @@ -234,12 +226,10 @@ def vector_size(self): def infer_vector(self, items, *args, **kwargs) -> list: """ get vector with D2V model - Parameters ---------- - item: list + item: list the tokens after tokenizer processing - Return ------ vector: list @@ -250,7 +240,6 @@ def infer_vector(self, items, *args, **kwargs) -> list: def infer_tokens(self, item, *args, **kwargs) -> ...: """ get token embeddings with D2V - NotImplemented """ raise NotImplementedError From 8f103a13db09c79d7a02138e9d9fd970e42f8382 Mon Sep 17 00:00:00 2001 From: karin0018 Date: Wed, 2 Aug 2023 23:01:40 +0800 Subject: [PATCH 53/58] fix the examples bug --- EduNLP/I2V/i2v.py | 3 ++- EduNLP/Vector/t2v.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index aa5086a0..01b67496 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -8,8 +8,9 @@ from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model from ..Vector import get_pretrained_model_info, get_all_pretrained_models from longling import path_append +from EduData import get_data from ..Tokenizer import Tokenizer, get_tokenizer -from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer +from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, DisenQTokenizer, QuesNetTokenizer, Question from EduNLP import logger __all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "DisenQ", "QuesNet", "get_pretrained_i2v"] diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index 0018e2d6..a5269f05 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -46,8 +46,7 @@ class T2V(object): Examples -------- >>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$, \ - ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$ - \\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] + ... 
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] >>> model_dir = "examples/test_model/d2v" >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256') >>> path = get_data(url, model_dir); # doctest: +ELLIPSIS From c840248b3c1a1e0748207538c9fb07e36f3c6ce4 Mon Sep 17 00:00:00 2001 From: karin0018 Date: Wed, 2 Aug 2023 23:16:52 +0800 Subject: [PATCH 54/58] fix bug --- EduNLP/I2V/i2v.py | 1 + EduNLP/Tokenizer/tokenizer.py | 1 + EduNLP/Vector/t2v.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/EduNLP/I2V/i2v.py b/EduNLP/I2V/i2v.py index 01b67496..975300c3 100644 --- a/EduNLP/I2V/i2v.py +++ b/EduNLP/I2V/i2v.py @@ -3,6 +3,7 @@ import torch import json +import os.path from typing import List, Tuple from EduNLP.constant import MODEL_DIR from ..Vector import T2V, get_pretrained_t2v as get_t2v_pretrained_model diff --git a/EduNLP/Tokenizer/tokenizer.py b/EduNLP/Tokenizer/tokenizer.py index bb95fa15..d154fff4 100644 --- a/EduNLP/Tokenizer/tokenizer.py +++ b/EduNLP/Tokenizer/tokenizer.py @@ -1,6 +1,7 @@ # coding: utf-8 # 2021/8/1 @ tongshiwei +from pickle import NONE from typing import Iterable, Union from ..SIF.segment import seg from ..SIF.tokenization import tokenize diff --git a/EduNLP/Vector/t2v.py b/EduNLP/Vector/t2v.py index a5269f05..c02486d3 100644 --- a/EduNLP/Vector/t2v.py +++ b/EduNLP/Vector/t2v.py @@ -49,8 +49,8 @@ class T2V(object): ... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}] >>> model_dir = "examples/test_model/d2v" >>> url, model_name, *args = get_pretrained_model_info('d2v_test_256') - >>> path = get_data(url, model_dir); # doctest: +ELLIPSIS - downloader, INFO ... + >>> (); path = get_data(url, model_dir); () # doctest: +ELLIPSIS + (...) 
>>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True) >>> t2v = T2V('d2v',filepath=path) >>> print(t2v(item)) From 06461fd786e09f27c33070b67fe1c9af5530168b Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Fri, 4 Aug 2023 08:48:59 +0800 Subject: [PATCH 55/58] [fix] update some codes for quesnet --- EduNLP/Vector/quesnet/quesnet.py | 4 ++-- examples/pretrain/quesnet.ipynb | 11 +++++++++-- tests/test_pretrain/test_pretrained_quesnet.py | 11 +++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/EduNLP/Vector/quesnet/quesnet.py b/EduNLP/Vector/quesnet/quesnet.py index 36371f1b..b8728c82 100644 --- a/EduNLP/Vector/quesnet/quesnet.py +++ b/EduNLP/Vector/quesnet/quesnet.py @@ -6,7 +6,7 @@ class QuesNetModel(Vector): - def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): + def __init__(self, pretrained_dir, device="cpu", **kwargs): """ Parameters ---------- @@ -18,7 +18,7 @@ def __init__(self, pretrained_dir, img_dir=None, device="cpu", **kwargs): image dir """ self.device = torch.device(device) - self.model = QuesNet.from_pretrained(pretrained_dir, img_dir=img_dir).to(self.device) + self.model = QuesNet.from_pretrained(pretrained_dir).to(self.device) self.model.eval() def __call__(self, items: dict): diff --git a/examples/pretrain/quesnet.ipynb b/examples/pretrain/quesnet.ipynb index c5469c50..2b0a405f 100644 --- a/examples/pretrain/quesnet.ipynb +++ b/examples/pretrain/quesnet.ipynb @@ -138,8 +138,15 @@ "}\n", "\n", "# 当前仅支持linux下训练\n", - "# pretrain_quesnet(os.path.join(os.path.abspath(data_dir), 'quesnet_data.json'),\n", - "# output_dir, tokenizer, True, train_params)" + "pretrain_quesnet(\n", + " path=os.path.join(os.path.abspath(data_dir),'quesnet_data.json'),\n", + " output_dir=output_dir,\n", + " tokenizer=tokenizer,\n", + " img_dir=None,\n", + " save_embs=True,\n", + " load_embs=False,\n", + " train_params=train_params\n", + ")" ] }, { diff --git a/tests/test_pretrain/test_pretrained_quesnet.py b/tests/test_pretrain/test_pretrained_quesnet.py index 2bf86c34..92dd204f 100644 --- a/tests/test_pretrain/test_pretrained_quesnet.py +++ b/tests/test_pretrain/test_pretrained_quesnet.py @@ -68,10 +68,11 @@ def test_tokenizer(self, standard_luna_data, pretrained_tokenizer_dir): def test_train_quesnet(self, standard_luna_data, pretrained_model_dir): test_items = [ {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\ - 如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,\ - 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}, + 如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}, {'ques_content': '如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$, \ - 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'} + 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$', + "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], + "ques_figure_paths": ["../../static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png"]} ] ques_file = path_append(abs_current_dir(__file__), @@ -139,7 +140,9 @@ def test_quesnet_t2v(self, pretrained_model_dir): def test_quesnet_i2v(self, pretrained_model_dir): items = [ {'ques_content': '如图$\\FigureID{000004d6-0479-11ec-829b-797d5eb43535}$, \ - 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'} + 若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$', + "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"], + "ques_figure_paths": 
["../../static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png"]} ] img_dir = path_append(abs_current_dir(__file__), "../../static/test_data/quesnet_img", to_str=True) From ace19601f407a6013964b675d683fbb4a8a1c70b Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Fri, 13 Oct 2023 10:24:52 +0800 Subject: [PATCH 56/58] Fix bugs for quesnet_vec --- EduNLP/Pretrain/quesnet_vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index d6b0ae78..fecbce61 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -157,14 +157,14 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_item = self.tokenize(item, key) token_idx = [] for _, w in enumerate(token_item): - if isinstance(w, FigureSegment) and 'ques_figure_ids' in item.keys(): + if isinstance(w, FigureSegment) and isinstance(item, dict) and 'ques_figure_ids' in item.keys(): # image try: fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) - if self.img_dir != "": + if self.img_dir != None: fig_src = os.path.join(self.img_dir, fig_id) if '.png' in item['ques_figure_paths'][fig_index]: fig_src += '.png' From d65242c5603e623d9d8df9a55fa7cee9bc059e4e Mon Sep 17 00:00:00 2001 From: wintermelon008 <1904346407@qq.com> Date: Fri, 13 Oct 2023 11:12:02 +0800 Subject: [PATCH 57/58] Update --- EduNLP/Pretrain/quesnet_vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index fecbce61..a331157b 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -164,7 +164,7 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) - if self.img_dir != None: + if self.img_dir is not None: fig_src = os.path.join(self.img_dir, fig_id) if '.png' in item['ques_figure_paths'][fig_index]: fig_src += '.png' From 9a3f5f7a690ef56893e05ec62dff36adab155d97 Mon Sep 17 00:00:00 2001 From: KenelmQLH <1097824882@qq.com> Date: Sat, 13 Jan 2024 14:33:00 +0800 Subject: [PATCH 58/58] Update change.txt --- CHANGE.txt | 6 ++++++ EduNLP/ModelZoo/quesnet/quesnet.py | 1 + EduNLP/Pretrain/quesnet_vec.py | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGE.txt b/CHANGE.txt index 17e33497..cf09733d 100644 --- a/CHANGE.txt +++ b/CHANGE.txt @@ -1,3 +1,9 @@ +v1.0.0 + 1. Support cuda for I2V and T2V. + 2. Add demos for downstream tasks including knowledge & difficulty & discrimination prediction, similarity prediction and paper segmentation. + 3. Refactor quesnet for pretrain and vectorization. + 4. Update documents about tutorials and API. + v0.0.9 1. Refactor tokenizer Basic Tokenizer and Pretrained Tokenizer 2. 
Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet diff --git a/EduNLP/ModelZoo/quesnet/quesnet.py b/EduNLP/ModelZoo/quesnet/quesnet.py index a103645e..73b836e0 100644 --- a/EduNLP/ModelZoo/quesnet/quesnet.py +++ b/EduNLP/ModelZoo/quesnet/quesnet.py @@ -80,6 +80,7 @@ def __init__(self, _stoi=None, meta='know_name', pretrained_embs: np.ndarray = N self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "kwargs"]} # self.config.update(kwargs) self.config["architecture"] = 'quesnet' + self.config["hidden_size"] = self.hidden_size = feat_size self.config = PretrainedConfig.from_dict(self.config) def init_h(self, batch_size): diff --git a/EduNLP/Pretrain/quesnet_vec.py b/EduNLP/Pretrain/quesnet_vec.py index d6b0ae78..a331157b 100644 --- a/EduNLP/Pretrain/quesnet_vec.py +++ b/EduNLP/Pretrain/quesnet_vec.py @@ -157,14 +157,14 @@ def _convert_to_ids(self, item: Union[str, dict, list], key=lambda x: x, token_item = self.tokenize(item, key) token_idx = [] for _, w in enumerate(token_item): - if isinstance(w, FigureSegment) and 'ques_figure_ids' in item.keys(): + if isinstance(w, FigureSegment) and isinstance(item, dict) and 'ques_figure_ids' in item.keys(): # image try: fig_id = f"{w.src[10:-1]}" fig_index = item['ques_figure_ids'].index(fig_id) - if self.img_dir != "": + if self.img_dir is not None: fig_src = os.path.join(self.img_dir, fig_id) if '.png' in item['ques_figure_paths'][fig_index]: fig_src += '.png'
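Taken together, patches 55-58 leave `QuesNetTokenizer._convert_to_ids` resolving a `FigureSegment` to an image file roughly as sketched below. This is a standalone illustration under stated assumptions, not the library's code: the fallback used when `img_dir` is unset and the handling of non-PNG extensions are not visible in the hunks above and are guessed here; the `item` layout follows the test data in PATCH 55.

::

    import os

    def resolve_figure_path(item: dict, fig_id: str, img_dir=None) -> str:
        # position of this figure among the item's declared figure ids
        fig_index = item['ques_figure_ids'].index(fig_id)
        if img_dir is not None:
            # rebuild the path under img_dir, mirroring the recorded extension
            fig_src = os.path.join(img_dir, fig_id)
            if '.png' in item['ques_figure_paths'][fig_index]:
                fig_src += '.png'
            else:
                # assumption: reuse whatever extension the recorded path carries
                fig_src += os.path.splitext(item['ques_figure_paths'][fig_index])[1]
            return fig_src
        # assumption: with no img_dir, fall back to the recorded path itself
        return item['ques_figure_paths'][fig_index]

    item = {
        "ques_figure_ids": ["000004d6-0479-11ec-829b-797d5eb43535"],
        "ques_figure_paths": ["../../static/test_data/quesnet_img/000004d6-0479-11ec-829b-797d5eb43535.png"],
    }
    print(resolve_figure_path(item, "000004d6-0479-11ec-829b-797d5eb43535"))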