diff --git a/paddle2.0_docs/text_classification_with_transformer/text_classification_with_transformer.ipynb b/paddle2.0_docs/text_classification_with_transformer/text_classification_with_transformer.ipynb new file mode 100644 index 00000000..4878b164 --- /dev/null +++ b/paddle2.0_docs/text_classification_with_transformer/text_classification_with_transformer.ipynb @@ -0,0 +1,489 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "# 通过Transformer实现文本分类\n", + "作者:[YinHang2515](https://github.com/YinHang2515)\n", + "\n", + "日期:2020年11月20日\n", + "\n", + "本示例教程演示如何使用Transformer模型在IMDB数据集上完成文本分类的任务。\n", + "\n", + "IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集,共有25000条文本数据作为训练集,25000条文本数据作为测试集。 该数据集的官方地址为: http://ai.stanford.edu/~amaas/data/sentiment/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 环境设置" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import paddle as pd\n", + "import paddle.nn as nn\n", + "import paddle.nn.functional as func\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 处理数据集\n", + "首先通过paddle内置的dataset完成数据集的导入,并构建字典和相应的reader\n", + "\n", + "然后通过padding的方式对同一个batch中长度不一致的数据进行补齐" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading IMDB word dict....\n" + ] + } + ], + "source": [ + "print(\"Loading IMDB word dict....\")\r\n", + "word_dict = pd.dataset.imdb.word_dict()\r\n", + "\r\n", + "train_reader = pd.dataset.imdb.train(word_dict)\r\n", + "test_reader = pd.dataset.imdb.test(word_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the:0\n", + "and:1\n", + "a:2\n", + "of:3\n", + "to:4\n", + "...\n", + "virtual:5143\n", + "warriors:5144\n", + "widely:5145\n", + ":5146\n", + ":5147\n", + "totally 5148 words\n" + ] + } + ], + "source": [ + "# 添加\r\n", + "word_dict[''] = len(word_dict)\r\n", + "\r\n", + "for k in list(word_dict)[:5]:\r\n", + " print(\"{}:{}\".format(k.decode('ASCII'), word_dict[k]))\r\n", + "\r\n", + "print(\"...\")\r\n", + "\r\n", + "for k in list(word_dict)[-5:]:\r\n", + " print(\"{}:{}\".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))\r\n", + "\r\n", + "print(\"totally {} words\".format(len(word_dict)))\r\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 参数设置" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vocab_size = len(word_dict) \r\n", + "maxlen = 200 \r\n", + "seq_len = 200\r\n", + "batch_size = 128\r\n", + "epochs = 2\r\n", + "pad_id = word_dict['']\r\n", + "embed_dim = 32 # Embedding size for each token\r\n", + "num_heads = 2 # Number of attention heads\r\n", + "feed_dim = 32 # Hidden layer size in feed forward network inside transformer\r\n", + "\r\n", + "classes = ['positive', 'negative']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(25000, 200)\n", + "(25000, 1)\n", + "(25000, 200)\n", + "(25000, 1)\n" + ] + } + ], + "source": [ + "#用padding的方式对齐数据\r\n", + "def create_padded_dataset(reader):\r\n", + " padded_sents = []\r\n", + " labels = []\r\n", + " for batch_id, data in enumerate(reader):\r\n", + " sent, label = data\r\n", + " padded_sent = sent[:seq_len] + [pad_id] * (seq_len - len(sent))\r\n", + " padded_sents.append(padded_sent)\r\n", + " labels.append(label)\r\n", + " return np.array(padded_sents), np.expand_dims(np.array(labels), axis=1)\r\n", + "\r\n", + "train_sents, train_labels = create_padded_dataset(train_reader())\r\n", + "test_sents, test_labels = create_padded_dataset(test_reader())\r\n", + "\r\n", + "print(train_sents.shape)\r\n", + "print(train_labels.shape)\r\n", + "print(test_sents.shape)\r\n", + "print(test_labels.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 用Dataset 与 DataLoader 加载" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "\r\n", + "class IMDBDataset(pd.io.Dataset):\r\n", + " def __init__(self, sents, labels):\r\n", + "\r\n", + " self.sents = sents\r\n", + " self.labels = labels\r\n", + " \r\n", + " def __getitem__(self, index):\r\n", + "\r\n", + " data = self.sents[index]\r\n", + " label = self.labels[index]\r\n", + "\r\n", + " return data, label\r\n", + "\r\n", + " def __len__(self):\r\n", + " \r\n", + " return len(self.sents)\r\n", + " \r\n", + "train_dataset = IMDBDataset(train_sents, train_labels)\r\n", + "test_dataset = IMDBDataset(test_sents, test_labels)\r\n", + "\r\n", + "train_loader = pd.io.DataLoader(train_dataset, places=pd.CPUPlace(), return_list=True,\r\n", + " shuffle=True, batch_size=batch_size, drop_last=True)\r\n", + "test_loader = pd.io.DataLoader(test_dataset, places=pd.CPUPlace(), return_list=True,\r\n", + " shuffle=True, batch_size=batch_size, drop_last=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 定义嵌入层" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class TokenAndPositionEmbedding(nn.Layer):\r\n", + " def __init__(self, maxlen, vocab_size, embed_dim):\r\n", + " super(TokenAndPositionEmbedding, self).__init__()\r\n", + " self.token_emb = nn.Embedding(vocab_size, embed_dim)\r\n", + " self.pos_emb = nn.Embedding(maxlen, embed_dim)\r\n", + "\r\n", + " def forward(self, x):\r\n", + " maxlen = pd.shape(x)[-1]\r\n", + " positions = pd.arange(start=0, end=maxlen, step=1, dtype='int64')\r\n", + " positions = self.pos_emb(positions)\r\n", + " x = self.token_emb(x)\r\n", + " return x + positions\r\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 组建网络" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class MyNet(nn.Layer):\r\n", + " def __init__(self):\r\n", + " super(MyNet, self).__init__()\r\n", + " self.emb = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)\r\n", + " self.trs = nn.TransformerEncoderLayer(embed_dim, num_heads, feed_dim, normalize_before=True)\r\n", + " self.drop1 = nn.Dropout(0.1)\r\n", + " self.linear1 = nn.Linear(feed_dim, 20)\r\n", + " self.drop2 = nn.Dropout(0.1)\r\n", + " self.linear2 = nn.Linear(20, 2)\r\n", + "\r\n", + " def forward(self, x):\r\n", + " x = self.emb(x)\r\n", + " x = self.trs(x)\r\n", + " x = pd.mean(x, axis=1)\r\n", + " x = self.drop1(x)\r\n", + " x = self.linear1(x)\r\n", + " x = func.relu(x)\r\n", + " x = self.drop2(x)\r\n", + " x = self.linear2(x)\r\n", + " x = func.softmax(x)\r\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 训练模型" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The loss value printed in the log is the current step, and the metric is the average value of previous step.\n", + "Epoch 1/2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/distributed/parallel.py:119: UserWarning: Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything.\n", + " \"Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything.\"\n", + "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:77: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working\n", + " return (isinstance(seq, collections.Sequence) and\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "step 195/195 [==============================] - loss: 0.5317 - 447ms/step \n", + "Eval begin...\n", + "The loss value printed in the log is the current batch, and the metric is the average value of previous step.\n", + "step 195/195 [==============================] - loss: 0.4641 - 96ms/step \n", + "Eval samples: 24960\n", + "Epoch 2/2\n", + "step 195/195 [==============================] - loss: 0.3965 - 449ms/step \n", + "Eval begin...\n", + "The loss value printed in the log is the current batch, and the metric is the average value of previous step.\n", + "step 195/195 [==============================] - loss: 0.4130 - 96ms/step \n", + "Eval samples: 24960\n" + ] + } + ], + "source": [ + "#使用高层API进行训练\r\n", + "model = pd.Model(MyNet()) # 用 Model封装 MyNet\r\n", + "\r\n", + "# 模型配置\r\n", + "model.prepare(optimizer=pd.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),\r\n", + " loss=nn.CrossEntropyLoss())\r\n", + "\r\n", + "#模型训练\r\n", + "model.fit(train_loader,\r\n", + " test_loader,\r\n", + " epochs=epochs,\r\n", + " batch_size=batch_size,\r\n", + " verbose=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## 模型验证" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predict begin...\n", + "step 195/195 [==============================] - 96ms/step \n", + "Predict samples: 24960\n" + ] + } + ], + "source": [ + "#使用model.predict对测试集进行测试\r\n", + "result = model.predict(test_loader, batch_size=64)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#5345 in beautiful water colors at the cliff is definitely a sight to vaguely the trailer witch i didnt find that impressive i was surprised at how beautiful and detailed it was this film just over me with its br at the center is a young boy that comes into contact with a sea creature and its their relationship that carries the movie is a master both at creating memorable imagery and showing young ones in a believable way with their little br there are a few parts that didnt sit well with me it would be an to say that the music during a particular scene ride of the its a shame because such a precious film as this cant afford to take and it hurt a otherwise truly great scene the and its never interested me either but i guess it served more as a background then anything br anyway great film the boat trip scenario with all its imagery and stood out me thinks pure and magical and yes is more intimate then computer animated im really \n", + "Lable: positive\n", + "Predict: positive\n", + "#19983 what garbage is there actually no part ii if this movie actually ends the way it did everyone involved with this movie should be ashamed this movie is nothing close to a movie like which was actually a fairly decent movie this movie was rushed in hopes of being able to gain a few easy dollars i started watching the movie under the that it would be bad but i never imagined it would be this bad this movie was nothing more then a way to the fears of the american people for if you have not seen this movie dont bother movie is full of poorly developed characters and poor acting i really hope the production of this movie was only a couple of weeks stay away dont be fooled by the hype \n", + "Lable: negative\n", + "Predict: positive\n", + "#11231 i read the comments about this movie before watching it and wasnt expecting much but as a b movie its a curiosity i guess you have to be a certain type of person to enjoy this kind of thing but i seriously thought it was awesome people should watch it just for the experience because its totally one of a kind dont expect much of it the acting is poor were obviously done on a etc definitely a b movie but nonetheless really worth checking out very funny mild gore and some questionable themes so probably not something to show your but for the more viewer id say its a must see \n", + "Lable: positive\n", + "Predict: negative\n", + "#18904 i did not like this movie i rented it hoping it would be something like the kingdom i was disappointed when i discovered it wasnt i also found it just plain the acting was bad the characters where unbelievable and the time jumps were crazy i only this film if your in the mood to see a crazy dude running around but im sure there are better films with the same thing i cant believe i wasted my time on this one \n", + "Lable: negative\n", + "Predict: negative\n", + "#6208 despite later claims this melodrama has very little in common with citizen kane its a of a ruthless but human fictional told in flashback but around time the shows none of his later gift for dialog and none of the cinematic of kane are evident still its very watchable with a young tracy his makeup makes him look just like well an old tracy showing depth and authority and moore a little past her prime and not physically well playing a theres also helen as one of the most in movie history the final third into the surviving print is and has missing audio and there are some plot holes left open how would she know whose son it was if shes sleeping with both of them and the music is awfully hokey for all that i was quite fascinated \n", + "Lable: positive\n", + "Predict: positive\n", + "#17908 how can you go wrong with a film that the of peter the of donald and the of peters easy you give them a terrible script to work withbr br peter in an evil role leads a cult that has captured a couple friends of donald donald plays a determined but priest that desires to his friends but needs the help of fred from to them he the help of a new york who dresses just like the hero of the cartoon in his task meanwhile the girlfriend of one of the missing people peters joins the search if all this sounds interesting then you like myself were br it doesnt take long for the trio to out the but their methods of bringing the bad guys to justice are also women will find this a every time they get on the bad guys trail our two male heroes tell to stay at the hotel even though she can handle herself better than the priest played by br story the script really lets us down \n", + "Lable: negative\n", + "Predict: positive\n", + "#10135 delivers great acting and greater special effects stars david one of my personal favorites as its special effects on the monsters were so good you thought they might be really however a perfect 10 on my list \n", + "Lable: positive\n", + "Predict: positive\n", + "#21323 i nearly fell asleep during a screening of this of a boring story that seems to go on forever it follows several days in the life of a male prostitute who falls in love with one of his tricks after a affair the trick leaves a long letter explaining why they cannot be together and how they must go on their separate br the male prostitute then goes on a trying to find his one true love repeatedly returning to the same places they looking for more clues or signs as to where he may his br in the meanwhile he up with one ugly guy who i thought was also a male prostitute a gay and some guy who ends up having a sexual with him in a back br it never to me how films still portray random sex acts as scenes that can take place in a brief matter of seconds such as in this case where the trick barely has his pants before three times and br br all of these random encounters end with \n", + "Lable: negative\n", + "Predict: positive\n", + "#9215 countless tv displays and the memorable appearances from 4 of todays hope screen debut keep still in the plot is about a research of five ones crazy idea on discovering the secret of death and learn whats after death then come back to life again yet the storyline hasnt been designed as fascinating as the idea of the plotbr br there are popular stereotypes to develop a regular script in there is nelson who creates the idea of death pretty but rachel david who cuts the on luckily not to be from the school ladies man joe and finally the smart guy did not come to medical school to murder my class no matter how deranged they might be they join hands altogether in an experiment where heart will be stopped and then they decide to continue this experiment in at night times in the not long after experience everyone starts a race over having the and the death experience their lives one by one yet soon they realize their daily life becomes affected \n", + "Lable: positive\n", + "Predict: positive\n", + "#3325 this film blew me away i thought i knew a little about the prison after watching this i see i knew nothing the story is told through the relationship between the and the black both the personal story of these two men and the drama were riveting the flashback sequences in the prison were awesome its hard to believe it wasnt documentary footage it was so real it was not only a great piece of drama it was an incredible lesson in an important chapter in american history im with and i give it two thumbs up \n", + "Lable: positive\n", + "Predict: positive\n" + ] + } + ], + "source": [ + "#随机选择文本打印并观察结果\r\n", + "label_pred = np.array([])\r\n", + "for label in result[0]:\r\n", + " label_pred = np.append(label_pred, label.reshape(-1))\r\n", + "for i in range(10):\r\n", + " words = \"\"\r\n", + " idx = np.random.randint(int(len(label_pred)/2))\r\n", + " for k in test_sents[idx]:\r\n", + " word = list(word_dict)[k]\r\n", + " if not isinstance(word, str):\r\n", + " word = word.decode('ASCII')\r\n", + " if word != '' and word != '':\r\n", + " words += (word + \" \")\r\n", + " print(\"#\" + str(idx) + \" \" + words)\r\n", + " print(\"Lable: \" + classes[test_labels[idx][0]])\r\n", + " if label_pred[2 * idx] > label_pred[2 * idx + 1]:\r\n", + " print(\"Predict: \" + classes[0])\r\n", + " else:\r\n", + " print(\"Predict: \" + classes[1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PaddlePaddle 2.0.0b0 (Python 3.5)", + "language": "python", + "name": "py35-paddle1.2.0" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}