diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000000..2c8f906aba7851 --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,11 @@ +version: 2 +jobs: + build: + working_directory: ~/pytorch-pretrained-BERT + docker: + - image: circleci/python:3.7 + steps: + - checkout + - run: sudo pip install --progress-bar off . + - run: sudo pip install pytest + - run: python -m pytest -sv tests/ diff --git a/Likunlin_final/Likunlin_final/__init__.py b/Likunlin_final/Likunlin_final/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/Likunlin_final/Likunlin_final/settings.py b/Likunlin_final/Likunlin_final/settings.py new file mode 100644 index 00000000000000..e83c872d517c5d --- /dev/null +++ b/Likunlin_final/Likunlin_final/settings.py @@ -0,0 +1,121 @@ +""" +Django settings for Likunlin_final project. + +Generated by 'django-admin startproject' using Django 2.2. + +For more information on this file, see +https://docs.djangoproject.com/en/2.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/2.2/ref/settings/ +""" + +import os + +# Build paths inside the project like this: os.path.join(BASE_DIR, ...) +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/2.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = '7lu!q_nf9z&+*@3(ty!djsexs2($8@wx3^*oro@as!z0p4id&(' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = ['192.168.53.8'] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'analyse_text', +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'Likunlin_final.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'Likunlin_final.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/2.2/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), + } +} + + +# Password validation +# https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/2.2/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + 
+TIME_ZONE = 'UTC' + +USE_I18N = True + +USE_L10N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/2.2/howto/static-files/ + +STATIC_URL = '/static/' diff --git a/Likunlin_final/Likunlin_final/urls.py b/Likunlin_final/Likunlin_final/urls.py new file mode 100644 index 00000000000000..86ae55cb41e380 --- /dev/null +++ b/Likunlin_final/Likunlin_final/urls.py @@ -0,0 +1,27 @@ +"""Likunlin_final URL Configuration + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/2.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path + +from analyse_text import views as analyse_views + + +urlpatterns = [ + path('admin/', admin.site.urls), + path('',analyse_views.home, name='home'), + path('modify/',analyse_views.modify), + path('analyse/',analyse_views.analyse), +] diff --git a/Likunlin_final/Likunlin_final/wsgi.py b/Likunlin_final/Likunlin_final/wsgi.py new file mode 100644 index 00000000000000..31220c1f430b80 --- /dev/null +++ b/Likunlin_final/Likunlin_final/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for Likunlin_final project. + +It exposes the WSGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Likunlin_final.settings') + +application = get_wsgi_application() diff --git a/Likunlin_final/analyse_text/Untitled.ipynb b/Likunlin_final/analyse_text/Untitled.ipynb new file mode 100644 index 00000000000000..33cb6d6eb744ba --- /dev/null +++ b/Likunlin_final/analyse_text/Untitled.ipynb @@ -0,0 +1,64 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from django.shortcuts import render\n", + "# -*- coding: utf-8 -*-\n", + "from django.shortcuts import render\n", + "from django.http import HttpResponse\n", + "import json\n", + "\n", + "tokens = []\n", + "suggestions = {}\n", + "def home(request):\n", + " return render(request, 'home.html')\n", + "\n", + "\n", + "def analyse(request):\n", + " global tokens\n", + " global suggestions\n", + " text = \"\"\n", + " text = request.GET['text']\n", + " tokens = text.split()\n", + " tokens = ['[CLS]', 'it', 'was', 'monday', 'morning', ',', 'and', 'the', 'writeing', 'class', 'had', 'just', 'begun', '.', 'we', 'were', 'ti', '##ring', '.', 'everyone', 'was', 'silent', ',', 'wait', 'to', 'see', 'who', 'would', 'be', 'called', 'upon', 'to', 'read', 'his', 'and', 'her', 'paragraph', 'aloud', '.', 'some', 'of', 'us', 'were', 'confidont', 'and', 'eagerly', 'take', 'part', 'in', 'the', 'class', 'activity', ',', 'others', 'were', 'nervous', 'and', 'anxious', '.', 'i', 'had', 'done', 'myself', 'homework', 'but', 'i', 'was', 'shy', '.', 'i', 'was', 'afraid', 'that', 'to', 'speak', 'in', 'front', 'of', 'a', 'larger', 'group', 'of', 'people', '.', 'at', 'that', 'moment', ',', 'i', 'remembered', 'that', 'my', 'father', 'once', 'said', ',', '\"', 'the', 'classroom', 'is', 'a', 'place', 'for', 'learning', 'and', 'that', 'include', 'leaning', 
'from', 'textbooks', ',', 'and', 'mistake', 'as', 'well', '.', '\"', 'immediate', ',', 'i', 'raised', 'my', 'hand', '.', '[SEP]']\n", + " suggestions = {8: 'writing', 43: 'confident', 23: 'waiting', 34: 'or', 45: 'would', 46: 'taking', 51: 'activities', 62: 'my', 72: '去掉 that', 105: 'to', 106: 'includes', 107: 'learning', 108: 'on', 112: 'mistakes', 117: 'immediately'}\n", + " return HttpResponse(json.dumps({\"tokens\":tokens,\"suggestions\":suggestions}))\n", + "\n", + "def modify(request):\n", + " global tokens\n", + " global suggestions\n", + " index = request.GET['index']\n", + " tokens[int(index)] = suggestions[int(index)]\n", + " print(\"检查点\")\n", + " del suggestions[int(index)]\n", + " print(suggestions)\n", + " return HttpResponse(json.dumps({\"tokens\":tokens,\"suggestions\":suggestions}))\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Likunlin_final/analyse_text/__init__.py b/Likunlin_final/analyse_text/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/Likunlin_final/analyse_text/admin.py b/Likunlin_final/analyse_text/admin.py new file mode 100644 index 00000000000000..8c38f3f3dad51e --- /dev/null +++ b/Likunlin_final/analyse_text/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/Likunlin_final/analyse_text/apps.py b/Likunlin_final/analyse_text/apps.py new file mode 100644 index 00000000000000..83adc60b11fbce --- /dev/null +++ b/Likunlin_final/analyse_text/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class AnalyseTextConfig(AppConfig): + name = 'analyse_text' diff --git a/Likunlin_final/analyse_text/migrations/__init__.py b/Likunlin_final/analyse_text/migrations/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/Likunlin_final/analyse_text/models.py b/Likunlin_final/analyse_text/models.py new file mode 100644 index 00000000000000..71a836239075aa --- /dev/null +++ b/Likunlin_final/analyse_text/models.py @@ -0,0 +1,3 @@ +from django.db import models + +# Create your models here. diff --git a/Likunlin_final/analyse_text/templates/home.html b/Likunlin_final/analyse_text/templates/home.html new file mode 100644 index 00000000000000..dcc5b3bc4223da --- /dev/null +++ b/Likunlin_final/analyse_text/templates/home.html @@ -0,0 +1,307 @@ + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+

评价区

+
+
+ + +
+
+
+
+
+ +
+ +
+ +
+
+ +
+ +
+ +
+
+ + + + + + \ No newline at end of file diff --git a/Likunlin_final/analyse_text/tests.py b/Likunlin_final/analyse_text/tests.py new file mode 100644 index 00000000000000..7ce503c2dd97ba --- /dev/null +++ b/Likunlin_final/analyse_text/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/Likunlin_final/analyse_text/views.py b/Likunlin_final/analyse_text/views.py new file mode 100644 index 00000000000000..e40c6ca8d467f3 --- /dev/null +++ b/Likunlin_final/analyse_text/views.py @@ -0,0 +1,27 @@ +from django.shortcuts import render +# -*- coding: utf-8 -*- +from django.shortcuts import render +from django.http import HttpResponse +import json +import sys +sys.path =['/home/xd/projects/pytorch-pretrained-BERT'] + sys.path +from likunlin_final import analyze_text,modify_text + +text = [] +def home(request): + return render(request, 'home.html') + + +def analyse(request): + global text + text = request.GET['text'] + text = [text] + print("xiaofang") + suggestions,tokens,avg_gap = analyze_text(text) + return HttpResponse(json.dumps({"tokens":tokens,"suggestions":suggestions,"avg_gap":avg_gap})) + +def modify(request): + global text + index = request.GET['index'] + text,new_tokens,suggestions = modify_text(int(index),text) + return HttpResponse(json.dumps({"tokens":new_tokens,"suggestions":suggestions})) diff --git a/Likunlin_final/manage.py b/Likunlin_final/manage.py new file mode 100755 index 00000000000000..30c456de702310 --- /dev/null +++ b/Likunlin_final/manage.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Likunlin_final.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? 
Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000000000..1aba38f67a2211 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include LICENSE diff --git a/README.md b/README.md index eb337d8253f465..4e7d3bb1090bb4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # PyTorch Pretrained Bert +[![CircleCI](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg)](https://circleci.com/gh/huggingface/pytorch-pretrained-BERT) + This repository contains an op-for-op PyTorch reimplementation of [Google's TensorFlow repository for the BERT model](https://github.com/google-research/bert) that was released together with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. This implementation is provided with [Google's pre-trained models](https://github.com/google-research/bert), examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT is also provided. 
@@ -14,12 +16,12 @@ This implementation is provided with [Google's pre-trained models](https://githu | [Doc](#doc) | Detailed documentation | | [Examples](#examples) | Detailed examples on how to fine-tune Bert | | [Notebooks](#notebooks) | Introduction on the provided Jupyter Notebooks | -| [TPU](#tup) | Notes on TPU support and pretraining scripts | +| [TPU](#tpu) | Notes on TPU support and pretraining scripts | | [Command-line interface](#Command-line-interface) | Convert a TensorFlow checkpoint in a PyTorch dump | ## Installation -This repo was tested on Python 3.5+ and PyTorch 0.4.1 +This repo was tested on Python 3.5+ and PyTorch 0.4.1/1.0.0 ### With pip @@ -46,13 +48,15 @@ python -m pytest -sv tests/ This package comprises the following classes that can be imported in Python and are detailed in the [Doc](#doc) section of this readme: -- Six PyTorch models (`torch.nn.Module`) for Bert with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file): - - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L535) - raw BERT Transformer model (**fully pre-trained**), - - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L689) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**), - - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L750) - BERT Transformer with the pre-trained next sentence prediction classifier on top (**fully pre-trained**), - - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L618) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**), - - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L812) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**), - - 
[`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L877) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**). +- Eight PyTorch models (`torch.nn.Module`) for Bert with pre-trained weights (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file): + - [`BertModel`](./pytorch_pretrained_bert/modeling.py#L537) - raw BERT Transformer model (**fully pre-trained**), + - [`BertForMaskedLM`](./pytorch_pretrained_bert/modeling.py#L691) - BERT Transformer with the pre-trained masked language modeling head on top (**fully pre-trained**), + - [`BertForNextSentencePrediction`](./pytorch_pretrained_bert/modeling.py#L752) - BERT Transformer with the pre-trained next sentence prediction classifier on top (**fully pre-trained**), + - [`BertForPreTraining`](./pytorch_pretrained_bert/modeling.py#L620) - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (**fully pre-trained**), + - [`BertForSequenceClassification`](./pytorch_pretrained_bert/modeling.py#L814) - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**, the sequence classification head **is only initialized and has to be trained**), + - [`BertForMultipleChoice`](./pytorch_pretrained_bert/modeling.py#L880) - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**, the multiple choice classification head **is only initialized and has to be trained**), + - [`BertForTokenClassification`](./pytorch_pretrained_bert/modeling.py#L949) - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**), + - [`BertForQuestionAnswering`](./pytorch_pretrained_bert/modeling.py#L1015) - BERT Transformer with a token classification head on 
top (BERT Transformer is **pre-trained**, the token classification head **is only initialized and has to be trained**). - Three tokenizers (in the [`tokenization.py`](./pytorch_pretrained_bert/tokenization.py) file): - `BasicTokenizer` - basic tokenization (punctuation splitting, lower casing, etc.), @@ -63,15 +67,17 @@ This package comprises the following classes that can be imported in Python and - `BertAdam` - Bert version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate. - A configuration class (in the [`modeling.py`](./pytorch_pretrained_bert/modeling.py) file): - - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilisities to read and write from JSON configuration files. + - `BertConfig` - Configuration class to store the configuration of a `BertModel` with utilities to read and write from JSON configuration files. The repository further comprises: -- Three examples on how to use Bert (in the [`examples` folder](./examples)): +- Five examples on how to use Bert (in the [`examples` folder](./examples)): - [`extract_features.py`](./examples/extract_features.py) - Show how to extract hidden states from an instance of `BertModel`, - [`run_classifier.py`](./examples/run_classifier.py) - Show how to fine-tune an instance of `BertForSequenceClassification` on GLUE's MRPC task, - [`run_squad.py`](./examples/run_squad.py) - Show how to fine-tune an instance of `BertForQuestionAnswering` on SQuAD v1.0 task. - + - [`run_swag.py`](./examples/run_swag.py) - Show how to fine-tune an instance of `BertForMultipleChoice` on Swag task. + - [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) - Show how to fine-tune an instance of `BertForPreTraining` on a target text corpus. + These examples are detailed in the [Examples](#examples) section of this readme. 
- Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the [`notebooks` folder](./notebooks)): @@ -153,7 +159,7 @@ Here is a detailed documentation of the classes in the package and how to use th | Sub-section | Description | |-|-| | [Loading Google AI's pre-trained weigths](#Loading-Google-AIs-pre-trained-weigths-and-PyTorch-dump) | How to load Google AI's pre-trained weight or a PyTorch saved instance | -| [PyTorch models](#PyTorch-models) | API of the six PyTorch model classes: `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification` or `BertForQuestionAnswering` | +| [PyTorch models](#PyTorch-models) | API of the eight PyTorch model classes: `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForMultipleChoice`, `BertForTokenClassification` or `BertForQuestionAnswering` | | [Tokenizer: `BertTokenizer`](#Tokenizer-BertTokenizer) | API of the `BertTokenizer` class| | [Optimizer: `BertAdam`](#Optimizer-BertAdam) | API of the `BertAdam` class | @@ -162,12 +168,12 @@ Here is a detailed documentation of the classes in the package and how to use th To load one of Google AI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated as ```python -model = BERT_CLASS.from_pretrain(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None) +model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None) ``` where -- `BERT_CLASS` is either the `BertTokenizer` class (to load the vocabulary) or one of the six PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification` or `BertForQuestionAnswering`, and +- `BERT_CLASS` is either the `BertTokenizer` class (to load the vocabulary) or one of 
the eight PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice` or `BertForQuestionAnswering`, and - `PRE_TRAINED_MODEL_NAME_OR_PATH` is either: - the shortcut name of a Google AI's pre-trained model selected in the list: @@ -175,19 +181,26 @@ where - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters - - `bert-base-multilingual`: 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters + - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters + - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters + - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters - a path or url to a pretrained model archive containing: - - - `bert_config.json` a configuration file for the model, and - - `pytorch_model.bin` a PyTorch dump of a pre-trained instance `BertForPreTraining` (saved with the usual `torch.save()`) + + - `bert_config.json` a configuration file for the model, and + - `pytorch_model.bin` a PyTorch dump of a pre-trained instance `BertForPreTraining` (saved with the usual `torch.save()`) If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_pretrained_bert/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_pretrained_bert/`). 
-- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information) +- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information). + +`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository. + +**When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to `BertTokenizer` if you're using your own script and loading the tokenizer yourself).** Example: ```python +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) model = BertForSequenceClassification.from_pretrained('bert-base-uncased') ``` @@ -200,8 +213,8 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased') The inputs and output are **identical to the TensorFlow model inputs and outputs**. We detail them here. 
This model takes as *inputs*: - -- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts `extract_features.py`, `run_classifier.py` and `run_squad.py`), and +[`modeling.py`](./pytorch_pretrained_bert/modeling.py) +- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)), and - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [0, 1]. It's a mask to be used if some input sequence lengths are smaller than the max input sequence length of the current batch. It's the mask that we typically use for attention when a batch has varying length sentences. - `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. @@ -215,7 +228,7 @@ This model *outputs* a tuple composed of: - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the input (`CLF`) to train on the Next-Sentence task (see BERT's paper). -An example on how to use this class is given in the `extract_features.py` script which can be used to extract the hidden states of the model for a given input. 
+An example on how to use this class is given in the [`extract_features.py`](./examples/extract_features.py) script which can be used to extract the hidden states of the model for a given input. #### 2. `BertForPreTraining` @@ -236,6 +249,9 @@ An example on how to use this class is given in the `extract_features.py` script - the masked language modeling logits, and - the next sentence classification logits. + +An example on how to use this class is given in the [`run_lm_finetuning.py`](./examples/run_lm_finetuning.py) script which can be used to fine-tune the BERT language model on your own target text corpus. This should improve model performance if the language style of that corpus differs from the original BERT training corpus (Wiki + BookCorpus). + #### 3. `BertForMaskedLM` @@ -269,15 +285,31 @@ An example on how to use this class is given in the `extract_features.py` script The sequence-level classifier is a linear layer that takes as input the last hidden state of the first character in the input sequence (see Figures 3a and 3b in the BERT paper). -An example on how to use this class is given in the `run_classifier.py` script which can be used to fine-tune a single sequence (or pair of sequence) classifier using BERT, for example for the MRPC task. +An example on how to use this class is given in the [`run_classifier.py`](./examples/run_classifier.py) script which can be used to fine-tune a single sequence (or pair of sequences) classifier using BERT, for example for the MRPC task. + +#### 6. `BertForMultipleChoice` + +`BertForMultipleChoice` is a fine-tuning model that includes `BertModel` and a linear layer on top of the `BertModel`. + +The linear layer outputs a single value for each choice of a multiple choice problem, then all the outputs corresponding to an instance are passed through a softmax to get the model choice. 
+ +This implementation is largely inspired by the work of OpenAI in [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) and the answer of Jacob Devlin in the following [issue](https://github.com/google-research/bert/issues/38). + +An example on how to use this class is given in the [`run_swag.py`](./examples/run_swag.py) script which can be used to fine-tune a multiple choice classifier using BERT, for example for the Swag task. + +#### 7. `BertForTokenClassification` + +`BertForTokenClassification` is a fine-tuning model that includes `BertModel` and a token-level classifier on top of the `BertModel`. -#### 6. `BertForQuestionAnswering` +The token-level classifier is a linear layer that takes as input the last hidden state of the sequence. + +#### 8. `BertForQuestionAnswering` `BertForQuestionAnswering` is a fine-tuning model that includes `BertModel` with a token-level classifiers on top of the full sequence of last hidden states. The token-level classifier takes as input the full sequence of the last hidden state and compute several (e.g. two) scores for each tokens that can for example respectively be the score that a given token is a `start_span` and a `end_span` token (see Figures 3c and 3d in the BERT paper). -An example on how to use this class is given in the `run_squad.py` script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task. +An example on how to use this class is given in the [`run_squad.py`](./examples/run_squad.py) script which can be used to fine-tune a token classifier using BERT, for example for the SQuAD task. ### Tokenizer: `BertTokenizer` @@ -313,7 +345,7 @@ The optimizer accepts the following arguments: - `b1` : Adams b1. Default : `0.9` - `b2` : Adams b2. Default : `0.999` - `e` : Adams epsilon. Default : `1e-6` -- `weight_decay_rate:` Weight decay. Default : `0.01` +- `weight_decay:` Weight decay. 
Default : `0.01` - `max_grad_norm` : Maximum norm for the gradients (`-1` means no clipping). Default : `1.0` ## Examples @@ -321,22 +353,23 @@ The optimizer accepts the following arguments: | Sub-section | Description | |-|-| | [Training large models: introduction, tools and examples](#Training-large-models-introduction,-tools-and-examples) | How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models | -| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_classif.py`, `run_classifier.py` and `run_squad.py` | +| [Fine-tuning with BERT: running the examples](#Fine-tuning-with-BERT-running-the-examples) | Running the examples in [`./examples`](./examples/): `extract_features.py`, `run_classifier.py`, `run_squad.py` and `run_lm_finetuning.py` | | [Fine-tuning BERT-large on GPUs](#Fine-tuning-BERT-large-on-GPUs) | How to fine tune `BERT large`| ### Training large models: introduction, tools and examples BERT-base and BERT-large are respectively 110M and 340M parameters models and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most case a batch size of 32). -To help with fine-tuning these models, we have included five techniques that you can activate in the fine-tuning scripts `run_classifier.py` and `run_squad.py`: gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training . For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month. 
+To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py): gradient-accumulation, multi-gpu training, distributed training and 16-bits training . For more details on how to use these techniques you can read [the tips on training large batches in PyTorch](https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255) that I published earlier this month. Here is how to use these techniques in our scripts: - **Gradient Accumulation**: Gradient accumulation can be used by supplying a integer greater than 1 to the `--gradient_accumulation_steps` argument. The batch at each step will be divided by this integer and gradient will be accumulated over `gradient_accumulation_steps` steps. - **Multi-GPU**: Multi-GPU is automatically activated when several GPUs are detected and the batches are splitted over the GPUs. - **Distributed training**: Distributed training can be activated by supplying an integer greater or equal to 0 to the `--local_rank` argument (see below). -- **Optimize on CPU**: The Adam optimizer stores 2 moving average of the weights of the model. If you keep them on GPU 1 (typical behavior), your first GPU will have to store 3-times the size of the model. This is not optimal for large models like `BERT-large` and means your batch size is a lot lower than it could be. This option will perform the optimization and store the averages on the CPU/RAM to free more room on the GPU(s). As the most computational intensive operation is usually the backward pass, this doesn't have a significant impact on the training time. Activate this option with `--optimize_on_cpu` on the `run_squad.py` script. 
-- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scaling` flag (see the previously linked documentation for details on loss scaling). If the loss scaling is too high (`Nan` in the gradients) it will be automatically scaled down until the value is acceptable. The default loss scaling is 128 which behaved nicely in our tests. +- **16-bits training**: 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found [here](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) and a full documentation is [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). In our scripts, this option can be activated by setting the `--fp16` flag and you can play with loss scaling using the `--loss_scale` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static. 
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension [as detailed here](https://github.com/nvidia/apex). You will find more information regarding the internals of `apex` and how to use `apex` in [the doc and the associated repository](https://github.com/nvidia/apex). The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in [the relevant PR of the present repository](https://github.com/huggingface/pytorch-pretrained-BERT/pull/116). Note: To use *Distributed Training*, you will need to run one training script on each of your machines. This can be done for example by running the following command on each server (see [the above mentioned blog post]((https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255)) for more details): ```bash @@ -346,16 +379,22 @@ Where `$THIS_MACHINE_INDEX` is an sequential index assigned to each of your mach ### Fine-tuning with BERT: running the examples -We showcase the same examples as [the original implementation](https://github.com/google-research/bert/): fine-tuning a sequence-level classifier on the MRPC classification corpus and a token-level classifier on the question answering dataset SQuAD. +We showcase several fine-tuning examples based on (and extended from) [the original implementation](https://github.com/google-research/bert/): + +- a *sequence-level classifier* on the MRPC classification corpus, +- a *token-level classifier* on the question answering dataset SQuAD, +- a *sequence-level multiple-choice classifier* on the SWAG classification corpus, and +- a *BERT language model* on another target corpus. + +#### MRPC + +This example code fine-tunes BERT on the Microsoft Research Paraphrase +Corpus (MRPC) and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
-Before running these examples you should download the +Before running this example you should download the [GLUE data](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) -and unpack it to some directory `$GLUE_DIR`. Please also download the `BERT-Base` -checkpoint, unzip it to some directory `$BERT_BASE_DIR`, and convert it to its PyTorch version as explained in the previous section. - -This example code fine-tunes `BERT-Base` on the Microsoft Research Paraphrase -Corpus (MRPC) corpus and runs in less than 10 minutes on a single K-80. +and unpack it to some directory `$GLUE_DIR`. ```shell export GLUE_DIR=/path/to/glue @@ -364,6 +403,7 @@ python run_classifier.py \ --task_name MRPC \ --do_train \ --do_eval \ + --do_lower_case \ --data_dir $GLUE_DIR/MRPC/ \ --bert_model bert-base-uncased \ --max_seq_length 128 \ @@ -375,7 +415,29 @@ python run_classifier.py \ Our test ran on a few seeds with [the original implementation hyper-parameters](https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks) gave evaluation results between 84% and 88%. -The second example fine-tunes `BERT-Base` on the SQuAD question answering task. +**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!** +First install apex as indicated [here](https://github.com/NVIDIA/apex). +Then run +```shell +export GLUE_DIR=/path/to/glue + +python run_classifier.py \ + --task_name MRPC \ + --do_train \ + --do_eval \ + --do_lower_case \ + --data_dir $GLUE_DIR/MRPC/ \ + --bert_model bert-base-uncased \ + --max_seq_length 128 \ + --train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --output_dir /tmp/mrpc_output/ +``` + +#### SQuAD + +This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. 
The data for SQuAD can be downloaded with the following links and should be saved in a `$SQUAD_DIR` directory. @@ -390,6 +452,7 @@ python run_squad.py \ --bert_model bert-base-uncased \ --do_train \ --do_predict \ + --do_lower_case \ --train_file $SQUAD_DIR/train-v1.1.json \ --predict_file $SQUAD_DIR/dev-v1.1.json \ --train_batch_size 12 \ @@ -405,6 +468,54 @@ Training with the previous hyper-parameters gave us the following results: {"f1": 88.52381567990474, "exact_match": 81.22043519394512} ``` +#### SWAG + +The data for SWAG can be downloaded by cloning the following [repository](https://github.com/rowanz/swagaf). + +```shell +export SWAG_DIR=/path/to/SWAG + +python run_swag.py \ + --bert_model bert-base-uncased \ + --do_train \ + --do_lower_case \ + --do_eval \ + --data_dir $SWAG_DIR/data \ + --train_batch_size 16 \ + --learning_rate 2e-5 \ + --num_train_epochs 3.0 \ + --max_seq_length 80 \ + --output_dir /tmp/swag_output/ \ + --gradient_accumulation_steps 4 +``` + +Training with the previous hyper-parameters on a single GPU gave us the following results: +``` +eval_accuracy = 0.8062081375587323 +eval_loss = 0.5966546792367169 +global_step = 13788 +loss = 0.06423990014260186 +``` + +#### LM Fine-tuning + +The data should be a text file in the same format as [sample_text.txt](./samples/sample_text.txt) (one sentence per line, documents separated by an empty line). +You can download an [example training corpus](https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt) generated from Wikipedia articles and split into ~500k sentences with spaCy.
+Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with `train_batch_size=200` and `max_seq_length=128`: + + +```shell +python run_lm_finetuning.py \ + --bert_model bert-base-cased \ + --do_train \ + --train_file samples/sample_text.txt \ + --output_dir models \ + --num_train_epochs 5.0 \ + --learning_rate 3e-5 \ + --train_batch_size 32 \ + --max_seq_length 128 +``` + ## Fine-tuning BERT-large on GPUs The options we list above allow to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation. @@ -424,6 +535,7 @@ python ./run_squad.py \ --bert_model bert-large-uncased \ --do_train \ --do_predict \ + --do_lower_case \ --train_file $SQUAD_TRAIN \ --predict_file $SQUAD_EVAL \ --learning_rate 3e-5 \ @@ -432,8 +544,7 @@ python ./run_squad.py \ --doc_stride 128 \ --output_dir $OUTPUT_DIR \ --train_batch_size 24 \ - --gradient_accumulation_steps 2 \ - --optimize_on_cpu + --gradient_accumulation_steps 2 ``` If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16). @@ -444,6 +555,7 @@ python ./run_squad.py \ --bert_model bert-large-uncased \ --do_train \ --do_predict \ + --do_lower_case \ --train_file $SQUAD_TRAIN \ --predict_file $SQUAD_EVAL \ --learning_rate 3e-5 \ @@ -479,7 +591,7 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script. 
-This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in `extract_features.py`, `run_classifier.py` and `run_squad.py`). +This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)). You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too. diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 00000000000000..6701ee5f62e8e7 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,1003 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext.
Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "# import seaborn as sns\n", + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06/10/2019 08:14:45 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt\n", + "06/10/2019 08:14:45 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased/\n", + "06/10/2019 08:14:45 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", 
+ " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "# BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "# tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))\n", + "tokenizer = BertTokenizer.from_pretrained('/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt')\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def convert_text_to_examples(text):\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1)\n", + " text_b = m.group(2)\n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a)\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " input_type_ids = []\n", + " if append_special_tokens:\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in 
tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + "\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", + " input_mask = [1] * len(input_ids)\n", + "\n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,\n", + " tokens=tokens,\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " input_type_ids=input_type_ids))\n", + " return features\n", + "\n", + "def copy_and_mask_feature(feature, masked_tokens=None):\n", + " import copy\n", + " tokens = feature.tokens\n", + " masked_positions = [tokens.index(t) for t in masked_tokens if t in tokens] \\\n", + " if masked_tokens is not None else range(len(tokens))\n", + " assert len(masked_positions) > 0\n", + " masked_feature_copies = []\n", + " for masked_pos in masked_positions:\n", + " feature_copy = copy.deepcopy(feature)\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, masked_positions\n", + "\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20):\n", + " def print_pair(token, prob, end_str='', hit_mark=' '):\n", + " if i < firstk:\n", + " # token = token.replace('', '').replace('\\n', '/n')\n", + " print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)\n", + " \n", + " ret = None\n", + " for i in range(len(tokens)):\n", + " ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " print_pair(tokens[i], prob_, end_str='\\t')\n", + " values, indices = probs[i].topk(topk)\n", + " top_pairs = []\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " hit_mark = '*' if ind == ind_ else ' '\n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\\n')\n", + " top_pairs.append((token, prob))\n", + " if tokens[i] == \"[MASK]\":\n", + " ret = top_pairs\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import colored\n", + "from colored import stylize\n", + "\n", + "def show_abnormals(tokens, probs, show_suggestions=False):\n", + " def gap2color(gap):\n", + " if gap <= 5:\n", + " return 'yellow_1'\n", + " elif gap <= 10:\n", + " return 'orange_1'\n", + " else:\n", + " return 'red_1'\n", + " \n", + " def print_token(token, suggestion, gap):\n", + " if gap == 0:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " if show_suggestions and gap > 5:\n", + " print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', 
colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " # print('/' + suggestion, end=' ')\n", + " # print('%.2f' % gap, end=' ')\n", + " \n", + " avg_gap = 0.\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " ind_ = tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " top_prob = probs[i].max().item()\n", + " top_ind = probs[i].argmax().item()\n", + " gap = math.log(top_prob) - math.log(prob_)\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " print_token(tokens[i], suggestion, gap)\n", + " avg_gap += gap\n", + " avg_gap /= (len(tokens) - 2)\n", + " print()\n", + " print(avg_gap)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "\n", + "def analyze_text(text, masked_tokens=None, show_suggestions=False, show_firstk_probs=20):\n", + " if text[0] in analyzed_cache:\n", + " features, mlm_probs = analyzed_cache[text[0]]\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " tokens = features[0].tokens\n", + " else:\n", + " examples = convert_text_to_examples(text)\n", + " features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " assert len(features) == 1\n", + " features, masked_positions = copy_and_mask_feature(features[0], masked_tokens=masked_tokens)\n", + "\n", + " input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n", + " input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n", + " input_ids = input_ids.to(device)\n", + " input_type_ids = input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(input_ids, input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "\n", + " tokens = features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " bsz, seq_len, 
vocab_size = mlm_probs.size()\n", + " assert bsz == len(masked_positions)\n", + " # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)\n", + " # for i in range(seq_len):\n", + " # reduced_mlm_probs[0, i] = mlm_probs[i, i]\n", + " reduced_mlm_probs = torch.Tensor(1, len(masked_positions), vocab_size)\n", + " for i, pos in enumerate(masked_positions):\n", + " reduced_mlm_probs[0, i] = mlm_probs[i, pos]\n", + " mlm_probs = reduced_mlm_probs\n", + " tokens = [tokens[i] for i in masked_positions]\n", + " \n", + " analyzed_cache[text[0]] = (features, mlm_probs)\n", + " \n", + " top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs)\n", + " if not given_mask:\n", + " show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)\n", + " return top_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 3 | . 1 | the 1 | , 1 | ) 1 | \" \n", + " 100 | \" \t*100 | \" 0 | ' 0 | and 0 | so 0 | did \n", + " 100 | is \t*100 | is 0 | was 0 | does 0 | isn 0 | has \n", + " 97 | tom \t* 97 | tom 2 | he 0 | thomas 0 | you 0 | she \n", + " 100 | taller \t*100 | taller 0 | tall 0 | shorter 0 | height 0 | tallest \n", + " 100 | than \t*100 | than 0 | then 0 | as 0 | that 0 | to \n", + " 100 | mary \t*100 | mary 0 | tom 0 | you 0 | barbara 0 | maria \n", + " 100 | ? \t*100 | ? 0 | . 0 | ! 0 | ... 0 | - \n", + " 100 | \" \t*100 | \" 0 | ' 0 | ! 0 | * 0 | ) \n", + " 100 | \" \t*100 | \" 0 | no 0 | ' 0 | oh 0 | that \n", + " 100 | no \t*100 | no 0 | yes 0 | nope 0 | yeah 0 | oh \n", + " 100 | , \t*100 | , 0 | . 0 | ; 0 | - 0 | no \n", + " 0 | [MASK] \t 80 | tom 10 | he 4 | mary 2 | she 1 | thomas \n", + " 100 | is \t*100 | is 0 | was 0 | does 0 | has 0 | no \n", + " 100 | taller \t*100 | taller 0 | shorter 0 | tall 0 | larger 0 | smaller \n", + " 100 | . \t*100 | . 0 | ; 0 | , 0 | ! 
0 | ) \n", + " 100 | \" \t*100 | \" 0 | ' 0 | . 0 | ! 0 | ; \n", + " 0 | [SEP] \t 86 | . 4 | , 3 | he 2 | \" 1 | she \n" + ] + }, + { + "data": { + "text/plain": [ + "[('tom', 0.7961671352386475),\n", + " ('he', 0.09765198826789856),\n", + " ('mary', 0.04068772494792938),\n", + " ('she', 0.022535543888807297),\n", + " ('thomas', 0.0058586327359080315)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\"_ was the greatest physicist who developed theory of relativity.\"]\n", + "text = [\"The trophy doesn't fit into the brown suitcase because the _ is too large.\"] # relational adj\n", + "text = ['\"Is Tom taller than Mary?\" \"No, _ is taller.\"'] # yes/no\n", + "text = [ \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\"] # compare \n", + "text = ['John is taller/shorter than Mary because/although _ is older/younger.'] # causality\n", + "text = [\"Jennifer is older than James . Jennifer younger than Robert . 
_ is the oldest.\"] # transitive inference\n", + "\n", + "analyze_text(text, show_firstk_probs=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def words2heads(attns, tokens, words):\n", + " positions = [tokens.index(word) for word in words]\n", + "\n", + " for layer in range(config.num_hidden_layers):\n", + " for head in range(config.num_attention_heads):\n", + " for pos_indices in [(0, 1), (1, 0)]:\n", + " from_pos, to_pos = positions[pos_indices[0]], positions[pos_indices[1]]\n", + " if attns[layer][head][from_pos].max(0)[1].item() == to_pos:\n", + " print('Layer %d, head %d: %s -> %s' % (layer, head, tokens[from_pos], tokens[to_pos]), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + "\n", + "def head2words(attns, tokens, layer, head):\n", + " for from_pos in range(len(tokens)):\n", + " to_pos = attns[layer][head][from_pos].max(0)[1].item()\n", + " from_word, to_word = tokens[from_pos], tokens[to_pos]\n", + " if from_word.isalpha() and to_word.isalpha():\n", + " print('%s @ %d -> %s @ %d' % (from_word, from_pos, to_word, to_pos), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " \n", + "special_tokens = ['[CLS]', '[SEP]']\n", + "\n", + "def get_salient_heads(attns, tokens, attn_thld=0.5):\n", + " for layer in range(config.num_hidden_layers):\n", + " for head in range(config.num_attention_heads):\n", + " pos_pairs = []\n", + " for from_pos in range(1, len(tokens) - 1): # skip [CLS] and [SEP]\n", + " top_attn, to_pos = attns[layer][head][from_pos].max(0)\n", + " top_attn, to_pos = top_attn.item(), to_pos.item()\n", + " from_word, to_word = tokens[from_pos], tokens[to_pos]\n", + "# if from_word.isalpha() and to_word.isalpha() and top_attn >= attn_thld:\n", + " if abs(from_pos - to_pos) <= 1:\n", + "# print('Layer %d, head %d: %s @ %d -> %s @ %d' % (layer, head, from_word, from_pos, to_word, to_pos), end='\\t')\n", + "# 
print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " pos_pairs.append((from_pos, to_pos))\n", + " \n", + " ratio = len(pos_pairs) / (len(tokens) - 2)\n", + " if ratio > 0.5:\n", + " print(ratio)\n", + " for from_pos, to_pos in pos_pairs:\n", + " print('Layer %d, head %d: %s @ %d -> %s @ %d' % (layer, head, tokens[from_pos], from_pos, tokens[to_pos], to_pos), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/10/2019 21:46:20 - INFO - examples.extract_features - tokens: [CLS] jim laughed because he was so happy . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jim @ 1 -> jim @ 1\ttensor([0.7248, 0.0842, 0.0656, 0.0407, 0.0319], device='cuda:0')\n" + ] + } + ], + "source": [ + "# text, words = [\"The trophy doesn't fit into the brown suitcase because the it is too large.\"], ['fit', 'large']\n", + "# text, words = [\"Mary couldn't beat John in the match because he was too strong.\"], ['beat', 'strong']\n", + "text, words = [\"John is taller than Mary because he is older.\"], ['taller', 'older']\n", + "# text, words = [\"The red ball is heavier than the blue ball because the red ball is bigger.\"], ['heavier', 'bigger']\n", + "text, words = [\"Jim laughed because he was so happy.\"], ['cried', 'sad']\n", + "# text, words = [\"Jim ate the cake quickly because he was so hungry.\"], ['ate', 'hungry']\n", + "# text, words = [\"Jim drank the juice quickly because he was so thirsty.\"], ['drank', 'thirsty']\n", + "# text, words = [\"Tom's drawing hangs high. It is above Susan's drawing\"], ['high', 'above']\n", + "# text, words = [\"Tom's drawing hangs low. It is below Susan's drawing\"], ['low', 'below']\n", + "# text, words = [\"John is taller than Mary . 
Mary is shorter than John.\"], ['taller', 'shorter']\n", + "# text, words = [\"The drawing is above the cabinet. The cabinet is below the drawing\"], ['above', 'below']\n", + "# text, words = [\"Jim is very thin . He is not fat.\"], ['thin', 'fat']\n", + "\n", + "features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)\n", + "input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)\n", + "input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)\n", + "mlm_logits, _ = model(input_ids, input_type_ids)\n", + "mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "tokens = features[0].tokens\n", + "# top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)\n", + "\n", + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "# plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)\n", + "\n", + "attns = hypo[attn_name]\n", + " \n", + "# words2heads(attns, tokens, words)\n", + "head2words(attns, tokens, 2, 10)\n", + "# get_salient_heads(attns, tokens, attn_thld=0.0)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "0,2\t-1\n", + "0,3\t-1\n", + "0,10\t+1 动宾\n", + "1,1\t+1 动介\n", + "1,4\t-1\n", + "1,11\t0\n", + "2,0\t+1**\n", + "2,6\t0**\n", + "2,9\t+1**\n", + "3,5\t-1\n", + "7,4\t-1\n", + "11,8\t0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "head_size = config.hidden_size // config.num_attention_heads\n", + "layer = 1\n", + "head = 1 # 2, 3, 10\n", + "wq = model.bert.encoder.layer[layer].attention.self.query.weight.data.view(-1, config.num_attention_heads, 
head_size).permute(1, 0, 2)\n", + "wk = model.bert.encoder.layer[layer].attention.self.key.weight.data.view(-1, config.num_attention_heads, head_size).permute(1, 0, 2)\n", + "\n", + "wqk = torch.bmm(wq, wk.transpose(-1, -2))\n", + "# (wqk * wqk.transpose(-1, -2)).sum((1, 2)) / (wqk * wqk).sum((1, 2))\n", + "# plt.imshow(wqk[head]*wqk[head])\n", + "# plt.show()\n", + "\n", + "# q = torch.matmul(pos_emb, wq)\n", + "# k = torch.matmul(pos_emb_prev, wk)\n", + "# (q * k).sum((-2, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pos_emb = model.bert.embeddings.position_embeddings.weight.data\n", + "pos_emb_prev = torch.zeros_like(pos_emb)\n", + "pos_emb_next = torch.zeros_like(pos_emb)\n", + "pos_emb_prev[1:] = pos_emb[:-1]\n", + "pos_emb_next[:-1] = pos_emb[1:]\n", + "pos_emb, pos_emb_prev, pos_emb_next = pos_emb[1:-1], pos_emb_prev[1:-1], pos_emb_next[1:-1]\n", + "\n", + "# pos_q = torch.matmul(pos_emb, wk[head])\n", + "# plt.imshow(pos_q[:32])\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.',\n", + " 'Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. 
Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare sentences with same / opposite meaning, 2nd order\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " # \"Mary came before/after John. _ was late/early .\",\n", + " # yes / no, 2nd order\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality, 2nd order\n", + " \"John said/thought that the red ball was heavier than the blue ball. He was wrong. The _ ball was heavier\",\n", + " \"John was wrong in saying/thinking that the red ball was heavier than the blue ball. The _ ball was heavier\",\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong/right.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . 
\",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. _ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + "# if a not in tokenizer.vocab:\n", + "# ce\n", + "# print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif 
index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2,\n", + " \"The trophy doesn't fit into the brown suitcase because [it] is too large.\",\n", + " 'fit into:large/small'),\n", + " (4,\n", + " 'Joan made sure to thank Susan for all the help [she] had recieved.',\n", + " 'thank:receive/give'),\n", + " (10,\n", + " 'The delivery truck zoomed by the school bus because [it] was going so fast.',\n", + " 'zoom by:fast/slow'),\n", + " (12,\n", + " 'Frank felt vindicated when his longtime rival Bill revealed that [he] was the winner of the competition.',\n", + " 'vindicated/crushed:be the winner'),\n", + " (16,\n", + " 'The large ball crashed right through the table because [it] was made of steel.',\n", + " 'crash through:[hard]/[soft]'),\n", + " (18,\n", + " \"John couldn't see the stage with Billy in front of him because [he] is so short.\",\n", + " '[block]:short/tall'),\n", + " (20,\n", + " 'Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.',\n", + " 'down to:top/bottom'),\n", + " (22,\n", + " 'Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.',\n", + " 'beat:good/bad'),\n", + " (26,\n", + " \"Sam's drawing was hung just above Tina's and [it] did look much better with another one below it.\",\n", + " 'above/below'),\n", + " (28,\n", + " 'Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.',\n", + " 'better/worse:study hard'),\n", + " (30,\n", + " 'The firemen arrived after the police because [they] were coming from so far away.',\n", + " 'after/before:far away'),\n", + " (32,\n", + " \"Frank was upset with Tom because the toaster [he] had bought from him didn't work.\",\n", + " 'be upset with:buy from not work/sell not work'),\n", + " 
(36,\n", + " 'The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first.',\n", + " 'above/below:moved first'),\n", + " (38,\n", + " 'Pete envies Martin although [he] is very successful.',\n", + " 'although/because'),\n", + " (42,\n", + " 'I poured water from the bottle into the cup until [it] was empty.',\n", + " 'pour:empty/full'),\n", + " (46,\n", + " \"Sid explained his theory to Mark but [he] couldn't convince him.\",\n", + " 'explain:convince/understand'),\n", + " (48,\n", + " \"Susan knew that Ann's son had been in a car accident, so [she] told her about it.\",\n", + " '?know tell:so/because'),\n", + " (50,\n", + " \"Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.\",\n", + " 'beat:younger/older'),\n", + " (64,\n", + " 'In the middle of the outdoor concert, the rain started falling, but [it] continued until 10.',\n", + " 'but/and'),\n", + " (68,\n", + " 'Ann asked Mary what time the library closes, because [she] had forgotten.',\n", + " 'because/but'),\n", + " (84,\n", + " 'If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.',\n", + " 'fool:get/lose'),\n", + " (92,\n", + " 'Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.',\n", + " '?stop normal/stop abnormal:strange'),\n", + " (98,\n", + " \"I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.\",\n", + " 'put ... into filled with ... :get in/get out'),\n", + " (100,\n", + " 'The dog chased the cat, which ran up a tree. [It] waited at the bottom.',\n", + " 'up:at the bottom/at the top'),\n", + " (106,\n", + " 'John was doing research in the library when he heard a man humming and whistling. [He] was very annoyed.',\n", + " 'hear ... 
humming and whistling:annoyed/annoying'),\n", + " (108,\n", + " 'John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.',\n", + " 'see ... juggling watermelons:impressed/impressive'),\n", + " (132,\n", + " 'Jane knocked on the door, and Susan answered it. [She] invited her to come out.',\n", + " 'visit:invite come out/invite come in'),\n", + " (150,\n", + " 'Jackson was greatly influenced by Arnold, though [he] lived two centuries later.',\n", + " 'influence:later/earlier'),\n", + " (160,\n", + " 'The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.',\n", + " 'change:hard/easy'),\n", + " (166,\n", + " 'Fred is the only man still alive who remembers my great-grandfather. [He] is a remarkable man.',\n", + " 'alive:is/was'),\n", + " (170,\n", + " \"In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.\",\n", + " 'better equipped and large:defeated/victorious'),\n", + " (186,\n", + " 'When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority.',\n", + " 'be full of:minority/majority'),\n", + " (188,\n", + " 'Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .',\n", + " 'like over:more/fewer'),\n", + " (190,\n", + " 'We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .',\n", + " 'place on all:not enough/too many'),\n", + " (196,\n", + " \"Steve follows Fred's example in everything. 
[He] admires him hugely.\",\n", + " 'follow:admire/influence'),\n", + " (198,\n", + " \"The table won't fit through the doorway because [it] is too wide.\",\n", + " 'fit through:wide/narrow'),\n", + " (200,\n", + " 'Grace was happy to trade me her sweater for my jacket. She thinks [it] looks dowdy on her.',\n", + " 'trade:dowdy/great'),\n", + " (202,\n", + " 'John hired Bill to take care of [him] .',\n", + " 'hire/hire oneself to:take care of'),\n", + " (204,\n", + " 'John promised Bill to leave, so an hour later [he] left.',\n", + " 'promise/order'),\n", + " (210,\n", + " \"Jane knocked on Susan's door but [she] did not get an answer.\",\n", + " 'knock:get an answer/answer'),\n", + " (212,\n", + " 'Joe paid the detective after [he] received the final report on the case.',\n", + " 'pay:receive/deliver'),\n", + " (226,\n", + " 'Bill passed the half-empty plate to John because [he] was full.',\n", + " 'pass the plate:full/hungry'),\n", + " (252,\n", + " 'George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.',\n", + " 'even though/because/not'),\n", + " (255,\n", + " \"Jane gave Joan candy because [she] wasn't hungry.\",\n", + " 'give:not hungry/hungry'),\n", + " (259,\n", + " 'James asked Robert for a favor but [he] was refused.',\n", + " 'ask for a favor:refuse/be refused`'),\n", + " (261,\n", + " 'Kirilov ceded the presidency to Shatov because [he] was less popular.',\n", + " 'cede:less popular/more popular'),\n", + " (263,\n", + " 'Emma did not pass the ball to Janie although [she] saw that she was open.',\n", + " 'not pass although:see open/open')]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if 
eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "# [(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])\n", + "[(eg[0]['index'], eg[0]['sentence'], eg[0]['relational_word']) for index, eg in groups.items() if '/' in eg[0]['relational_word']]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. 
/ (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = 
plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config.num" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled1.ipynb b/Untitled1.ipynb new file mode 100644 index 00000000000000..0a6ceec8cab0b2 --- /dev/null +++ b/Untitled1.ipynb @@ -0,0 +1,2971 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext 
autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import itertools\n", + "from itertools import product, permutations\n", + "from random import sample" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. 
FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "from pytorch_pretrained_bert.tokenization import BertTokenizer\n", + "from pytorch_pretrained_bert.modeling import BertForPreTraining, BertForMaskedLM, BertConfig\n", + "from pytorch_pretrained_bert.optimization import BertAdam\n", + "from run_child_finetuning import *" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06/09/2019 14:55:34 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt\n" + ] + } + ], + "source": [ + "BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased'\n", + "tokenizer = BertTokenizer.from_pretrained('/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def assert_in_bert_vocab(tokens):\n", + " for token in tokens:\n", + " if isinstance(token, str): # entities\n", + " assert token.lower() in tokenizer.vocab, token + '->' + str(tokenizer.tokenize(token))\n", + " elif isinstance(token, tuple): # relations\n", + " assert len(token) == 2, str(token)\n", + " for rel in token:\n", + " rel = rel.split('..')[0]\n", + " assert rel in tokenizer.vocab, rel + '->' + str(tokenizer.tokenize(rel))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fruits = ['apple', 'banana', 'pear', 'orange', 'peach', 'berry', 'plum', 'pinapple', 'melon', 'cherry', 'grape', 'lemon',\n", + " 'papaya', 'durian', 'kiwi', 'mongo', 'date', 'jujube', 'watermelon']\n", + "len(fruits)\n", + "# 
http://www.manythings.org/vocabulary/lists/e/words.php?f=fruit" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "16" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "animals = ['dog', 'cat', 'pig', 'chicken', 'hen', 'cock', 'duck', 'goose', 'monkey', 'tiger', 'bird', 'bear', 'lion', 'bee', 'ant', 'elephant']\n", + "len(animals)\n", + "# see more at http://www.manythings.org/vocabulary/lists/a/words.php?f=animals_1\n", + "# http://www.manythings.org/vocabulary/lists/a/\n", + "# especially http://www.manythings.org/vocabulary/lists/a/words.php?f=classroom_1 things in classroom" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "male_names = ['James', 'John', 'Robert', ]#'Michael', 'David', 'Paul', 'Jeff', 'Daniel', 'Charles', 'Thomas']\n", + "female_names = ['Mary', 'Linda', 'Jennifer', ]#'Maria', 'Susan', 'Lisa', 'Sandra', 'Barbara', 'Patricia', 'Elizabeth']\n", + "len(male_names)\n", + "len(female_names)\n", + "people_names = (male_names, female_names)\n", + "assert_in_bert_vocab(male_names)\n", + "assert_in_bert_vocab(female_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "spatial_relations = (\n", + " ('above', 'below'), \n", + " ('in front of/in the front', 'behind/in the back'), \n", + " ('on the left..side of', 'on the right..side of')\n", + ")\n", + "people_adj_relations = (\n", + " ('taller..than', 'shorter..than'), \n", + "# ('thinner..than', 'fatter..than'), # fatter not in BERT vocab\n", + " ('younger..than', 'older..than'), 
\n", + "# ('stronger..than', 'weaker..than'), \n", + "# ('faster..than', 'slower..than'),\n", + "# ('richer..than', 'poorer..than')\n", + ")\n", + "animal_adj_relations = (\n", + " ('thinner..than', 'fatter..than'), \n", + " ('younger..than', 'older..than'), \n", + " ('stronger..than', 'weaker..than'), \n", + " ('faster..than', 'slower..than')\n", + ")\n", + "object_adj_relations = (\n", + " ('bigger..than', 'smaller..than'), \n", + " ('heavier..than', 'lighter..than'), \n", + " ('better..than', 'worse..than')\n", + ")\n", + "assert_in_bert_vocab(people_adj_relations)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "rel2entypes = {\n", + "# spatial_relations: [fruits, animals, people_names],\n", + " people_adj_relations: [people_names],\n", + "# animal_adj_relations: [animals],\n", + "# object_adj_relations: [fruits, animals]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "twoent_A_template = 'is {dt} {ent0} {rel} {dt} {ent1}'\n", + "twoent_B_template = '{dt} {ent} is {pred}'\n", + "twoent_template = '\"{A}?\" \"{conj} {B}.\"'" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def reverse(l):\n", + " return list(reversed(l)) if isinstance(l, list) else tuple(reversed(l))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def mask(ent_str):\n", + " tokens = ent_str.strip().split()\n", + " if len(tokens) == 1:\n", + " return '[%s]' % tokens[0]\n", + " elif len(tokens) == 2:\n", + " assert tokens[0] == 'the', ent_str\n", + " return '%s [%s]' % (tokens[0], tokens[1])\n", + " else:\n", + " assert False, ent_str" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_conj(join_type, A, B):\n", + " if join_type == 'no':\n", + " return 'no,'\n", + 
" return 'yes,'\n", + " assert join_type == 'yes'\n", + " subB = B.split('is')[0].split()[-1]\n", + " w0, w1, w2 = A.split()[: 3]\n", + " assert w0 == 'Is'\n", + " subA = w1 if w1 != 'the' else w2\n", + " if subA == subB and 'not' not in B: # B is repeating A\n", + " return 'Yes,'\n", + " else:\n", + " return 'Yes, in other words,'" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "def make_sentences(A_template, B_template, join_template,\n", + " index=-1, orig_sentence='', entities=[\"John\", \"Mary\"], entity_substitutes=None, determiner=\"\", \n", + " relations=[],\n", + " packed_relations=[\"rel/~rel\", \"rev_rel/~rev_rel\"], packed_relation_substitutes=None, relation_suffix=\"\",\n", + " packed_predicates=[\"pred0/~pred0\", \"pred1/~pred1\"], predicate_substitutes=None,\n", + " predicate_dichotomy=True, reverse_causal=False):\n", + "# assert entities[0].lower() in tokenizer.vocab , entities[0]\n", + "# assert entities[1].lower() in tokenizer.vocab , entities[1]\n", + " determiner = 'the' if entities[0].islower() else ''\n", + " relations, predicates = ([r.replace('..', ' ') for r in relations], [r.split('..')[0] for r in relations]) \\\n", + " if '..' 
in relations[0] else ([r.split('/')[0] for r in relations], [r.split('/')[-1] for r in relations])\n", + " neg_predicates = ['not ' + p for p in predicates]\n", + " As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]\n", + " negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + reverse(relations)[:1], reverse(entities) + relations[:1]]]\n", + " \n", + " Bs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, predicates)]\n", + " negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, neg_predicates)]\n", + " if predicate_dichotomy:\n", + " Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(neg_predicates))]\n", + " negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(predicates))]\n", + " \n", + " def form_sentences(sentence_template, join_type, As, Bs):\n", + " return [\" \".join(sentence_template.format(A=A, B=B, conj=get_conj(join_type, A, B)).split()) for A, B in itertools.product(As, Bs)]\n", + " \n", + " yes_sentences = []\n", + " for A, B in [(As, Bs), (negAs, negBs)]:\n", + " yes_sentences += form_sentences(join_template, 'yes', A, B)\n", + "# yes_sentences = list(itertools.chain.from_iterable([form_sentences(join_template, 'yes', A, B) for A, B in [(As, Bs), (negAs, negBs)]]))\n", + "\n", + " no_sentences = []\n", + " for A, B in [(As, negBs), (negAs, Bs)]:\n", + " no_sentences += form_sentences(join_template, 'no', A, B)\n", + " \n", + " return yes_sentences + no_sentences\n", + " \n", + "# make_sentences(\n", + "# twoent_A_template, twoent_B_template, twoent_template, entities=['apple', 'banana'], determiner='', 
relations=['taller..than', 'shorter..than'])" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'make_sentences' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;31m# yes_sent, no_sent = make_sentences(twoent_A_template, twoent_B_template, twoent_template, entities=list(ent_pair), relations=rel)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;31m# sentences += (yes_sent + no_sent)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0msentences\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mmake_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtwoent_A_template\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtwoent_B_template\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtwoent_template\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mentities\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ment_pair\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrelations\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m20\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0msentence_groups\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msentences\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'make_sentences' is not defined" + ] + } + ], + "source": [ + "sentence_groups = 
[]\n", + "for relations, entity_types in rel2entypes.items():\n", + " sentences = []\n", + " ent_pairs = []\n", + " for entities in entity_types:\n", + " if isinstance(entities, list):\n", + " ent_pairs += permutations(entities, 2)\n", + " else:\n", + " assert isinstance(entities, tuple) and len(entities) == 2 # people_names\n", + " ent_pairs += product(entities[0], entities[1])\n", + " ent_pairs += product(entities[1], entities[0])\n", + " for (rel, ent_pair) in product(relations, ent_pairs):\n", + "# yes_sent, no_sent = make_sentences(twoent_A_template, twoent_B_template, twoent_template, entities=list(ent_pair), relations=rel)\n", + "# sentences += (yes_sent + no_sent)\n", + " sentences += make_sentences(twoent_A_template, twoent_B_template, twoent_template, entities=list(ent_pair), relations=rel)\n", + " sample(sentences, 20)\n", + " sentence_groups.append(sentences)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[78432, 38400, 32768, 59232]" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sentence_groups)\n", + "[len(sg) for sg in sentence_groups]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def comparative2superlative(comparative_form, structured=False):\n", + " assert comparative_form.endswith('er'), comparative_form\n", + " superlative_form = 'the ' + comparative_form[:-2] + 'est' \\\n", + " if not structured else 'the ' + comparative_form + ' st'\n", + " return superlative_form" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def make_relational_atoms(relational_template, entities, relations):\n", + " neg_relations = [\"isn't \" + r for r in 
relations]\n", + " relations = [\"is \" + r for r in relations]\n", + " atoms = [relational_template.format(ent0=ent0, ent1=ent1, rel=rel) \n", + " for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]\n", + " atoms += [relational_template.format(ent0=ent0, ent1=ent1, rel=rel) \n", + " for ent0, ent1, rel in [entities + reverse(neg_relations)[:1], reverse(entities) + neg_relations[:1]]]\n", + " return atoms" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['John is taller than Mary . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .',\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| who is the tallest ? 
[John] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| is John taller than Susan ? [yes] .\",\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .']" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "['John is taller than Mary . Mary is taller than Susan . ||| who is the tallest ? [John] .',\n", + " 'John is taller than Mary . Mary is taller than Susan . ||| who is the shortest ? [Susan] .',\n", + " 'John is taller than Mary . Mary is taller than Susan . ||| is John taller than Susan ? [yes] .',\n", + " 'John is taller than Mary . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .',\n", + " 'John is taller than Mary . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .',\n", + " 'John is taller than Mary . Mary is taller than Susan . ||| is Susan taller than John ? [no] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| who is the tallest ? [John] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| who is the shortest ? [Susan] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is John shorter than Susan ? [no] .',\n", + " 'John is taller than Mary . 
Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " 'John is taller than Mary . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .',\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| who is the tallest ? [John] .\",\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John is taller than Mary . Mary isn't shorter than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John is taller than Mary . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " 'Mary is shorter than John . Mary is taller than Susan . ||| who is the tallest ? [John] .',\n", + " 'Mary is shorter than John . Mary is taller than Susan . ||| who is the shortest ? [Susan] .',\n", + " 'Mary is shorter than John . Mary is taller than Susan . ||| is John taller than Susan ? [yes] .',\n", + " 'Mary is shorter than John . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .',\n", + " 'Mary is shorter than John . Mary is taller than Susan . 
||| is Susan shorter than John ? [yes] .',\n", + " 'Mary is shorter than John . Mary is taller than Susan . ||| is Susan taller than John ? [no] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| who is the tallest ? [John] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| who is the shortest ? [Susan] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| is John shorter than Susan ? [no] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " 'Mary is shorter than John . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .',\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| who is the tallest ? [John] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary is shorter than John . Mary isn't shorter than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| is Susan shorter than John ? 
[yes] .\",\n", + " \"Mary is shorter than John . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| who is the tallest ? [John] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John isn't shorter than Mary . Mary is taller than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John isn't shorter than Mary . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . ||| who is the tallest ? [John] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . 
||| is Susan shorter than John ? [yes] .\",\n", + " \"John isn't shorter than Mary . Mary isn't shorter than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"John isn't shorter than Mary . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't taller than John . Mary is taller than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't taller than John . Susan is shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't taller than John . Susan is shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't taller than John . Susan is shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't taller than John . Susan is shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't taller than John . 
Susan is shorter than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't taller than John . Susan is shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't taller than John . Mary isn't shorter than Susan . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't taller than John . Susan isn't taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| who is the tallest ? [John] .',\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| who is the shortest ? [Susan] .',\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| is John taller than Susan ? [yes] .',\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| is John shorter than Susan ? [no] .',\n", + " 'Mary is taller than Susan . 
John is taller than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " 'Mary is taller than Susan . John is taller than Mary . ||| is Susan taller than John ? [no] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| who is the tallest ? [John] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| who is the shortest ? [Susan] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| is John taller than Susan ? [yes] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| is John shorter than Susan ? [no] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| is Susan shorter than John ? [yes] .',\n", + " 'Mary is taller than Susan . Mary is shorter than John . ||| is Susan taller than John ? [no] .',\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary is taller than Susan . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . ||| who is the tallest ? [John] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . 
||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary is taller than Susan . Mary isn't taller than John . ||| is Susan taller than John ? [no] .\",\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| who is the tallest ? [John] .',\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| who is the shortest ? [Susan] .',\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| is John taller than Susan ? [yes] .',\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| is John shorter than Susan ? [no] .',\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| is Susan shorter than John ? [yes] .',\n", + " 'Susan is shorter than Mary . John is taller than Mary . ||| is Susan taller than John ? [no] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| who is the tallest ? [John] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| who is the shortest ? [Susan] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| is John taller than Susan ? [yes] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| is John shorter than Susan ? [no] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| is Susan shorter than John ? [yes] .',\n", + " 'Susan is shorter than Mary . Mary is shorter than John . ||| is Susan taller than John ? [no] .',\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| is Susan shorter than John ? 
[yes] .\",\n", + " \"Susan is shorter than Mary . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| who is the tallest ? [John] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan is shorter than Mary . Mary isn't taller than John . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . John is taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| is Susan shorter than John ? 
[yes] .\",\n", + " \"Mary isn't shorter than Susan . Mary is shorter than John . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| who is the tallest ? [John] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Mary isn't shorter than Susan . Mary isn't taller than John . ||| is Susan taller than John ? [no] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . 
||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan isn't taller than Mary . John is taller than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| who is the tallest ? [John] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan isn't taller than Mary . Mary is shorter than John . ||| is Susan taller than John ? [no] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| who is the tallest ? [John] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan isn't taller than Mary . John isn't shorter than Mary . ||| is Susan taller than John ? [no] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| who is the tallest ? [John] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| who is the shortest ? [Susan] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| is John taller than Susan ? [yes] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| is John shorter than Susan ? [no] .\",\n", + " \"Susan isn't taller than Mary . 
Mary isn't taller than John . ||| is Susan shorter than John ? [yes] .\",\n", + " \"Susan isn't taller than Mary . Mary isn't taller than John . ||| is Susan taller than John ? [no] .\"]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transitive_P_template = '{ent0} {rel} {ent1} .'\n", + "transitive_wh_QA_template = '{which} is {pred} ? {ent} .'\n", + "transitive_yesno_QA_template = 'is {ent0} {rel} {ent1} ? {ans} .'\n", + "\n", + "def make_transitive(P_template, wh_QA_template, yesno_QA_template, join_template,\n", + " index=-1, orig_sentence='', entities=[\"John\", \"Mary\", \"Susan\"], entity_substitutes=None, determiner=\"\", \n", + " relations=('taller..than', 'shorter..than'), maybe=True, structured=False,\n", + " packed_predicates=[\"pred0/~pred0\", \"pred1/~pred1\"], predicate_substitutes=None,\n", + " predicate_dichotomy=True, reverse_causal=False):\n", + " if entities[0].islower():\n", + " entities = ['the ' + e for e in entities]\n", + "# print('relations =', relations)\n", + " relations, predicates = ([r.replace('..', ' ') for r in relations], [r.split('..')[0] for r in relations]) \\\n", + " if '..' 
in relations[0] else ([r.split('/')[0] for r in relations], [r.split('/')[-1] for r in relations])\n", + "# print('relations =', relations, 'predicates =', predicates)\n", + " predicates = [comparative2superlative(p, structured=structured) for p in predicates]\n", + " \n", + " P0_entities, P1_entities = ([entities[0], entities[1]], [entities[1], entities[2]]) \\\n", + " if not maybe else ([entities[0], entities[1]], [entities[0], entities[2]])\n", + " P0 = make_relational_atoms(P_template, P0_entities, relations)\n", + " P1 = make_relational_atoms(P_template, P1_entities, relations)\n", + " \n", + " wh_pronoun = 'which' if entities[0].startswith('the') else 'who'\n", + " wh_QA = [wh_QA_template.format(which=wh_pronoun, pred=pred, ent=ent) \n", + " for pred, ent in [(predicates[0], mask(entities[0])), (predicates[-1], mask(entities[-1] if not maybe else 'unknown'))]]\n", + " \n", + " def _maybe(s):\n", + " return s if not maybe else 'maybe'\n", + " yesno_entities = (entities[0], entities[-1]) if not maybe else (entities[1], entities[-1])\n", + " yesno_QA = [yesno_QA_template.format(ent0=ent0, ent1=ent1, rel=rel, ans=ans) \n", + " for ent0, ent1, rel, ans in [\n", + " (yesno_entities[0], yesno_entities[-1], relations[0], mask(_maybe('yes'))), \n", + " (yesno_entities[0], yesno_entities[-1], relations[-1], mask(_maybe('no'))),\n", + " (yesno_entities[-1], yesno_entities[0], relations[-1], mask(_maybe('yes'))),\n", + " (yesno_entities[-1], yesno_entities[0], relations[0], mask(_maybe('no')))]]\n", + " \n", + " Ps = [(p0, p1) for p0, p1 in list(product(P0, P1)) + list(product(P1, P0))]\n", + " QAs = wh_QA + yesno_QA\n", + " \n", + " def get_rel(atom):\n", + " for rel in relations:\n", + "# assert rel.startswith('is')\n", + " rel = rel.split()[0] # \"taller than\" -> \"taller\"\n", + " if rel in atom:\n", + " return rel\n", + " assert False\n", + " sentences = [p0 + ' ' + p1 + ' ||| ' + qas for (p0, p1), qas in product(Ps, QAs)\n", + " if not structured or get_rel(p0) == 
get_rel(p1) == get_rel(qas)]\n", + "# sentences = [s.replace('er st ', 'est ') for s in sentences]\n", + " return sentences\n", + "\n", + "sentences = make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, maybe=False, structured=False)\n", + "# len(sentences)\n", + "sample(sentences, 20)\n", + "sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'a . . . b . . . c'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "ename": "TypeError", + "evalue": "object of type 'NoneType' has no len()", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;34m'a'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' .'\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'b'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' .'\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m: object of type 'NoneType' has no len()" + ] + } + ], + 
"source": [ + "'a' + ' .'*random.randint(0, 10) + ' ' + 'b' + ' .'*random.randint(0, 10) + ' ' + 'c'\n", + "len(None)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['James is older than Jennifer . Jennifer is older than John . ||| is James older than John ? [yes] .',\n", + " \"James is younger than Jennifer . James isn't younger than Linda . ||| who is the younger st ? [Linda] .\",\n", + " \"Linda is shorter than Mary . Linda isn't shorter than Robert . ||| is Mary shorter than Robert ? [no] .\",\n", + " 'Linda is shorter than Robert . John is shorter than Linda . ||| who is the shorter st ? [John] .',\n", + " 'Mary is older than Robert . John is older than Mary . ||| is Robert older than John ? [no] .',\n", + " \"Jennifer isn't younger than Robert . James is younger than Robert . ||| is Jennifer younger than James ? [no] .\",\n", + " \"Mary is shorter than Jennifer . Mary isn't shorter than John . ||| who is the shorter st ? [John] .\",\n", + " \"Linda isn't taller than Robert . Linda is taller than John . ||| who is the taller st ? [Robert] .\",\n", + " \"Robert isn't younger than Mary . Mary isn't younger than Linda . ||| is Robert younger than Linda ? [no] .\",\n", + " \"Jennifer isn't taller than Linda . Mary isn't taller than Jennifer . ||| who is the taller st ? [Linda] .\",\n", + " \"Mary isn't older than Linda . John isn't older than Mary . ||| is John older than Linda ? [no] .\",\n", + " \"Linda is taller than Robert . John isn't taller than Robert . ||| is John taller than Linda ? [no] .\",\n", + " \"Robert isn't older than Jennifer . James is older than Jennifer . ||| is Robert older than James ? [no] .\",\n", + " \"Linda isn't older than Jennifer . Jennifer isn't older than James . ||| is Linda older than James ? [no] .\",\n", + " \"Jennifer is shorter than Robert . John isn't shorter than Robert . ||| is Jennifer shorter than John ? 
[yes] .\",\n", + " 'James is older than Mary . Jennifer is older than James . ||| is Mary older than Jennifer ? [no] .',\n", + " 'Jennifer is taller than John . John is taller than Robert . ||| is Jennifer taller than Robert ? [yes] .',\n", + " \"John is younger than Linda . Mary isn't younger than Linda . ||| who is the younger st ? [John] .\",\n", + " \"Jennifer is younger than Mary . Jennifer isn't younger than John . ||| who is the younger st ? [John] .\",\n", + " \"Robert is younger than John . Linda isn't younger than John . ||| is Linda younger than Robert ? [no] .\"]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_sent = 11520 -> 11520\n" + ] + } + ], + "source": [ + "sentence_groups = []\n", + "maybe = False\n", + "for relations, entity_types in rel2entypes.items():\n", + " sentences = []\n", + " ent_tuples = []\n", + " for entities in entity_types:\n", + " if isinstance(entities, list):\n", + " ent_tuples += permutations(entities, 3)\n", + " else:\n", + " assert isinstance(entities, tuple) and len(entities) == 2 # people_names\n", + " ent_tuples += permutations(entities[0] + entities[1], 3)\n", + " for (rel, ent_tuple) in product(relations, ent_tuples):\n", + " sentences += make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, \n", + " entities=list(ent_tuple), relations=rel, maybe=False, structured=True)\n", + " if maybe:\n", + " sentences += make_transitive(transitive_P_template, transitive_wh_QA_template, transitive_yesno_QA_template, None, \n", + " entities=list(ent_tuple), relations=rel, maybe=True, structured=True)\n", + " sample(sentences, 20)\n", + " print('num_sent =', len(sentences), '->', len(set(sentences)))\n", + " sentence_groups.append(sentences)" + ] + }, + { + "cell_type": "code", + "execution_count": 247, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"_StoreAction(option_strings=['--max_seq_length'], dest='max_seq_length', nargs=None, const=None, default=128, type=, choices=None, help='The maximum total input sequence length after WordPiece tokenization. \\nSequences longer than this will be truncated, and sequences shorter \\nthan this will be padded.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreTrueAction(option_strings=['--do_train'], dest='do_train', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to run training.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreTrueAction(option_strings=['--do_eval'], dest='do_eval', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to run eval on the dev set.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--train_batch_size'], dest='train_batch_size', nargs=None, const=None, default=32, type=, choices=None, help='Total batch size for training.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--eval_batch_size'], dest='eval_batch_size', nargs=None, const=None, default=32, type=, choices=None, help='Total batch size for eval.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--learning_rate'], dest='learning_rate', nargs=None, const=None, default=3e-05, type=, choices=None, help='The initial learning rate for Adam.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + 
"text/plain": [ + "_StoreAction(option_strings=['--num_train_epochs'], dest='num_train_epochs', nargs=None, const=None, default=3.0, type=, choices=None, help='Total number of training epochs to perform.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--warmup_proportion'], dest='warmup_proportion', nargs=None, const=None, default=0.1, type=, choices=None, help='Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% of training.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreTrueAction(option_strings=['--no_cuda'], dest='no_cuda', nargs=0, const=True, default=False, type=None, choices=None, help='Whether not to use CUDA when available', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreTrueAction(option_strings=['--do_lower_case'], dest='do_lower_case', nargs=0, const=True, default=False, type=None, choices=None, help='Whether to lower case the input text. 
True for uncased models, False for cased models.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--seed'], dest='seed', nargs=None, const=None, default=42, type=, choices=None, help='random seed for initialization', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "_StoreAction(option_strings=['--gradient_accumulation_steps'], dest='gradient_accumulation_steps', nargs=None, const=None, default=1, type=, choices=None, help='Number of updates steps to accumualte before performing a backward/update pass.', metavar=None)" + ] + }, + "execution_count": 247, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Namespace(do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=0.0001, max_seq_length=128, no_cuda=False, num_train_epochs=100, seed=42, train_batch_size=32, warmup_proportion=0.1)\n" + ] + } + ], + "source": [ + "import argparse\n", + "parser = argparse.ArgumentParser()\n", + "\n", + "parser.add_argument(\"--max_seq_length\",\n", + " default=128,\n", + " type=int,\n", + " help=\"The maximum total input sequence length after WordPiece tokenization. 
\\n\"\n", + " \"Sequences longer than this will be truncated, and sequences shorter \\n\"\n", + " \"than this will be padded.\")\n", + "parser.add_argument(\"--do_train\",\n", + " action='store_true',\n", + " help=\"Whether to run training.\")\n", + "parser.add_argument(\"--do_eval\",\n", + " action='store_true',\n", + " help=\"Whether to run eval on the dev set.\")\n", + "parser.add_argument(\"--train_batch_size\",\n", + " default=32,\n", + " type=int,\n", + " help=\"Total batch size for training.\")\n", + "parser.add_argument(\"--eval_batch_size\",\n", + " default=32,\n", + " type=int,\n", + " help=\"Total batch size for eval.\")\n", + "parser.add_argument(\"--learning_rate\",\n", + " default=3e-5,\n", + " type=float,\n", + " help=\"The initial learning rate for Adam.\")\n", + "parser.add_argument(\"--num_train_epochs\",\n", + " default=3.0,\n", + " type=float,\n", + " help=\"Total number of training epochs to perform.\")\n", + "parser.add_argument(\"--warmup_proportion\",\n", + " default=0.1,\n", + " type=float,\n", + " help=\"Proportion of training to perform linear learning rate warmup for. \"\n", + " \"E.g., 0.1 = 10%% of training.\")\n", + "parser.add_argument(\"--no_cuda\",\n", + " action='store_true',\n", + " help=\"Whether not to use CUDA when available\")\n", + "parser.add_argument(\"--do_lower_case\",\n", + " action='store_true',\n", + " help=\"Whether to lower case the input text. 
True for uncased models, False for cased models.\")\n", + "parser.add_argument('--seed',\n", + " type=int,\n", + " default=42,\n", + " help=\"random seed for initialization\")\n", + "parser.add_argument('--gradient_accumulation_steps',\n", + " type=int,\n", + " default=1,\n", + " help=\"Number of updates steps to accumualte before performing a backward/update pass.\")\n", + "parser.add_argument(\"--dev_percent\",\n", + " default=0.5,\n", + " type=float)\n", + "# args = parser.parse_args(['--output_dir', '/home'])\n", + "args = parser.parse_args([])\n", + "args.do_lower_case = True\n", + "args.do_train = True\n", + "args.do_eval = True\n", + "args.eval_batch_size = 128\n", + "args.learning_rate = 1e-4\n", + "args.num_train_epochs = 100\n", + "print(args)" + ] + }, + { + "cell_type": "code", + "execution_count": 243, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_train_steps = 10800\n" + ] + } + ], + "source": [ + "child_dataset = CHILDDataset(tokenizer, sentence_groups[0], dev_percent=0.5)\n", + "train_features = child_dataset.get_train_features()\n", + "num_train_steps = int(\n", + " len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)\n", + "print('num_train_steps =', num_train_steps)\n", + "eval_features = child_dataset.get_dev_features()\n", + "\n", + "train_dataset = child_dataset.build_dataset(train_features)\n", + "eval_dataset = child_dataset.build_dataset(eval_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 250, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "06/09/2019 10:05:44 - INFO - run_child_finetuning - device: cuda n_gpu: 1\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 250, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not 
args.no_cuda else \"cpu\")\n", + "n_gpu = torch.cuda.device_count()\n", + "logger.info(\"device: {} n_gpu: {}\".format(\n", + " device, n_gpu))\n", + "\n", + "args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)\n", + "\n", + "random.seed(args.seed)\n", + "np.random.seed(args.seed)\n", + "torch.manual_seed(args.seed)\n", + "if n_gpu > 0:\n", + " torch.cuda.manual_seed_all(args.seed)\n", + "\n", + "# Prepare model\n", + "# model = BertForMaskedLM.from_pretrained(BERT_DIR)\n", + "CONFIG_NAME = 'bert_config_small.json'\n", + "config = BertConfig(os.path.join(BERT_DIR, CONFIG_NAME))\n", + "model = BertForMaskedLM(config)\n", + "_ = model.to(device)\n", + "if n_gpu > 1:\n", + " model = torch.nn.DataParallel(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare optimizer\n", + "param_optimizer = list(model.named_parameters())\n", + "no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']\n", + "optimizer_grouped_parameters = [\n", + " {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},\n", + " {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}\n", + " ]\n", + "optimizer = BertAdam(optimizer_grouped_parameters,\n", + " lr=args.learning_rate,\n", + " warmup=args.warmup_proportion,\n", + " t_total=num_train_steps)" + ] + }, + { + "cell_type": "code", + "execution_count": 253, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "Epoch: 0%| | 0/100 [00:00 1:\n", + " loss = loss.mean() # mean() to average on multi-gpu.\n", + " if args.gradient_accumulation_steps > 1:\n", + " loss = loss / args.gradient_accumulation_steps\n", + " loss.backward()\n", + " tr_loss += loss.item()\n", + " nb_tr_examples += input_ids.size(0)\n", + " nb_tr_steps += 1\n", + " if (step + 1) % 
args.gradient_accumulation_steps == 0:\n", + " # modify learning rate with special warm up BERT uses\n", + " lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion)\n", + " if global_step % 1000 == 0:\n", + " print('global_step %d, lr = %f' % (global_step, lr_this_step))\n", + " for param_group in optimizer.param_groups:\n", + " param_group['lr'] = lr_this_step\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " global_step += 1\n", + "\n", + " if args.do_eval:\n", + " logger.info(\"Epoch %d\" % (epoch + 1))\n", + " logger.info(\"Evaluating on train set...\")\n", + " validate(model, train_dataset, device)\n", + " logger.info(\"Evaluating on valid set...\")\n", + " validate(model, eval_dataset, device)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled3.ipynb b/Untitled3.ipynb new file mode 100644 index 00000000000000..eee4c4c8357630 --- /dev/null +++ b/Untitled3.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import itertools\n", + "from itertools import product, chain\n", + "\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/24/2019 22:16:56 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt\n" + ] + } + ], + "source": [ + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def reverse(l):\n", + " return list(reversed(l))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def mask(ent_str):\n", + " tokens = ent_str.strip().split()\n", + " if len(tokens) == 1:\n", + " return '[%s]' % tokens[0]\n", + " elif len(tokens) == 2:\n", + " assert tokens[0] == 'the', ent_str\n", + " return '%s [%s]' % (tokens[0], tokens[1])\n", + " else:\n", + " assert False, ent_str" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "A_template = \"{dt} {ent0} {rel} {dt} {ent1} {rel_suffix}\"\n", + "B_template = \"{dt} {ent} {pred}\"\n", + "\n", + "causal_templates = [[\"{A} because {B}.\"],# \"{B} so {A}.\"], \n", + " [\"{A} so {B}.\"],# \"{B} because {A}.\"]\n", + " ]\n", + "turning_templates = [[\"{A} although {B}.\"],# \"{B} but {A}.\"], \n", + " [\"{A} but {B}.\"],# \"{B} although {A}.\"]\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "def make_sentences(A_template, B_template, causal_templates, turning_templates,\n", + " index=-1, orig_sentence='', entities=[\"John\", \"Mary\"], entity_substitutes=None, determiner=\"\", \n", + " packed_relations=[\"rel/~rel\", \"rev_rel/~rev_rel\"], 
packed_relation_substitutes=None, relation_suffix=\"\",\n", + " packed_predicates=[\"pred0/~pred0\", \"pred1/~pred1\"], predicate_substitutes=None,\n", + " predicate_dichotomy=True, reverse_causal=False):\n", + " assert entities[0].lower() in tokenizer.vocab , entities[0]\n", + " assert entities[1].lower() in tokenizer.vocab , entities[1]\n", + " \n", + " relations, neg_relations = zip(*[rel.split(\"/\") for rel in packed_relations])\n", + " relations, neg_relations = list(relations), list(neg_relations)\n", + " predicates, neg_predicates = zip(*[pred.split(\"/\") for pred in packed_predicates])\n", + " predicates, neg_predicates = list(predicates), list(neg_predicates)\n", + " \n", + " As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]\n", + " negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + neg_relations[:1], reverse(entities) + reverse(neg_relations)[:1]]]\n", + "\n", + " Bs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, predicates)]\n", + " negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, neg_predicates)]\n", + " if predicate_dichotomy:\n", + " Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(neg_predicates))]\n", + " negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred) for ent, pred in zip(entities, reversed(predicates))]\n", + "\n", + " def form_sentences(sentence_template, As, Bs):\n", + " return [\" \".join(sentence_template.format(A=A, B=B).split()) for A, B in product(As, Bs)]\n", + "\n", + " causal_sentences = []\n", + " for causal_template in causal_templates[int(reverse_causal)]:\n", + " for A, B in [(As, Bs), (negAs, negBs)]:\n", + " 
causal_sentences.extend(form_sentences(causal_template, A, B))\n", + "\n", + " turning_sentences = []\n", + " for turning_template in turning_templates[int(reverse_causal)]:\n", + " for A, B in [(As, negBs), (negAs, Bs)]:\n", + " turning_sentences.extend(form_sentences(turning_template, A, B))\n", + " \n", + " sentences = causal_sentences + turning_sentences\n", + " substituted_sentences = sentences\n", + " \n", + " if packed_relation_substitutes is not None:\n", + " packed_relation_substitutes = list(itertools.product(packed_relations[:1] + packed_relation_substitutes[0], \n", + " packed_relations[1:] + packed_relation_substitutes[1]))\n", + " substituted_sentences = []\n", + " for packed_sub_relations in packed_relation_substitutes:\n", + " sub_relations, sub_neg_relations = zip(*[rel.split(\"/\") for rel in packed_sub_relations])\n", + " substituted_sentences += [sent.replace(relations[0], sub_relations[0]).replace(relations[1], sub_relations[1])\n", + " .replace(neg_relations[0], sub_neg_relations[0]).replace(neg_relations[1], sub_neg_relations[1]) \n", + " for sent in sentences]\n", + " substituted_sentences = list(set(substituted_sentences))\n", + " \n", + " if entity_substitutes is not None:\n", + " for sub in entity_substitutes:\n", + " for ent in sub:\n", + " assert ent.lower() in tokenizer.vocab , ent + \" not in BERT vocab\"\n", + " assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes\n", + " assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6 \n", + " \n", + " entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1]))\n", + " substituted_sentences = [sent.replace(entities[0], sub[0]).replace(entities[1], sub[1]) \n", + " for sent in substituted_sentences for sub in entity_substitutes]\n", + " return causal_sentences, turning_sentences, substituted_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": 
{}, + "outputs": [], + "source": [ + "frames = \\\n", + "[\n", + " {\n", + " \"index\": 2,\n", + " \"orig_sentence\": \"The trophy doesn't fit into the brown suitcase because [it] is too large/small.\",\n", + " \"entities\": [\"trophy\", \"suitcase\"],\n", + " \"entity_substitutes\": [[\"ball\", \"toy\"], [\"bag\", \"box\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"doesn't fit into/can fit into\", \"doesn't hold/can hold\"],\n", + " \"packed_relation_substitutes\": [[\"can't be put into/can be put into\"], [\"doesn't have enough room for/has enough room for\"]],\n", + " \"relation_suffix\": \"\",\n", + " \"packed_predicates\": [\"is large/isn't large\", \"is small/isn't small\"],\n", + " \"predicate_dichotomy\": True,\n", + " \"reverse_causal\": False\n", + " },\n", + " {\n", + " \"index\": 4,\n", + " \"orig_sentence\": \"Joan made sure to thank Susan for all the help [she] had recieved/given.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Linda\"]],\n", + " \"determiner\": \"\",\n", + " \"packed_relations\": [\"thanked/didn't thank\", \"took good care of/didn't take good care of\"],\n", + " \"packed_relation_substitutes\": [[\"felt grateful to/didn't feel grateful to\"], [\"was appreciated by/wasn't appreciated by\"]],\n", + " \"relation_suffix\": \"\",\n", + " \"packed_predicates\": [\"had received a lot of help/hadn't received a lot of help\", \"had given a lot of help/hadn't given a lot of help\"],\n", + " \"predicate_dichotomy\": False,\n", + " \"reverse_causal\": False\n", + " },\n", + " {\n", + " \"index\": 4000,\n", + " \"orig_sentence\": \"John gave a lot of money to Susan because [he] was very rich/poor.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Linda\"]],\n", + " \"determiner\": \"\",\n", + " \"packed_relations\": [\"gave a lot of money to/didn't give a lot of money to\", \"received
a lot of money from/didn't receive a lot of money from\"],\n", + " \"packed_relation_substitutes\": [[\"subsidized/didn't subsidize\"], [\"borrowed a lot of money from/didn't borrow any money from\"]],\n", + " \"relation_suffix\": \"\",\n", + " \"packed_predicates\": [\"was rich/wasn't rich\", \"was poor/wasn't poor\"],\n", + " \"predicate_dichotomy\": True,\n", + " \"reverse_causal\": False\n", + " },\n", + " {\n", + " \"index\": 10,\n", + " \"orig_sentence\": \"The delivery truck zoomed by the school bus because [it] was going so fast/slow.\",\n", + " \"entities\": [\"truck\", \"bus\"],\n", + " \"entity_substitutes\": [[\"car\", \"ambulance\"], [\"bicycle\", \"tram\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"overtook/couldn't overtake\", \"fell far behind/didn't fall far behind\"],\n", + " \"packed_relation_substitutes\": [[\"zoomed by/didn't pass\"], [\"was left behind/wasn't left far behind\"]],\n", + " \"relation_suffix\": \"\",\n", + " \"packed_predicates\": [\"was going fast/wasn't going fast\", \"was going slow/wasn't going slow\"],\n", + " \"predicate_dichotomy\": True,\n", + " \"reverse_causal\": False\n", + " },\n", + " {\n", + " \"index\": 12,\n", + " \"orig_sentence\": \"Frank felt vindicated/crushed when his longtime rival Bill revealed that [he] was the winner of the competition.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Linda\"]],\n", + " \"determiner\": \"\",\n", + " \"packed_relations\": [\"beat/didn't beat\", \"lost to/didn't lose to\"],\n", + " \"relation_suffix\": \"in the game\",\n", + " \"packed_predicates\": [\"was happy/wasn't happy\", \"was sad/wasn't sad\"],\n", + " \"packed_relation_substitutes\": None,\n", + " \"predicate_dichotomy\": True,\n", + " \"reverse_causal\": True\n", + " },\n", + " {\n", + " \"index\": 16,\n", + " \"orig_sentence\": \"The large ball crashed right through the table because [it] was made of 
steel/styrofoam.\",\n", + " \"entities\": [\"ball\", \"board\"],\n", + " \"substitutes\": [[\"bullet\", \"arrow\"], [\"shield\", \"disk\"]],\n", + " \"determiner\": \"the\",\n", + " \"relations\": [\"crashed right through\", \"failed to block\"],\n", + " \"neg_relations\": [\"didn't crash through\", \"blocked\"],\n", + " \"relation_suffix\": \"\",\n", + " \"predicates\": [\"was hard\", \"was soft\"],\n", + " \"neg_predicates\": [\"wasn't hard\", \"wasn't soft\"],\n", + " \"predicate_dichotomy\": True,\n", + " \"reverse_causal\": False\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "causal_sentences, turning_sentences, substituted_sentences = \\\n", + " make_sentences(A_template, B_template, causal_templates, turning_templates, **frames[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['John beat Susan in the game so [John] was happy.',\n", + " 'John beat Susan in the game so [Susan] was sad.',\n", + " \"John beat Susan in the game so [John] wasn't sad.\",\n", + " \"John beat Susan in the game so [Susan] wasn't happy.\",\n", + " 'Susan lost to John in the game so [John] was happy.',\n", + " 'Susan lost to John in the game so [Susan] was sad.',\n", + " \"Susan lost to John in the game so [John] wasn't sad.\",\n", + " \"Susan lost to John in the game so [Susan] wasn't happy.\",\n", + " \"John didn't beat Susan in the game so [John] wasn't happy.\",\n", + " \"John didn't beat Susan in the game so [Susan] wasn't sad.\",\n", + " \"John didn't beat Susan in the game so [John] was sad.\",\n", + " \"John didn't beat Susan in the game so [Susan] was happy.\",\n", + " \"Susan didn't lose to John in the game so [John] wasn't happy.\",\n", + " \"Susan didn't lose to John in the game so [Susan] wasn't sad.\",\n", + " \"Susan didn't lose to John in the game so [John] was sad.\",\n", + " \"Susan didn't lose to 
John in the game so [Susan] was happy.\",\n", + " \"John beat Susan in the game but [John] wasn't happy.\",\n", + " \"John beat Susan in the game but [Susan] wasn't sad.\",\n", + " 'John beat Susan in the game but [John] was sad.',\n", + " 'John beat Susan in the game but [Susan] was happy.',\n", + " \"Susan lost to John in the game but [John] wasn't happy.\",\n", + " \"Susan lost to John in the game but [Susan] wasn't sad.\",\n", + " 'Susan lost to John in the game but [John] was sad.',\n", + " 'Susan lost to John in the game but [Susan] was happy.',\n", + " \"John didn't beat Susan in the game but [John] was happy.\",\n", + " \"John didn't beat Susan in the game but [Susan] was sad.\",\n", + " \"John didn't beat Susan in the game but [John] wasn't sad.\",\n", + " \"John didn't beat Susan in the game but [Susan] wasn't happy.\",\n", + " \"Susan didn't lose to John in the game but [John] was happy.\",\n", + " \"Susan didn't lose to John in the game but [Susan] was sad.\",\n", + " \"Susan didn't lose to John in the game but [John] wasn't sad.\",\n", + " \"Susan didn't lose to John in the game but [Susan] wasn't happy.\"]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "['John beat Susan in the game so [John] was happy.',\n", + " 'John beat Mary in the game so [John] was happy.',\n", + " 'John beat Linda in the game so [John] was happy.',\n", + " 'David beat Susan in the game so [David] was happy.',\n", + " 'David beat Mary in the game so [David] was happy.',\n", + " 'David beat Linda in the game so [David] was happy.',\n", + " 'Michael beat Susan in the game so [Michael] was happy.',\n", + " 'Michael beat Mary in the game so [Michael] was happy.',\n", + " 'Michael beat Linda in the game so [Michael] was happy.',\n", + " 'John beat Susan in the game so [Susan] was sad.',\n", + " 'John beat Mary in the game so [Mary] was sad.',\n", + " 'John beat Linda in the game so [Linda] was sad.',\n", 
+ " 'David beat Susan in the game so [Susan] was sad.',\n", + " 'David beat Mary in the game so [Mary] was sad.',\n", + " 'David beat Linda in the game so [Linda] was sad.',\n", + " 'Michael beat Susan in the game so [Susan] was sad.',\n", + " 'Michael beat Mary in the game so [Mary] was sad.',\n", + " 'Michael beat Linda in the game so [Linda] was sad.',\n", + " \"John beat Susan in the game so [John] wasn't sad.\",\n", + " \"John beat Mary in the game so [John] wasn't sad.\",\n", + " \"John beat Linda in the game so [John] wasn't sad.\",\n", + " \"David beat Susan in the game so [David] wasn't sad.\",\n", + " \"David beat Mary in the game so [David] wasn't sad.\",\n", + " \"David beat Linda in the game so [David] wasn't sad.\",\n", + " \"Michael beat Susan in the game so [Michael] wasn't sad.\",\n", + " \"Michael beat Mary in the game so [Michael] wasn't sad.\",\n", + " \"Michael beat Linda in the game so [Michael] wasn't sad.\",\n", + " \"John beat Susan in the game so [Susan] wasn't happy.\",\n", + " \"John beat Mary in the game so [Mary] wasn't happy.\",\n", + " \"John beat Linda in the game so [Linda] wasn't happy.\",\n", + " \"David beat Susan in the game so [Susan] wasn't happy.\",\n", + " \"David beat Mary in the game so [Mary] wasn't happy.\",\n", + " \"David beat Linda in the game so [Linda] wasn't happy.\",\n", + " \"Michael beat Susan in the game so [Susan] wasn't happy.\",\n", + " \"Michael beat Mary in the game so [Mary] wasn't happy.\",\n", + " \"Michael beat Linda in the game so [Linda] wasn't happy.\",\n", + " 'Susan lost to John in the game so [John] was happy.',\n", + " 'Mary lost to John in the game so [John] was happy.',\n", + " 'Linda lost to John in the game so [John] was happy.',\n", + " 'Susan lost to David in the game so [David] was happy.',\n", + " 'Mary lost to David in the game so [David] was happy.',\n", + " 'Linda lost to David in the game so [David] was happy.',\n", + " 'Susan lost to Michael in the game so [Michael] was 
happy.',\n", + " 'Mary lost to Michael in the game so [Michael] was happy.',\n", + " 'Linda lost to Michael in the game so [Michael] was happy.',\n", + " 'Susan lost to John in the game so [Susan] was sad.',\n", + " 'Mary lost to John in the game so [Mary] was sad.',\n", + " 'Linda lost to John in the game so [Linda] was sad.',\n", + " 'Susan lost to David in the game so [Susan] was sad.',\n", + " 'Mary lost to David in the game so [Mary] was sad.',\n", + " 'Linda lost to David in the game so [Linda] was sad.',\n", + " 'Susan lost to Michael in the game so [Susan] was sad.',\n", + " 'Mary lost to Michael in the game so [Mary] was sad.',\n", + " 'Linda lost to Michael in the game so [Linda] was sad.',\n", + " \"Susan lost to John in the game so [John] wasn't sad.\",\n", + " \"Mary lost to John in the game so [John] wasn't sad.\",\n", + " \"Linda lost to John in the game so [John] wasn't sad.\",\n", + " \"Susan lost to David in the game so [David] wasn't sad.\",\n", + " \"Mary lost to David in the game so [David] wasn't sad.\",\n", + " \"Linda lost to David in the game so [David] wasn't sad.\",\n", + " \"Susan lost to Michael in the game so [Michael] wasn't sad.\",\n", + " \"Mary lost to Michael in the game so [Michael] wasn't sad.\",\n", + " \"Linda lost to Michael in the game so [Michael] wasn't sad.\",\n", + " \"Susan lost to John in the game so [Susan] wasn't happy.\",\n", + " \"Mary lost to John in the game so [Mary] wasn't happy.\",\n", + " \"Linda lost to John in the game so [Linda] wasn't happy.\",\n", + " \"Susan lost to David in the game so [Susan] wasn't happy.\",\n", + " \"Mary lost to David in the game so [Mary] wasn't happy.\",\n", + " \"Linda lost to David in the game so [Linda] wasn't happy.\",\n", + " \"Susan lost to Michael in the game so [Susan] wasn't happy.\",\n", + " \"Mary lost to Michael in the game so [Mary] wasn't happy.\",\n", + " \"Linda lost to Michael in the game so [Linda] wasn't happy.\",\n", + " \"John didn't beat Susan in the game so 
[John] wasn't happy.\",\n", + " \"John didn't beat Mary in the game so [John] wasn't happy.\",\n", + " \"John didn't beat Linda in the game so [John] wasn't happy.\",\n", + " \"David didn't beat Susan in the game so [David] wasn't happy.\",\n", + " \"David didn't beat Mary in the game so [David] wasn't happy.\",\n", + " \"David didn't beat Linda in the game so [David] wasn't happy.\",\n", + " \"Michael didn't beat Susan in the game so [Michael] wasn't happy.\",\n", + " \"Michael didn't beat Mary in the game so [Michael] wasn't happy.\",\n", + " \"Michael didn't beat Linda in the game so [Michael] wasn't happy.\",\n", + " \"John didn't beat Susan in the game so [Susan] wasn't sad.\",\n", + " \"John didn't beat Mary in the game so [Mary] wasn't sad.\",\n", + " \"John didn't beat Linda in the game so [Linda] wasn't sad.\",\n", + " \"David didn't beat Susan in the game so [Susan] wasn't sad.\",\n", + " \"David didn't beat Mary in the game so [Mary] wasn't sad.\",\n", + " \"David didn't beat Linda in the game so [Linda] wasn't sad.\",\n", + " \"Michael didn't beat Susan in the game so [Susan] wasn't sad.\",\n", + " \"Michael didn't beat Mary in the game so [Mary] wasn't sad.\",\n", + " \"Michael didn't beat Linda in the game so [Linda] wasn't sad.\",\n", + " \"John didn't beat Susan in the game so [John] was sad.\",\n", + " \"John didn't beat Mary in the game so [John] was sad.\",\n", + " \"John didn't beat Linda in the game so [John] was sad.\",\n", + " \"David didn't beat Susan in the game so [David] was sad.\",\n", + " \"David didn't beat Mary in the game so [David] was sad.\",\n", + " \"David didn't beat Linda in the game so [David] was sad.\",\n", + " \"Michael didn't beat Susan in the game so [Michael] was sad.\",\n", + " \"Michael didn't beat Mary in the game so [Michael] was sad.\",\n", + " \"Michael didn't beat Linda in the game so [Michael] was sad.\",\n", + " \"John didn't beat Susan in the game so [Susan] was happy.\",\n", + " \"John didn't beat Mary in the 
game so [Mary] was happy.\",\n", + " \"John didn't beat Linda in the game so [Linda] was happy.\",\n", + " \"David didn't beat Susan in the game so [Susan] was happy.\",\n", + " \"David didn't beat Mary in the game so [Mary] was happy.\",\n", + " \"David didn't beat Linda in the game so [Linda] was happy.\",\n", + " \"Michael didn't beat Susan in the game so [Susan] was happy.\",\n", + " \"Michael didn't beat Mary in the game so [Mary] was happy.\",\n", + " \"Michael didn't beat Linda in the game so [Linda] was happy.\",\n", + " \"Susan didn't lose to John in the game so [John] wasn't happy.\",\n", + " \"Mary didn't lose to John in the game so [John] wasn't happy.\",\n", + " \"Linda didn't lose to John in the game so [John] wasn't happy.\",\n", + " \"Susan didn't lose to David in the game so [David] wasn't happy.\",\n", + " \"Mary didn't lose to David in the game so [David] wasn't happy.\",\n", + " \"Linda didn't lose to David in the game so [David] wasn't happy.\",\n", + " \"Susan didn't lose to Michael in the game so [Michael] wasn't happy.\",\n", + " \"Mary didn't lose to Michael in the game so [Michael] wasn't happy.\",\n", + " \"Linda didn't lose to Michael in the game so [Michael] wasn't happy.\",\n", + " \"Susan didn't lose to John in the game so [Susan] wasn't sad.\",\n", + " \"Mary didn't lose to John in the game so [Mary] wasn't sad.\",\n", + " \"Linda didn't lose to John in the game so [Linda] wasn't sad.\",\n", + " \"Susan didn't lose to David in the game so [Susan] wasn't sad.\",\n", + " \"Mary didn't lose to David in the game so [Mary] wasn't sad.\",\n", + " \"Linda didn't lose to David in the game so [Linda] wasn't sad.\",\n", + " \"Susan didn't lose to Michael in the game so [Susan] wasn't sad.\",\n", + " \"Mary didn't lose to Michael in the game so [Mary] wasn't sad.\",\n", + " \"Linda didn't lose to Michael in the game so [Linda] wasn't sad.\",\n", + " \"Susan didn't lose to John in the game so [John] was sad.\",\n", + " \"Mary didn't lose to John 
in the game so [John] was sad.\",\n", + " \"Linda didn't lose to John in the game so [John] was sad.\",\n", + " \"Susan didn't lose to David in the game so [David] was sad.\",\n", + " \"Mary didn't lose to David in the game so [David] was sad.\",\n", + " \"Linda didn't lose to David in the game so [David] was sad.\",\n", + " \"Susan didn't lose to Michael in the game so [Michael] was sad.\",\n", + " \"Mary didn't lose to Michael in the game so [Michael] was sad.\",\n", + " \"Linda didn't lose to Michael in the game so [Michael] was sad.\",\n", + " \"Susan didn't lose to John in the game so [Susan] was happy.\",\n", + " \"Mary didn't lose to John in the game so [Mary] was happy.\",\n", + " \"Linda didn't lose to John in the game so [Linda] was happy.\",\n", + " \"Susan didn't lose to David in the game so [Susan] was happy.\",\n", + " \"Mary didn't lose to David in the game so [Mary] was happy.\",\n", + " \"Linda didn't lose to David in the game so [Linda] was happy.\",\n", + " \"Susan didn't lose to Michael in the game so [Susan] was happy.\",\n", + " \"Mary didn't lose to Michael in the game so [Mary] was happy.\",\n", + " \"Linda didn't lose to Michael in the game so [Linda] was happy.\",\n", + " \"John beat Susan in the game but [John] wasn't happy.\",\n", + " \"John beat Mary in the game but [John] wasn't happy.\",\n", + " \"John beat Linda in the game but [John] wasn't happy.\",\n", + " \"David beat Susan in the game but [David] wasn't happy.\",\n", + " \"David beat Mary in the game but [David] wasn't happy.\",\n", + " \"David beat Linda in the game but [David] wasn't happy.\",\n", + " \"Michael beat Susan in the game but [Michael] wasn't happy.\",\n", + " \"Michael beat Mary in the game but [Michael] wasn't happy.\",\n", + " \"Michael beat Linda in the game but [Michael] wasn't happy.\",\n", + " \"John beat Susan in the game but [Susan] wasn't sad.\",\n", + " \"John beat Mary in the game but [Mary] wasn't sad.\",\n", + " \"John beat Linda in the game but 
[Linda] wasn't sad.\",\n", + " \"David beat Susan in the game but [Susan] wasn't sad.\",\n", + " \"David beat Mary in the game but [Mary] wasn't sad.\",\n", + " \"David beat Linda in the game but [Linda] wasn't sad.\",\n", + " \"Michael beat Susan in the game but [Susan] wasn't sad.\",\n", + " \"Michael beat Mary in the game but [Mary] wasn't sad.\",\n", + " \"Michael beat Linda in the game but [Linda] wasn't sad.\",\n", + " 'John beat Susan in the game but [John] was sad.',\n", + " 'John beat Mary in the game but [John] was sad.',\n", + " 'John beat Linda in the game but [John] was sad.',\n", + " 'David beat Susan in the game but [David] was sad.',\n", + " 'David beat Mary in the game but [David] was sad.',\n", + " 'David beat Linda in the game but [David] was sad.',\n", + " 'Michael beat Susan in the game but [Michael] was sad.',\n", + " 'Michael beat Mary in the game but [Michael] was sad.',\n", + " 'Michael beat Linda in the game but [Michael] was sad.',\n", + " 'John beat Susan in the game but [Susan] was happy.',\n", + " 'John beat Mary in the game but [Mary] was happy.',\n", + " 'John beat Linda in the game but [Linda] was happy.',\n", + " 'David beat Susan in the game but [Susan] was happy.',\n", + " 'David beat Mary in the game but [Mary] was happy.',\n", + " 'David beat Linda in the game but [Linda] was happy.',\n", + " 'Michael beat Susan in the game but [Susan] was happy.',\n", + " 'Michael beat Mary in the game but [Mary] was happy.',\n", + " 'Michael beat Linda in the game but [Linda] was happy.',\n", + " \"Susan lost to John in the game but [John] wasn't happy.\",\n", + " \"Mary lost to John in the game but [John] wasn't happy.\",\n", + " \"Linda lost to John in the game but [John] wasn't happy.\",\n", + " \"Susan lost to David in the game but [David] wasn't happy.\",\n", + " \"Mary lost to David in the game but [David] wasn't happy.\",\n", + " \"Linda lost to David in the game but [David] wasn't happy.\",\n", + " \"Susan lost to Michael in the game 
but [Michael] wasn't happy.\",\n", + " \"Mary lost to Michael in the game but [Michael] wasn't happy.\",\n", + " \"Linda lost to Michael in the game but [Michael] wasn't happy.\",\n", + " \"Susan lost to John in the game but [Susan] wasn't sad.\",\n", + " \"Mary lost to John in the game but [Mary] wasn't sad.\",\n", + " \"Linda lost to John in the game but [Linda] wasn't sad.\",\n", + " \"Susan lost to David in the game but [Susan] wasn't sad.\",\n", + " \"Mary lost to David in the game but [Mary] wasn't sad.\",\n", + " \"Linda lost to David in the game but [Linda] wasn't sad.\",\n", + " \"Susan lost to Michael in the game but [Susan] wasn't sad.\",\n", + " \"Mary lost to Michael in the game but [Mary] wasn't sad.\",\n", + " \"Linda lost to Michael in the game but [Linda] wasn't sad.\",\n", + " 'Susan lost to John in the game but [John] was sad.',\n", + " 'Mary lost to John in the game but [John] was sad.',\n", + " 'Linda lost to John in the game but [John] was sad.',\n", + " 'Susan lost to David in the game but [David] was sad.',\n", + " 'Mary lost to David in the game but [David] was sad.',\n", + " 'Linda lost to David in the game but [David] was sad.',\n", + " 'Susan lost to Michael in the game but [Michael] was sad.',\n", + " 'Mary lost to Michael in the game but [Michael] was sad.',\n", + " 'Linda lost to Michael in the game but [Michael] was sad.',\n", + " 'Susan lost to John in the game but [Susan] was happy.',\n", + " 'Mary lost to John in the game but [Mary] was happy.',\n", + " 'Linda lost to John in the game but [Linda] was happy.',\n", + " 'Susan lost to David in the game but [Susan] was happy.',\n", + " 'Mary lost to David in the game but [Mary] was happy.',\n", + " 'Linda lost to David in the game but [Linda] was happy.',\n", + " 'Susan lost to Michael in the game but [Susan] was happy.',\n", + " 'Mary lost to Michael in the game but [Mary] was happy.',\n", + " 'Linda lost to Michael in the game but [Linda] was happy.',\n", + " \"John didn't beat 
Susan in the game but [John] was happy.\",\n", + " \"John didn't beat Mary in the game but [John] was happy.\",\n", + " \"John didn't beat Linda in the game but [John] was happy.\",\n", + " \"David didn't beat Susan in the game but [David] was happy.\",\n", + " \"David didn't beat Mary in the game but [David] was happy.\",\n", + " \"David didn't beat Linda in the game but [David] was happy.\",\n", + " \"Michael didn't beat Susan in the game but [Michael] was happy.\",\n", + " \"Michael didn't beat Mary in the game but [Michael] was happy.\",\n", + " \"Michael didn't beat Linda in the game but [Michael] was happy.\",\n", + " \"John didn't beat Susan in the game but [Susan] was sad.\",\n", + " \"John didn't beat Mary in the game but [Mary] was sad.\",\n", + " \"John didn't beat Linda in the game but [Linda] was sad.\",\n", + " \"David didn't beat Susan in the game but [Susan] was sad.\",\n", + " \"David didn't beat Mary in the game but [Mary] was sad.\",\n", + " \"David didn't beat Linda in the game but [Linda] was sad.\",\n", + " \"Michael didn't beat Susan in the game but [Susan] was sad.\",\n", + " \"Michael didn't beat Mary in the game but [Mary] was sad.\",\n", + " \"Michael didn't beat Linda in the game but [Linda] was sad.\",\n", + " \"John didn't beat Susan in the game but [John] wasn't sad.\",\n", + " \"John didn't beat Mary in the game but [John] wasn't sad.\",\n", + " \"John didn't beat Linda in the game but [John] wasn't sad.\",\n", + " \"David didn't beat Susan in the game but [David] wasn't sad.\",\n", + " \"David didn't beat Mary in the game but [David] wasn't sad.\",\n", + " \"David didn't beat Linda in the game but [David] wasn't sad.\",\n", + " \"Michael didn't beat Susan in the game but [Michael] wasn't sad.\",\n", + " \"Michael didn't beat Mary in the game but [Michael] wasn't sad.\",\n", + " \"Michael didn't beat Linda in the game but [Michael] wasn't sad.\",\n", + " \"John didn't beat Susan in the game but [Susan] wasn't happy.\",\n", + " \"John 
didn't beat Mary in the game but [Mary] wasn't happy.\",\n", + " \"John didn't beat Linda in the game but [Linda] wasn't happy.\",\n", + " \"David didn't beat Susan in the game but [Susan] wasn't happy.\",\n", + " \"David didn't beat Mary in the game but [Mary] wasn't happy.\",\n", + " \"David didn't beat Linda in the game but [Linda] wasn't happy.\",\n", + " \"Michael didn't beat Susan in the game but [Susan] wasn't happy.\",\n", + " \"Michael didn't beat Mary in the game but [Mary] wasn't happy.\",\n", + " \"Michael didn't beat Linda in the game but [Linda] wasn't happy.\",\n", + " \"Susan didn't lose to John in the game but [John] was happy.\",\n", + " \"Mary didn't lose to John in the game but [John] was happy.\",\n", + " \"Linda didn't lose to John in the game but [John] was happy.\",\n", + " \"Susan didn't lose to David in the game but [David] was happy.\",\n", + " \"Mary didn't lose to David in the game but [David] was happy.\",\n", + " \"Linda didn't lose to David in the game but [David] was happy.\",\n", + " \"Susan didn't lose to Michael in the game but [Michael] was happy.\",\n", + " \"Mary didn't lose to Michael in the game but [Michael] was happy.\",\n", + " \"Linda didn't lose to Michael in the game but [Michael] was happy.\",\n", + " \"Susan didn't lose to John in the game but [Susan] was sad.\",\n", + " \"Mary didn't lose to John in the game but [Mary] was sad.\",\n", + " \"Linda didn't lose to John in the game but [Linda] was sad.\",\n", + " \"Susan didn't lose to David in the game but [Susan] was sad.\",\n", + " \"Mary didn't lose to David in the game but [Mary] was sad.\",\n", + " \"Linda didn't lose to David in the game but [Linda] was sad.\",\n", + " \"Susan didn't lose to Michael in the game but [Susan] was sad.\",\n", + " \"Mary didn't lose to Michael in the game but [Mary] was sad.\",\n", + " \"Linda didn't lose to Michael in the game but [Linda] was sad.\",\n", + " \"Susan didn't lose to John in the game but [John] wasn't sad.\",\n", + " 
\"Mary didn't lose to John in the game but [John] wasn't sad.\",\n", + " \"Linda didn't lose to John in the game but [John] wasn't sad.\",\n", + " \"Susan didn't lose to David in the game but [David] wasn't sad.\",\n", + " \"Mary didn't lose to David in the game but [David] wasn't sad.\",\n", + " \"Linda didn't lose to David in the game but [David] wasn't sad.\",\n", + " \"Susan didn't lose to Michael in the game but [Michael] wasn't sad.\",\n", + " \"Mary didn't lose to Michael in the game but [Michael] wasn't sad.\",\n", + " \"Linda didn't lose to Michael in the game but [Michael] wasn't sad.\",\n", + " \"Susan didn't lose to John in the game but [Susan] wasn't happy.\",\n", + " \"Mary didn't lose to John in the game but [Mary] wasn't happy.\",\n", + " \"Linda didn't lose to John in the game but [Linda] wasn't happy.\",\n", + " \"Susan didn't lose to David in the game but [Susan] wasn't happy.\",\n", + " \"Mary didn't lose to David in the game but [Mary] wasn't happy.\",\n", + " \"Linda didn't lose to David in the game but [Linda] wasn't happy.\",\n", + " \"Susan didn't lose to Michael in the game but [Susan] wasn't happy.\",\n", + " \"Mary didn't lose to Michael in the game but [Mary] wasn't happy.\",\n", + " \"Linda didn't lose to Michael in the game but [Linda] wasn't happy.\"]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "causal_sentences\n", + "turning_sentences\n", + "# substituted_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "examples = [(2,\n", + " \"The trophy doesn't fit into the brown suitcase because [it] is too large.\",\n", + " 'fit into:large/small'),\n", + " (4,\n", + " 'Joan made sure to thank Susan for all the help [she] had recieved.',\n", + " 'thank:receive/give'),\n", + " (10,\n", + " 'The delivery truck zoomed by the school bus because [it] was going so fast.',\n", + " 'zoom by:fast/slow'),\n", + " 
(12,\n", + " 'Frank felt vindicated when his longtime rival Bill revealed that [he] was the winner of the competition.',\n", + " 'vindicated/crushed:be the winner'),\n", + " (16,\n", + " 'The large ball crashed right through the table because [it] was made of steel.',\n", + " 'crash through:[hard]/[soft]'),\n", + " (18,\n", + " \"John couldn't see the stage with Billy in front of him because [he] is so short.\",\n", + " '[block]:short/tall'),\n", + " (20,\n", + " 'Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.',\n", + " 'down to:top/bottom'),\n", + " (22,\n", + " 'Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.',\n", + " 'beat:good/bad'),\n", + " (26,\n", + " \"Sam's drawing was hung just above Tina's and [it] did look much better with another one below it.\",\n", + " 'above/below'),\n", + " (28,\n", + " 'Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.',\n", + " 'better/worse:study hard'),\n", + " (30,\n", + " 'The firemen arrived after the police because [they] were coming from so far away.',\n", + " 'after/before:far away'),\n", + " (32,\n", + " \"Frank was upset with Tom because the toaster [he] had bought from him didn't work.\",\n", + " 'be upset with:buy from not work/sell not work'),\n", + " (36,\n", + " 'The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first.',\n", + " 'above/below:moved first'),\n", + " (38,\n", + " 'Pete envies Martin although [he] is very successful.',\n", + " 'although/because'),\n", + " (42,\n", + " 'I poured water from the bottle into the cup until [it] was empty.',\n", + " 'pour:empty/full'),\n", + " (46,\n", + " \"Sid explained his theory to Mark but [he] couldn't convince him.\",\n", + " 'explain:convince/understand'),\n", + " (48,\n", + " \"Susan knew that Ann's son had been in a car accident, so [she] told her about it.\",\n", + " '?know tell:so/because'),\n", + " 
(50,\n", + " \"Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.\",\n", + " 'beat:younger/older'),\n", + " (64,\n", + " 'In the middle of the outdoor concert, the rain started falling, but [it] continued until 10.',\n", + " 'but/and'),\n", + " (68,\n", + " 'Ann asked Mary what time the library closes, because [she] had forgotten.',\n", + " 'because/but'),\n", + " (84,\n", + " 'If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.',\n", + " 'fool:get/lose'),\n", + " (92,\n", + " 'Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.',\n", + " '?stop normal/stop abnormal:strange'),\n", + " (98,\n", + " \"I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.\",\n", + " 'put ... into filled with ... :get in/get out'),\n", + " (100,\n", + " 'The dog chased the cat, which ran up a tree. [It] waited at the bottom.',\n", + " 'up:at the bottom/at the top'),\n", + " (106,\n", + " 'John was doing research in the library when he heard a man humming and whistling. [He] was very annoyed.',\n", + " 'hear ... humming and whistling:annoyed/annoying'),\n", + " (108,\n", + " 'John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.',\n", + " 'see ... juggling watermelons:impressed/impressive'),\n", + " (132,\n", + " 'Jane knocked on the door, and Susan answered it. 
[She] invited her to come out.',\n", + " 'visit:invite come out/invite come in'),\n", + " (150,\n", + " 'Jackson was greatly influenced by Arnold, though [he] lived two centuries later.',\n", + " 'influence:later/earlier'),\n", + " (160,\n", + " 'The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.',\n", + " 'change:hard/easy'),\n", + " (166,\n", + " 'Fred is the only man still alive who remembers my great-grandfather. [He] is a remarkable man.',\n", + " 'alive:is/was'),\n", + " (170,\n", + " \"In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.\",\n", + " 'better equipped and large:defeated/victorious'),\n", + " (186,\n", + " 'When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority.',\n", + " 'be full of:minority/majority'),\n", + " (188,\n", + " 'Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .',\n", + " 'like over:more/fewer'),\n", + " (190,\n", + " 'We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .',\n", + " 'place on all:not enough/too many'),\n", + " (196,\n", + " \"Steve follows Fred's example in everything. [He] admires him hugely.\",\n", + " 'follow:admire/influence'),\n", + " (198,\n", + " \"The table won't fit through the doorway because [it] is too wide.\",\n", + " 'fit through:wide/narrow'),\n", + " (200,\n", + " 'Grace was happy to trade me her sweater for my jacket. 
She thinks [it] looks dowdy on her.',\n", + " 'trade:dowdy/great'),\n", + " (202,\n", + " 'John hired Bill to take care of [him] .',\n", + " 'hire/hire oneself to:take care of'),\n", + " (204,\n", + " 'John promised Bill to leave, so an hour later [he] left.',\n", + " 'promise/order'),\n", + " (210,\n", + " \"Jane knocked on Susan's door but [she] did not get an answer.\",\n", + " 'knock:get an answer/answer'),\n", + " (212,\n", + " 'Joe paid the detective after [he] received the final report on the case.',\n", + " 'pay:receive/deliver'),\n", + " (226,\n", + " 'Bill passed the half-empty plate to John because [he] was full.',\n", + " 'pass the plate:full/hungry'),\n", + " (252,\n", + " 'George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.',\n", + " 'even though/because/not'),\n", + " (255,\n", + " \"Jane gave Joan candy because [she] wasn't hungry.\",\n", + " 'give:not hungry/hungry'),\n", + " (259,\n", + " 'James asked Robert for a favor but [he] was refused.',\n", + " 'ask for a favor:refuse/be refused`'),\n", + " (261,\n", + " 'Kirilov ceded the presidency to Shatov because [he] was less popular.',\n", + " 'cede:less popular/more popular'),\n", + " (263,\n", + " 'Emma did not pass the ball to Janie although [she] saw that she was open.',\n", + " 'not pass although:see open/open')]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "47" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(examples)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled_likunlin-Copy1.ipynb b/Untitled_likunlin-Copy1.ipynb new file mode 100644 index 00000000000000..a48277551d3723 --- /dev/null +++ b/Untitled_likunlin-Copy1.ipynb @@ -0,0 +1,827 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/__init__.py\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *\n", + "\n", + "import pytorch_pretrained_bert\n", + "print(pytorch_pretrained_bert.__file__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/03/2019 16:37:32 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n", + "01/03/2019 16:37:32 - INFO - pytorch_pretrained_bert.modeling - loading archive file 
/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "01/03/2019 16:37:32 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = False\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def convert_text_to_examples(text):\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1)\n", + " text_b = m.group(2)\n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, 
replace_mask=True, print_info=False):\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a)\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " input_type_ids = []\n", + " if append_special_tokens:\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + "\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", + " input_mask = [1] * len(input_ids)\n", + "\n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,\n", + " tokens=tokens,\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " input_type_ids=input_type_ids))\n", + " return features\n", + "\n", + "def copy_and_mask_feature(feature, masked_tokens=None):\n", + " import copy\n", + " tokens = feature.tokens\n", + " masked_positions = [tokens.index(t) for t 
in masked_tokens if t in tokens] \\\n", + " if masked_tokens is not None else range(len(tokens))\n", + " assert len(masked_positions) > 0\n", + " masked_feature_copies = []\n", + " for masked_pos in masked_positions:\n", + " feature_copy = copy.deepcopy(feature)\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, masked_positions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20):\n", + " def print_pair(token, prob, end_str='', hit_mark=' '):\n", + " if i < firstk:\n", + " # token = token.replace('', '').replace('\\n', '/n')\n", + " print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)\n", + " \n", + " ret = None\n", + " for i in range(len(tokens)):\n", + " ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " print_pair(tokens[i], prob_, end_str='\\t')\n", + " values, indices = probs[i].topk(topk)\n", + " top_pairs = []\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " hit_mark = '*' if ind == ind_ else ' '\n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\\n')\n", + " top_pairs.append((token, prob))\n", + " if tokens[i] == \"[MASK]\":\n", + " ret = top_pairs\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import colored\n", + "from colored import stylize\n", + "\n", + "def show_abnormals(tokens, probs, show_suggestions=False):\n", + " def gap2color(gap):\n", + " if gap <= 5:\n", + " return 'yellow_1'\n", + " elif gap <= 10:\n", + " return 'orange_1'\n", + " else:\n", + " return 'red_1'\n", + " \n", + " def 
print_token(token, suggestion, gap):\n", + " if gap == 0:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " if show_suggestions and gap > 5:\n", + " print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " # print('/' + suggestion, end=' ')\n", + " # print('%.2f' % gap, end=' ')\n", + " \n", + " avg_gap = 0.\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " ind_ = tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " top_prob = probs[i].max().item()\n", + " top_ind = probs[i].argmax().item()\n", + " gap = math.log(top_prob) - math.log(prob_)\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " print_token(tokens[i], suggestion, gap)\n", + " avg_gap += gap\n", + " avg_gap /= (len(tokens) - 2)\n", + " print()\n", + " print(avg_gap)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "\n", + "def analyze_text(text, masked_tokens=None, show_suggestions=False, show_firstk_probs=20):\n", + " if text[0] in analyzed_cache:\n", + " features, mlm_probs = analyzed_cache[text[0]]\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " tokens = features[0].tokens\n", + " else:\n", + " examples = convert_text_to_examples(text)\n", + " features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " assert len(features) == 1\n", + " features, masked_positions = copy_and_mask_feature(features[0], masked_tokens=masked_tokens)\n", + "\n", + " input_ids = torch.tensor([f.input_ids for f in 
features], dtype=torch.long)\n", + " input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n", + " input_ids = input_ids.to(device)\n", + " input_type_ids = input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(input_ids, input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "\n", + " tokens = features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " bsz, seq_len, vocab_size = mlm_probs.size()\n", + " assert bsz == len(masked_positions)\n", + " # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)\n", + " # for i in range(seq_len):\n", + " # reduced_mlm_probs[0, i] = mlm_probs[i, i]\n", + " reduced_mlm_probs = torch.Tensor(1, len(masked_positions), vocab_size)\n", + " for i, pos in enumerate(masked_positions):\n", + " reduced_mlm_probs[0, i] = mlm_probs[i, pos]\n", + " mlm_probs = reduced_mlm_probs\n", + " tokens = [tokens[i] for i in masked_positions]\n", + " \n", + " analyzed_cache[text[0]] = (features, mlm_probs)\n", + " \n", + " top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs)\n", + " if not given_mask:\n", + " show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)\n", + " return top_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/03/2019 17:13:21 - INFO - examples.extract_features - tokens: [CLS] what ingredients account for the marvelous function of a dream ? [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 3 | . 
1 | the 1 | , 1 | ) 1 | \" \n", + " 35 | what \t* 35 | what 25 | do 9 | can 7 | could 5 | would \n", + " 0 | ingredients \t 51 | could 23 | would 13 | can 8 | might 2 | may \n", + " 0 | account \t 32 | were 26 | are 7 | remained 6 | existed 6 | exist \n", + " 100 | for \t*100 | for 0 | to 0 | of 0 | up 0 | all \n", + " 98 | the \t* 98 | the 2 | this 0 | a 0 | that 0 | such \n", + " 0 | marvelous \t 5 | biological 5 | normal 4 | cognitive 2 | specific 2 | physiological\n", + " 0 | function \t 21 | ##ness 8 | beauty 5 | quality 5 | nature 4 | power \n", + " 91 | of \t* 91 | of 8 | in 0 | within 0 | as 0 | during \n", + " 14 | a \t 55 | the 16 | this * 14 | a 4 | my 3 | his \n", + " 0 | dream \t 3 | heart 3 | plant 3 | soul 2 | brain 2 | body \n", + " 98 | ? \t* 98 | ? 2 | . 0 | ; 0 | ! 0 | | \n", + " 0 | [SEP] \t 13 | what 12 | \" 7 | they 4 | and 4 | ' \n", + "\u001b[38;5;15m\u001b[48;5;0mwhat \u001b[0m\u001b[38;5;196m\u001b[48;5;0mingredients\u001b[0m\u001b[38;5;196m\u001b[48;5;0m \u001b[0m\u001b[38;5;226m\u001b[48;5;0maccount\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\u001b[38;5;15m\u001b[48;5;0mfor \u001b[0m\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\u001b[38;5;214m\u001b[48;5;0mmarvelous\u001b[0m\u001b[38;5;214m\u001b[48;5;0m \u001b[0m\u001b[38;5;214m\u001b[48;5;0mfunction\u001b[0m\u001b[38;5;214m\u001b[48;5;0m \u001b[0m\u001b[38;5;15m\u001b[48;5;0mof \u001b[0m\u001b[38;5;226m\u001b[48;5;0ma\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\u001b[38;5;226m\u001b[48;5;0mdream\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\u001b[38;5;15m\u001b[48;5;0m? \u001b[0m\n", + "3.421217077676471\n" + ] + } + ], + "source": [ + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "text = [\"What ingredients account for the marvelous function of a dream?\"]\n", + "# text = [\"Last week I went to the theatre. I had a very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. 
They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "# text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "analyze_text(text, show_firstk_probs=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/03/2019 17:10:45 - INFO - examples.extract_features - tokens: [CLS] the trophy doesn ' t fit into the brown suitcase because the [MASK] is too large . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 2 | . 
1 | ) 1 | the 1 | , 1 | \" \n", + " 100 | the \t*100 | the 0 | his 0 | a 0 | its 0 | her \n", + " 97 | trophy \t* 97 | trophy 0 | cup 0 | prize 0 | trophies 0 | competition \n", + " 100 | doesn \t*100 | doesn 0 | can 0 | does 0 | won 0 | didn \n", + " 100 | ' \t*100 | ' 0 | t 0 | \" 0 | = 0 | ` \n", + " 100 | t \t*100 | t 0 | not 0 | s 0 | n 0 | to \n", + " 100 | fit \t*100 | fit 0 | fits 0 | sit 0 | get 0 | fitting \n", + " 100 | into \t*100 | into 0 | in 0 | inside 0 | onto 0 | within \n", + " 100 | the \t*100 | the 0 | her 0 | his 0 | a 0 | my \n", + " 100 | brown \t*100 | brown 0 | black 0 | green 0 | blue 0 | plastic \n", + " 95 | suitcase \t* 95 | suitcase 3 | bag 1 | luggage 0 | backpack 0 | trunk \n", + " 100 | because \t*100 | because 0 | as 0 | since 0 | due 0 | . \n", + " 100 | the \t*100 | the 0 | its 0 | his 0 | it 0 | her \n", + " 0 | [MASK] \t 21 | suitcase 19 | bag 6 | box 2 | luggage 2 | case \n", + " 99 | is \t* 99 | is 1 | was 0 | being 0 | has 0 | it \n", + " 100 | too \t*100 | too 0 | very 0 | extra 0 | overly 0 | more \n", + " 87 | large \t* 87 | large 11 | big 1 | small 1 | huge 0 | larger \n", + " 100 | . \t*100 | . 0 | ; 0 | , 0 | ! 0 | ' \n", + " 0 | [SEP] \t 35 | . 
8 | ) 5 | , 4 | ( 3 | it \n" + ] + } + ], + "source": [ + "text = [\"The trophy doesn't fit into the brown suitcase because the _ is too large.\"]\n", + "# text = [\"Mary beat John in the match because _ was very strong.\"]\n", + "features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)\n", + "input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)\n", + "input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)\n", + "mlm_logits, _ = model(input_ids, input_type_ids)\n", + "mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "tokens = features[0].tokens\n", + "top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.',\n", + " 'Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. 
Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " \"Mary came before/after John. _ was late/early .\",\n", + " # yes / no\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . \",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. 
_ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + " if a not in tokenizer.vocab:\n", + " ce\n", + " print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2, 'fit 
into:large/small', False),\n", + " (4, 'thank:receive/give', False),\n", + " (6, 'call:successful available', True),\n", + " (8, 'ask:repeat answer', False),\n", + " (10, 'zoom by:fast/slow', False),\n", + " (12, 'vindicated/crushed:be the winner', False),\n", + " (14, 'lift:weak heavy', False),\n", + " (16, 'crash through:[hard]/[soft]', False),\n", + " (18, '[block]:short/tall', False),\n", + " (20, 'down to:top/bottom', False),\n", + " (22, 'beat:good/bad', False),\n", + " (24, 'roll off:anchored level', False),\n", + " (26, 'above/below', False),\n", + " (28, 'better/worse:study hard', False),\n", + " (30, 'after/before:far away', False),\n", + " (32, 'be upset with:buy from not work/sell not work', True),\n", + " (34, '?yell at comfort:upset', False),\n", + " (36, 'above/below:moved first', False),\n", + " (38, 'although/because', False),\n", + " (40, 'bully:punish rescue', False),\n", + " (42, 'pour:empty/full', False),\n", + " (44, 'know:nosy indiscreet', False),\n", + " (46, 'explain:convince/understand', True),\n", + " (48, '?know tell:so/because', True),\n", + " (50, 'beat:younger/older', False),\n", + " (56, 'clog:cleaned removed', True),\n", + " (58, '?immediately follow:short delayed', False),\n", + " (60, '?between:see see around', True),\n", + " (64, 'but/and', False),\n", + " (66, 'clean:put in the trash put in the drawer', False),\n", + " (68, 'because/but', False),\n", + " (70, 'out of:handy lighter', False),\n", + " (72, 'put:tall high', False),\n", + " (74, 'show:good famous', True),\n", + " (76, 'pay for:generous grateful', False),\n", + " (78, 'but', False),\n", + " (80, 'if', False),\n", + " (82, 'if', False),\n", + " (84, 'fool:get/lose', False),\n", + " (88, 'wait:impatient cautious', False),\n", + " (90, 'give birth:woman baby', True),\n", + " (92, '?stop normal/stop abnormal:strange', False),\n", + " (96, 'eat:hungry tasty', False),\n", + " (98, 'put ... into filled with ... 
:get in/get out', False),\n", + " (100, 'up:at the bottom/at the top', False),\n", + " (102, 'crash through:removed repaired', False),\n", + " (104, 'stab:taken to the police station taken to the hospital', False),\n", + " (106, 'hear ... humming and whistling:annoyed/annoying', True),\n", + " (108, 'see ... juggling watermelons:impressed/impressive', True),\n", + " (114, 'tell lies: truthful skeptical', True),\n", + " (130, 'but:disappointed', True),\n", + " (132, 'visit:invite come out/invite come in', True),\n", + " (134, 'take classes from:eager known to speak it fluently', False),\n", + " (138, 'cover:out gone', True),\n", + " (144, 'tuck:work sleep', True),\n", + " (150, 'influence:later/earlier', False),\n", + " (152, 'can not cut:thick small', False),\n", + " (154, 'attack:kill guard', False),\n", + " (156, 'attack:bold nervous', False),\n", + " (160, 'change:hard:easy', False),\n", + " (166, 'alive:is/was', False),\n", + " (168, 'infant:twelve years old twelve months old', False),\n", + " (170, 'better equipped and large:defeated/victorious', False),\n", + " (178, 'interview:persistent cooperative', False),\n", + " (186, 'be full of:minority/majority', False),\n", + " (188, 'like over:more/fewer', False),\n", + " (190, 'place on all:not enough/too many', True),\n", + " (192, 'stick:leave have', True),\n", + " (196, 'follow:admire/influence', True),\n", + " (198, 'fit through:wide/narrow', False),\n", + " (200, 'trade:dowdy/great', False),\n", + " (202, 'hire/hire oneself to:take care of', True),\n", + " (204, 'promise/order', False),\n", + " (208, 'mother:education place', True),\n", + " (210, 'knock:get an answer/answer', True),\n", + " (212, 'pay:receive/deliver', False),\n", + " (218, '?', False),\n", + " (220, 'say check:move take', False),\n", + " (222, '?', False),\n", + " (224, 'give a life:drive alone walk', False),\n", + " (226, 'pass the plate:full/hungry', False),\n", + " (228, 'pass:turn over turn next', False),\n", + " (232, 'stretch pat', 
True),\n", + " (234, 'accept share', False),\n", + " (236, 'speak:break silence break concentration', False),\n", + " (240, 'carry:leg ache leg dangle', True),\n", + " (242, 'carry:in arms in bassinet', False),\n", + " (244, 'hold:against chest against will', True),\n", + " (250, 'stop', False),\n", + " (252, 'even though/because/not', False),\n", + " (255, 'give:not hungry/hungry', False),\n", + " (259, 'ask for a favor:refuse/be refused`', False),\n", + " (261, 'cede:less popular/more popular', False),\n", + " (263, 'not pass although:see open/open', True),\n", + " (271, 'suspect regret', True)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, 
heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled_likunlin.ipynb b/Untitled_likunlin.ipynb new file mode 100644 index 00000000000000..6d561c5185b780 --- /dev/null +++ b/Untitled_likunlin.ipynb @@ -0,0 +1,884 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. 
FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "05/14/2019 15:48:11 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n", + "05/14/2019 15:48:12 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "05/14/2019 15:48:12 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = 
'/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case:在标记化时将文本转换为小写。默认= True\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[2331, 2351, 2757, 3280, 5996, 8289]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokens = ['death','died','dead','die','dying','dies']\n", + "tokenizer.convert_tokens_to_ids(tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BertForPreTraining:\n", + "Outputs:\n", + " if `masked_lm_labels` and `next_sentence_label` are not `None`:\n", + " Outputs the total_loss which is the sum of the masked language modeling loss and the next\n", + " sentence classification loss.\n", + " if `masked_lm_labels` or `next_sentence_label` is `None`:\n", + " Outputs a tuple comprising\n", + " - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and\n", + " - the next sentence classification logits of shape [batch_size, 2]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from_pretrained:\n", + "Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.\n", + "Download and cache the pre-trained model file if needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "05/14/2019 15:48:15 - INFO - examples.extract_features - tokens: [CLS] i love you [SEP]\n", + "05/14/2019 15:48:15 - INFO - examples.extract_features - tokens: [CLS] hello everybody [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['[CLS]', 'i', 'love', 'you', '[SEP]']\n", + "[101, 1045, 2293, 2017, 102]\n", + "['[CLS]', 'hello', 'everybody', '[SEP]']\n", + "[101, 7592, 7955, 102]\n" + ] + } + ], + "source": [ + "import re\n", + "def convert_text_to_examples(text): #把每一行的句子变成一个实例,一个实例中包含text_a,text_b(text_b目前是没用的)\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'\n", + " \n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.\n", + " text_b = m.group(2) #匹配的第二句,比如I love you.\n", + " \n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "#疑问,当text是一行的时候,line是一个个字母 -> text是[\"***\"]的形式\n", + "#print(convert_text_to_examples({\"I love you\",\"hello everybody\"})[0].text_a)\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " #把实例变成一个特征\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a) #tokenizer的作用是\n", + " #print(example.unique_id) #*****************************\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " 
input_type_ids = [] #segment embedding\n", + " if append_special_tokens: #输入参数中默认为true\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + " print(tokens) #*******************************\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号\n", + " input_mask = [1] * len(input_ids) \n", + " print(input_ids)#***********************************\n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,\n", + " tokens=tokens,\n", + " input_ids=input_ids,#字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask, #一堆1\n", + " input_type_ids=input_type_ids)) #第0类和第一类,对text_a,text_b的区分\n", + " return features\n", + " \n", + "examples = convert_text_to_examples({\"I love you\",\"hello everybody\"})\n", + "features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + "\n", + "def copy_and_mask_feature(feature, masked_tokens=None):\n", + " import copy\n", + " 
tokens = feature.tokens\n", + " masked_positions = [tokens.index(t) for t in masked_tokens if t in tokens] \\\n", + " if masked_tokens is not None else range(len(tokens))\n", + " \n", + " assert len(masked_positions) > 0\n", + " masked_feature_copies = []\n", + " for masked_pos in masked_positions: #用[mask]依次掩盖每一个位置\n", + " feature_copy = copy.deepcopy(feature)\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, masked_positions\n", + "\n", + "#masked_feature_copies, masked_positions = copy_and_mask_feature(features[1])\n", + "#print(masked_feature_copies[0].input_ids) #结果[101, 1045, 2293, 103, 102]\n", + "#print(masked_positions) #结果是一个range(0,5)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): #输出结果的函数,要最高概率topk个输出\n", + " def print_pair(token, prob, end_str='', hit_mark=' '):\n", + " if i < firstk:\n", + " # token = token.replace('', '').replace('\\n', '/n')\n", + " print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)\n", + " \n", + " ret = None\n", + " for i in range(len(tokens)):\n", + " ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item() #这个probs是该字符串第i个位置上填上词典上各个词的概率\n", + " print_pair(tokens[i], prob_, end_str='\\t')\n", + " values, indices = probs[i].topk(topk)\n", + " top_pairs = []\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " hit_mark = '*' if ind == ind_ else ' '\n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\\n')\n", + " top_pairs.append((token, prob))\n", + " if tokens[i] == \"[MASK]\":\n", + " ret = top_pairs\n", + " return ret #返回的这是个啥" + ] + }, + { + "cell_type": "code", + 
"execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "import colored\n", + "from colored import stylize\n", + "\n", + "def show_abnormals(tokens, probs, show_suggestions=False):\n", + " def gap2color(gap):\n", + " if gap <= 5:\n", + " return 'yellow_1'\n", + " elif gap <= 10:\n", + " return 'orange_1'\n", + " else:\n", + " return 'red_1'\n", + " \n", + " def print_token(token, suggestion, gap):\n", + " \n", + " if gap == 0:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " if show_suggestions and gap > 5:\n", + " print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " \n", + " # print('/' + suggestion, end=' ')\n", + " # print('%.2f' % gap, end=' ')\n", + " #print(gap)\n", + " avg_gap = 0.\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " ind_ = tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " top_prob = probs[i].max().item()\n", + " top_ind = probs[i].argmax().item()\n", + " gap = math.log(top_prob) - math.log(prob_) #计算两个词之间的差距\n", + " #print(top_prob,prob_)\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " print_token(tokens[i], suggestion, gap)\n", + " avg_gap += gap\n", + " avg_gap /= (len(tokens) - 2)\n", + " print()\n", + " print(avg_gap)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "\n", + "def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=20):\n", + " if text[0] in analyzed_cache: #分析过的缓存\n", + " features, mlm_probs = analyzed_cache[text[0]]\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " tokens = features[0].tokens 
\n", + " else:\n", + " examples = convert_text_to_examples(text)\n", + " features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " assert len(features) == 1\n", + " features, masked_positions = copy_and_mask_feature(features[0], masked_tokens=masked_tokens)\n", + "\n", + " input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) #把input_ids增加了一个维度,变成[n_features,sequence_len]\n", + " #这里的n_features实际上是句子有多少个单词位置,每个位置依次换成[mask]\n", + " input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " input_ids = input_ids.to(device) #拿去GPU\n", + " input_type_ids = input_type_ids.to(device)\n", + " \n", + " time_start=time.time()\n", + " mlm_logits, _ = model(input_ids, input_type_ids)\n", + " time_end=time.time()\n", + " print('time cost1',time_end-time_start,'s')\n", + " \n", + " mlm_probs = F.softmax(mlm_logits, dim=-1) #最后一维,也就是vocab 换算成概率和为百分之百\n", + " #print(mlm_probs.size())#这里实验的是torch.Size([5, 5, 30522])\n", + " tokens = features[0].tokens #不知道要干嘛\n", + " if not given_mask or masked_tokens is not None:\n", + " bsz, seq_len, vocab_size = mlm_probs.size() #三个维度分别是batch_size, sequence_length, vocab_size\n", + " assert bsz == len(masked_positions)\n", + " # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)\n", + " # for i in range(seq_len):\n", + " # reduced_mlm_probs[0, i] = mlm_probs[i, i]\n", + " reduced_mlm_probs = torch.Tensor(1, len(masked_positions), vocab_size)\n", + " for i, pos in enumerate(masked_positions):\n", + " reduced_mlm_probs[0, i] = mlm_probs[i, pos]\n", + " mlm_probs = reduced_mlm_probs #压缩一下大小,节约不必要浪费的空间(只需要第i个batch里面[mask]位置的词汇表概率即可)\n", + " tokens = [tokens[i] for i in masked_positions]\n", + " \n", + " analyzed_cache[text[0]] = (features, mlm_probs)\n", + " \n", + " top_pairs = show_lm_probs(tokens, None, 
mlm_probs[0], firstk=show_firstk_probs) #传入的probs是二维的\n", + " #print(\"************************************************************************************************************\")\n", + " #print(top_pairs) #******************************\n", + " if not given_mask:\n", + " show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)\n", + " return top_pairs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "05/21/2019 16:22:56 - INFO - examples.extract_features - tokens: [CLS] he is dies . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['[CLS]', 'he', 'is', 'dies', '.', '[SEP]']\n", + "[101, 2002, 2003, 8289, 1012, 102]\n", + "time cost1 0.0779261589050293 s\n", + " 0 | [CLS] \t 4 | . 1 | , 1 | the 1 | ) 1 | \" \n", + " 19 | he \t* 19 | he 8 | it 6 | she 3 | and 2 | the \n", + " 0 | is \t 33 | then 15 | soon 12 | eventually 7 | later 4 | also \n", + " 0 | dies \t 4 | dead 3 | alive 3 | right 2 | beautiful 2 | not \n", + " 93 | . \t* 93 | . 6 | ; 1 | ! 0 | ? 0 | | \n", + " 0 | [SEP] \t 11 | \" 5 | he 2 | . 1 | and 1 | it \n", + "\u001b[38;5;15m\u001b[48;5;0mhe \u001b[0m\u001b[38;5;214m\u001b[48;5;0mis\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/then \u001b[0m\u001b[38;5;214m\u001b[48;5;0mdies\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/dead \u001b[0m\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "3.6602350262062977\n", + "time cost 0.0883021354675293 s\n" + ] + } + ], + "source": [ + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "#text = [\"Last week I went to the theatre. I had very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. 
I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "#text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "#text = [\"The journey was long and tired. We left London at five o'clock in the evening and spend eight hours in the train. We had been travelled for 3 hours after someone appeared selling food and drinks. It was darkness all the time we were crossing Wales, but we could see nothing through the windows. When we finally arrived at Holyhead nearly , everyone was slept. As soon as the train stopped, everybody come to life, grabbing their suitcases and rushing onto the platform.\"]\n", + "#text = [\"When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons,but no matter how many times I asked for watching them, my parents would not to let me.They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. \"]\n", + "#text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. 
Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "#text = [\"During my last winter holiday, I went to the countryside with my father to visit my grandparents. I find a big change there. The first time I went there, they were living in a small house with dogs, ducks, and another animals. Last winter when I went here again, they had a big separate house to raise dozens of chicken. They also had a small pond which they raised fish. My grandpa said last summer they earned quite a lot by sell the fish. I felt happily that their life had improved. At the end of our trip,I told my father that I planned to return for every two years, but he agreed.\"]\n", + "# text = ['The problem is difficult than that one.']\n", + "#text = [\"It was Monday morning, and the writing class had just begin. Everyone was silent, wait to see who would be called upon to read his and her paragraph aloud. Some of us were confident and eagerly take part in the class activity, others were nervous and anxious. I had done myself homework but I was shy. I was afraid that to speak in front of a larger group of people. At that moment, I remembered that my father once said, 'The classroom is a place for learning and that include learning from the textbooks, and mistakes as well.' 
Immediate, I raised my hand.\"]\n", + "text = [\"He is dies.\"]\n", + "import time\n", + "time_start=time.time()\n", + "#text = [\"The play was very interesting.\"]\n", + "#text = [\"The question is easy than that one.\"]\n", + "#text =[\"The apple a eat by me. I had a very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. I got very angry.\"]#因为外面有中括号,所以是二维的\n", + "analyze_text(text, show_firstk_probs=200)\n", + "#print(analyzed_cache)\n", + "time_end=time.time()\n", + "print('time cost',time_end-time_start,'s')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.',\n", + " 'Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. 
Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " \"Mary came before/after John. _ was late/early .\",\n", + " # yes / no\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . \",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. 
_ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + " if a not in tokenizer.vocab:\n", + " ce\n", + " print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2, 'fit 
into:large/small', False),\n", + " (4, 'thank:receive/give', False),\n", + " (6, 'call:successful available', True),\n", + " (8, 'ask:repeat answer', False),\n", + " (10, 'zoom by:fast/slow', False),\n", + " (12, 'vindicated/crushed:be the winner', False),\n", + " (14, 'lift:weak heavy', False),\n", + " (16, 'crash through:[hard]/[soft]', False),\n", + " (18, '[block]:short/tall', False),\n", + " (20, 'down to:top/bottom', False),\n", + " (22, 'beat:good/bad', False),\n", + " (24, 'roll off:anchored level', False),\n", + " (26, 'above/below', False),\n", + " (28, 'better/worse:study hard', False),\n", + " (30, 'after/before:far away', False),\n", + " (32, 'be upset with:buy from not work/sell not work', True),\n", + " (34, '?yell at comfort:upset', False),\n", + " (36, 'above/below:moved first', False),\n", + " (38, 'although/because', False),\n", + " (40, 'bully:punish rescue', False),\n", + " (42, 'pour:empty/full', False),\n", + " (44, 'know:nosy indiscreet', False),\n", + " (46, 'explain:convince/understand', True),\n", + " (48, '?know tell:so/because', True),\n", + " (50, 'beat:younger/older', False),\n", + " (56, 'clog:cleaned removed', True),\n", + " (58, '?immediately follow:short delayed', False),\n", + " (60, '?between:see see around', True),\n", + " (64, 'but/and', False),\n", + " (66, 'clean:put in the trash put in the drawer', False),\n", + " (68, 'because/but', False),\n", + " (70, 'out of:handy lighter', False),\n", + " (72, 'put:tall high', False),\n", + " (74, 'show:good famous', True),\n", + " (76, 'pay for:generous grateful', False),\n", + " (78, 'but', False),\n", + " (80, 'if', False),\n", + " (82, 'if', False),\n", + " (84, 'fool:get/lose', False),\n", + " (88, 'wait:impatient cautious', False),\n", + " (90, 'give birth:woman baby', True),\n", + " (92, '?stop normal/stop abnormal:strange', False),\n", + " (96, 'eat:hungry tasty', False),\n", + " (98, 'put ... into filled with ... 
:get in/get out', False),\n", + " (100, 'up:at the bottom/at the top', False),\n", + " (102, 'crash through:removed repaired', False),\n", + " (104, 'stab:taken to the police station taken to the hospital', False),\n", + " (106, 'hear ... humming and whistling:annoyed/annoying', True),\n", + " (108, 'see ... juggling watermelons:impressed/impressive', True),\n", + " (114, 'tell lies: truthful skeptical', True),\n", + " (130, 'but:disappointed', True),\n", + " (132, 'visit:invite come out/invite come in', True),\n", + " (134, 'take classes from:eager known to speak it fluently', False),\n", + " (138, 'cover:out gone', True),\n", + " (144, 'tuck:work sleep', True),\n", + " (150, 'influence:later/earlier', False),\n", + " (152, 'can not cut:thick small', False),\n", + " (154, 'attack:kill guard', False),\n", + " (156, 'attack:bold nervous', False),\n", + " (160, 'change:hard:easy', False),\n", + " (166, 'alive:is/was', False),\n", + " (168, 'infant:twelve years old twelve months old', False),\n", + " (170, 'better equipped and large:defeated/victorious', False),\n", + " (178, 'interview:persistent cooperative', False),\n", + " (186, 'be full of:minority/majority', False),\n", + " (188, 'like over:more/fewer', False),\n", + " (190, 'place on all:not enough/too many', True),\n", + " (192, 'stick:leave have', True),\n", + " (196, 'follow:admire/influence', True),\n", + " (198, 'fit through:wide/narrow', False),\n", + " (200, 'trade:dowdy/great', False),\n", + " (202, 'hire/hire oneself to:take care of', True),\n", + " (204, 'promise/order', False),\n", + " (208, 'mother:education place', True),\n", + " (210, 'knock:get an answer/answer', True),\n", + " (212, 'pay:receive/deliver', False),\n", + " (218, '?', False),\n", + " (220, 'say check:move take', False),\n", + " (222, '?', False),\n", + " (224, 'give a life:drive alone walk', False),\n", + " (226, 'pass the plate:full/hungry', False),\n", + " (228, 'pass:turn over turn next', False),\n", + " (232, 'stretch pat', 
True),\n", + " (234, 'accept share', False),\n", + " (236, 'speak:break silence break concentration', False),\n", + " (240, 'carry:leg ache leg dangle', True),\n", + " (242, 'carry:in arms in bassinet', False),\n", + " (244, 'hold:against chest against will', True),\n", + " (250, 'stop', False),\n", + " (252, 'even though/because/not', False),\n", + " (255, 'give:not hungry/hungry', False),\n", + " (259, 'ask for a favor:refuse/be refused`', False),\n", + " (261, 'cede:less popular/more popular', False),\n", + " (263, 'not pass although:see open/open', True),\n", + " (271, 'suspect regret', True)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, 
heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'BertSelfAttention' object has no attribute 'attention_probs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m raise 
AttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0;32m--> 518\u001b[0;31m type(self).__name__, name))\n\u001b[0m\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'BertSelfAttention' object has no attribute 'attention_probs'" + ] + } + ], + "source": [ + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled_linzhuo.ipynb b/Untitled_linzhuo.ipynb new file mode 100644 index 00000000000000..9627f95eb0ef4b --- /dev/null +++ b/Untitled_linzhuo.ipynb @@ -0,0 +1,239 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/09/2019 14:00:34 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt\n", + "01/09/2019 14:00:34 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "01/09/2019 14:00:34 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))\n", + "model = 
BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_layer = model.bert.embeddings\n", + "layers = model.bert.encoder.layer" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "layer = layers[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1)\n", + ")" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "layer.attention.self" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([14460])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "words = 'policeman'\n", + "tokens = tokenizer.tokenize(words)\n", + "assert len(tokens) == len(words.split()), tokens\n", + "input_ids = [tokenizer.vocab[token] for token in tokens]\n", + "input_ids = torch.tensor(input_ids, dtype=torch.long).to(device)\n", + "input_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 768])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embedding_layer.average_position_embeddings = embedding_layer.position_embeddings.weight.mean(dim=0, keepdim=True)\n", + "\n", + "def 
embedding_forward(self, input_ids, token_type_ids=None): \n", + " if token_type_ids is None:\n", + " token_type_ids = torch.zeros_like(input_ids)\n", + " \n", + " word_embeddings = self.word_embeddings(input_ids)\n", + " position_embeddings = self.average_position_embeddings\n", + " token_type_embeddings = self.token_type_embeddings(token_type_ids)\n", + " \n", + " embeddings = word_embeddings + position_embeddings + token_type_embeddings\n", + " embeddings = self.LayerNorm(embeddings)\n", + " return embeddings\n", + "\n", + "embeddings = embedding_forward(embedding_layer, input_ids)\n", + "embeddings.size()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([30522, 768])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def embedding_get_all(self):\n", + " all_embeddings = self.word_embeddings.weight\n", + " token_type_ids = torch.zeros(all_embeddings.size(0), dtype=torch.long)\n", + " token_type_embeddings = self.token_type_embeddings(token_type_ids)\n", + " all_embeddings = all_embeddings + self.average_position_embeddings + token_type_embeddings\n", + " return all_embeddings\n", + "\n", + "all_embeddings = embedding_get_all(embedding_layer)\n", + "all_embeddings.size()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled_linzhuo_maskedlm.ipynb b/Untitled_linzhuo_maskedlm.ipynb new file mode 100644 index 00000000000000..20caa7bf8b9416 --- /dev/null +++ b/Untitled_linzhuo_maskedlm.ipynb @@ -0,0 +1,1082 @@ +{ + "cells": [ + 
{ + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "# import seaborn as sns\n", + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/17/2019 18:31:04 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt\n", + "01/17/2019 18:31:04 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "01/17/2019 18:31:04 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + 
"\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "def convert_text_to_examples(text):\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1)\n", + " text_b = m.group(2)\n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a)\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " input_type_ids = []\n", + " if append_special_tokens:\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " 
tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + "\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", + " input_mask = [1] * len(input_ids)\n", + "\n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,\n", + " tokens=tokens,\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " input_type_ids=input_type_ids))\n", + " return features\n", + "\n", + "def copy_and_mask_feature(feature, masked_tokens=None):\n", + " import copy\n", + " tokens = feature.tokens\n", + " masked_positions = [tokens.index(t) for t in masked_tokens if t in tokens] \\\n", + " if masked_tokens is not None else range(len(tokens))\n", + " assert len(masked_positions) > 0\n", + " masked_feature_copies = []\n", + " for masked_pos in masked_positions:\n", + " feature_copy = copy.deepcopy(feature)\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, masked_positions" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + 
"source": [ + "def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20):\n", + " def print_pair(token, prob, end_str='', hit_mark=' '):\n", + " if i < firstk:\n", + " # token = token.replace('', '').replace('\\n', '/n')\n", + " print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)\n", + " \n", + " ret = None\n", + " for i in range(len(tokens)):\n", + " ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " print_pair(tokens[i], prob_, end_str='\\t')\n", + " values, indices = probs[i].topk(topk)\n", + " top_pairs = []\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " hit_mark = '*' if ind == ind_ else ' '\n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\\n')\n", + " top_pairs.append((token, prob))\n", + " if tokens[i] == \"[MASK]\":\n", + " ret = top_pairs\n", + " return ret" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "import colored\n", + "from colored import stylize\n", + "\n", + "def show_abnormals(tokens, probs, show_suggestions=False):\n", + " def gap2color(gap):\n", + " if gap <= 5:\n", + " return 'yellow_1'\n", + " elif gap <= 10:\n", + " return 'orange_1'\n", + " else:\n", + " return 'red_1'\n", + " \n", + " def print_token(token, suggestion, gap):\n", + " if gap == 0:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " if show_suggestions and gap > 5:\n", + " print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " # print('/' + suggestion, end=' 
')\n", + " # print('%.2f' % gap, end=' ')\n", + " \n", + " avg_gap = 0.\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " ind_ = tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " top_prob = probs[i].max().item()\n", + " top_ind = probs[i].argmax().item()\n", + " gap = math.log(top_prob) - math.log(prob_)\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " print_token(tokens[i], suggestion, gap)\n", + " avg_gap += gap\n", + " avg_gap /= (len(tokens) - 2)\n", + " print()\n", + " print(avg_gap)" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "\n", + "def analyze_text(text, masked_tokens=None, show_suggestions=False, show_firstk_probs=20):\n", + " if text[0] in analyzed_cache:\n", + " features, mlm_probs = analyzed_cache[text[0]]\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " tokens = features[0].tokens\n", + " else:\n", + " examples = convert_text_to_examples(text)\n", + " features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " assert len(features) == 1\n", + " features, masked_positions = copy_and_mask_feature(features[0], masked_tokens=masked_tokens)\n", + "\n", + " input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n", + " input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n", + " input_ids = input_ids.to(device)\n", + " input_type_ids = input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(input_ids, input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "\n", + " tokens = features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " bsz, seq_len, vocab_size = mlm_probs.size()\n", + " assert bsz == len(masked_positions)\n", + " # reduced_mlm_probs 
= torch.Tensor(1, seq_len, vocab_size)\n", + " # for i in range(seq_len):\n", + " # reduced_mlm_probs[0, i] = mlm_probs[i, i]\n", + " reduced_mlm_probs = torch.Tensor(1, len(masked_positions), vocab_size)\n", + " for i, pos in enumerate(masked_positions):\n", + " reduced_mlm_probs[0, i] = mlm_probs[i, pos]\n", + " mlm_probs = reduced_mlm_probs\n", + " tokens = [tokens[i] for i in masked_positions]\n", + " \n", + " analyzed_cache[text[0]] = (features, mlm_probs)\n", + " \n", + " top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs)\n", + " if not given_mask:\n", + " show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)\n", + " return top_pairs" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/qsj/miniconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", + " return f(*args, **kwds)\n" + ] + } + ], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "110300" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train =pd.read_csv('/nas/xd/data/gan_prompt_remain_OPENAI_TOKENED_new2.txt',delimiter='\\t',header=None,quotechar='&')\n", + "len(df_train)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "ss = [row[0]+' '+row[1] for row in df_train[[3,4]].values]" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why didn't the skeleton go to the dance? 
He had no-BODY to go with.\"" + ] + }, + "execution_count": 215, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sss=ss[22]\n", + "sss" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": {}, + "outputs": [], + "source": [ + "sss =\"why didn ' t the girls come to the party ? i had no - one to party with .\"" + ] + }, + { + "cell_type": "code", + "execution_count": 230, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/17/2019 19:40:30 - INFO - examples.extract_features - tokens: [CLS] why didn ' t the girls come to the party ? i had no - one to party with . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 3 | . 1 | the 1 | ) 1 | , 1 | \" \n", + " 100 | why \t*100 | why 0 | and 0 | but 0 | \" 0 | ' \n", + " 63 | didn \t* 63 | didn 15 | couldn 9 | wouldn 5 | don 3 | hadn \n", + " 100 | ' \t*100 | ' 0 | - 0 | , 0 | = 0 | ` \n", + " 100 | t \t*100 | t 0 | d 0 | s 0 | n 0 | ts \n", + " 69 | the \t* 69 | the 9 | other 6 | my 3 | these 2 | any \n", + " 32 | girls \t* 32 | girls 8 | boys 7 | guys 4 | police 4 | others \n", + " 73 | come \t* 73 | come 25 | go 0 | get 0 | stay 0 | came \n", + " 98 | to \t* 98 | to 1 | for 0 | into 0 | at 0 | after \n", + " 42 | the \t* 42 | the 40 | my 5 | this 3 | our 3 | that \n", + " 83 | party \t* 83 | party 3 | house 2 | club 2 | parties 1 | dance \n", + " 99 | ? \t* 99 | ? 0 | when 0 | . 0 | because 0 | , \n", + " 38 | i \t* 38 | i 20 | she 16 | they 10 | we 5 | he \n", + " 81 | had \t* 81 | had 17 | have 0 | was 0 | saw 0 | has \n", + " 100 | no \t*100 | no 0 | twenty 0 | number 0 | non 0 | zero \n", + " 88 | - \t* 88 | - 4 | other 2 | ' 1 | real 1 | . 
\n", + " 99 | one \t* 99 | one 0 | ones 0 | friends 0 | girls 0 | girl \n", + " 100 | to \t*100 | to 0 | i 0 | they 0 | a 0 | the \n", + " 15 | party \t* 15 | party 9 | be 8 | celebrate 8 | dance 7 | play \n", + " 97 | with \t* 97 | with 1 | to 1 | for 0 | tonight 0 | around \n", + " 96 | . \t* 96 | . 2 | ! 2 | ; 0 | ? 0 | | \n", + " 0 | [SEP] \t 5 | i 5 | \" 2 | and 2 | the 2 | . \n", + "\u001b[38;5;15m\u001b[48;5;0mwhy \u001b[0m\u001b[38;5;15m\u001b[48;5;0mdidn \u001b[0m\u001b[38;5;15m\u001b[48;5;0m' \u001b[0m\u001b[38;5;15m\u001b[48;5;0mt \u001b[0m\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\u001b[38;5;15m\u001b[48;5;0mgirls \u001b[0m\u001b[38;5;15m\u001b[48;5;0mcome \u001b[0m\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\u001b[38;5;15m\u001b[48;5;0mparty \u001b[0m\u001b[38;5;15m\u001b[48;5;0m? \u001b[0m\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\u001b[38;5;15m\u001b[48;5;0mhad \u001b[0m\u001b[38;5;15m\u001b[48;5;0mno \u001b[0m\u001b[38;5;15m\u001b[48;5;0m- \u001b[0m\u001b[38;5;15m\u001b[48;5;0mone \u001b[0m\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\u001b[38;5;15m\u001b[48;5;0mparty \u001b[0m\u001b[38;5;15m\u001b[48;5;0mwith \u001b[0m\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "0.0\n" + ] + } + ], + "source": [ + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "text = [\"I went to school by myself. I had no seat on the bus.\"]\n", + "# text = [\"I thought that John defeated Mary. I was wrong. _ beat _.\"]\n", + "# text = [\"Did John defeat Mary? No, _ beat _.\"]\n", + "# text = [\"That mary defeated John contradicts the fact that _ beat _.\"]\n", + "# text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. 
Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "\n", + "text = [sss] #\n", + "analyze_text(text, show_firstk_probs=100)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def words2heads(attns, tokens, words):\n", + " positions = [tokens.index(word) for word in words]\n", + "\n", + " for layer in range(config.num_hidden_layers):\n", + " for head in range(config.num_attention_heads):\n", + " for pos_indices in [(0, 1), (1, 0)]:\n", + " from_pos, to_pos = positions[pos_indices[0]], positions[pos_indices[1]]\n", + " if attns[layer][head][from_pos].max(0)[1].item() == to_pos:\n", + " print('Layer %d, head %d: %s -> %s' % (layer, head, tokens[from_pos], tokens[to_pos]), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + "\n", + "def head2words(attns, tokens, layer, head):\n", + " for from_pos in range(len(tokens)):\n", + " to_pos = attns[layer][head][from_pos].max(0)[1].item()\n", + " from_word, to_word = tokens[from_pos], tokens[to_pos]\n", + " if from_word.isalpha() and to_word.isalpha():\n", + " print('%s @ %d -> %s @ %d' % (from_word, from_pos, to_word, to_pos), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " \n", + "special_tokens = ['[CLS]', '[SEP]']\n", + "\n", + "def get_salient_heads(attns, tokens, attn_thld=0.5):\n", + " for layer in 
range(config.num_hidden_layers):\n", + " for head in range(config.num_attention_heads):\n", + " pos_pairs = []\n", + " for from_pos in range(1, len(tokens) - 1): # skip [CLS] and [SEP]\n", + " top_attn, to_pos = attns[layer][head][from_pos].max(0)\n", + " top_attn, to_pos = top_attn.item(), to_pos.item()\n", + " from_word, to_word = tokens[from_pos], tokens[to_pos]\n", + "# if from_word.isalpha() and to_word.isalpha() and top_attn >= attn_thld:\n", + " if abs(from_pos - to_pos) <= 1:\n", + "# print('Layer %d, head %d: %s @ %d -> %s @ %d' % (layer, head, from_word, from_pos, to_word, to_pos), end='\\t')\n", + "# print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " pos_pairs.append((from_pos, to_pos))\n", + " \n", + " ratio = len(pos_pairs) / (len(tokens) - 2)\n", + " if ratio > 0.5:\n", + " print(ratio)\n", + " for from_pos, to_pos in pos_pairs:\n", + " print('Layer %d, head %d: %s @ %d -> %s @ %d' % (layer, head, tokens[from_pos], from_pos, tokens[to_pos], to_pos), end='\\t')\n", + " print(attns[layer][head][from_pos].topk(5)[0].data)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/10/2019 21:46:20 - INFO - examples.extract_features - tokens: [CLS] jim laughed because he was so happy . 
[SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jim @ 1 -> jim @ 1\ttensor([0.7248, 0.0842, 0.0656, 0.0407, 0.0319], device='cuda:0')\n" + ] + } + ], + "source": [ + "# text, words = [\"The trophy doesn't fit into the brown suitcase because the it is too large.\"], ['fit', 'large']\n", + "# text, words = [\"Mary couldn't beat John in the match because he was too strong.\"], ['beat', 'strong']\n", + "text, words = [\"John is taller than Mary because he is older.\"], ['taller', 'older']\n", + "# text, words = [\"The red ball is heavier than the blue ball because the red ball is bigger.\"], ['heavier', 'bigger']\n", + "text, words = [\"Jim laughed because he was so happy.\"], ['cried', 'sad']\n", + "# text, words = [\"Jim ate the cake quickly because he was so hungry.\"], ['ate', 'hungry']\n", + "# text, words = [\"Jim drank the juice quickly because he was so thirsty.\"], ['drank', 'thirsty']\n", + "# text, words = [\"Tom's drawing hangs high. It is above Susan's drawing\"], ['high', 'above']\n", + "# text, words = [\"Tom's drawing hangs low. It is below Susan's drawing\"], ['low', 'below']\n", + "# text, words = [\"John is taller than Mary . Mary is shorter than John.\"], ['taller', 'shorter']\n", + "# text, words = [\"The drawing is above the cabinet. The cabinet is below the drawing\"], ['above', 'below']\n", + "# text, words = [\"Jim is very thin . 
He is not fat.\"], ['thin', 'fat']\n", + "\n", + "features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)\n", + "input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)\n", + "input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)\n", + "mlm_logits, _ = model(input_ids, input_type_ids)\n", + "mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "tokens = features[0].tokens\n", + "# top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)\n", + "\n", + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "# plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)\n", + "\n", + "attns = hypo[attn_name]\n", + " \n", + "# words2heads(attns, tokens, words)\n", + "head2words(attns, tokens, 2, 10)\n", + "# get_salient_heads(attns, tokens, attn_thld=0.0)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "0,2\t-1\n", + "0,3\t-1\n", + "0,10\t+1 动宾\n", + "1,1\t+1 动介\n", + "1,4\t-1\n", + "1,11\t0\n", + "2,0\t+1**\n", + "2,6\t0**\n", + "2,9\t+1**\n", + "3,5\t-1\n", + "7,4\t-1\n", + "11,8\t0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "head_size = config.hidden_size // config.num_attention_heads\n", + "layer = 1\n", + "head = 1 # 2, 3, 10\n", + "wq = model.bert.encoder.layer[layer].attention.self.query.weight.data.view(-1, config.num_attention_heads, head_size).permute(1, 0, 2)\n", + "wk = model.bert.encoder.layer[layer].attention.self.key.weight.data.view(-1, config.num_attention_heads, head_size).permute(1, 0, 2)\n", + "\n", + "wqk = torch.bmm(wq, wk.transpose(-1, -2))\n", + "# (wqk * wqk.transpose(-1, -2)).sum((1, 2)) / (wqk * wqk).sum((1, 2))\n", + "plt.imshow(wqk[head]*wqk[head])\n", + "plt.show()\n", + "\n", + "# q = torch.matmul(pos_emb, wq)\n", + "# k = torch.matmul(pos_emb_prev, wk)\n", + "# (q * k).sum((-2, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "pos_emb = model.bert.embeddings.position_embeddings.weight.data\n", + "pos_emb_prev = torch.zeros_like(pos_emb)\n", + "pos_emb_next = torch.zeros_like(pos_emb)\n", + "pos_emb_prev[1:] = pos_emb[:-1]\n", + "pos_emb_next[:-1] = pos_emb[1:]\n", + "pos_emb, pos_emb_prev, pos_emb_next = pos_emb[1:-1], pos_emb_prev[1:-1], pos_emb_next[1:-1]\n", + "\n", + "# pos_q = torch.matmul(pos_emb, wk[head])\n", + "# plt.imshow(pos_q[:32])\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": 
[ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.',\n", + " 'Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare sentences with same / opposite meaning, 2nd order\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " # \"Mary came before/after John. 
_ was late/early .\",\n", + " # yes / no, 2nd order\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality, 2nd order\n", + " \"John said/thought that the red ball was heavier than the blue ball. He was wrong. The _ ball was heavier\",\n", + " \"John was wrong in saying/thinking that the red ball was heavier than the blue ball. The _ ball was heavier\",\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong/right.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . \",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. 
_ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + " if a not in tokenizer.vocab:\n", + " ce\n", + " print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(\"The trophy doesn't fit into the brown suitcase because [it] is too large.\",\n", + " 'fit into:large/small'),\n", + " ('Joan made sure to thank 
Susan for all the help [she] had recieved.',\n", + " 'thank:receive/give'),\n", + " ('The delivery truck zoomed by the school bus because [it] was going so fast.',\n", + " 'zoom by:fast/slow'),\n", + " ('Frank felt vindicated when his longtime rival Bill revealed that [he] was the winner of the competition.',\n", + " 'vindicated/crushed:be the winner'),\n", + " ('The large ball crashed right through the table because [it] was made of steel.',\n", + " 'crash through:[hard]/[soft]'),\n", + " (\"John couldn't see the stage with Billy in front of him because [he] is so short.\",\n", + " '[block]:short/tall'),\n", + " ('Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.',\n", + " 'down to:top/bottom'),\n", + " ('Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.',\n", + " 'beat:good/bad'),\n", + " (\"Sam's drawing was hung just above Tina's and [it] did look much better with another one below it.\",\n", + " 'above/below'),\n", + " ('Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.',\n", + " 'better/worse:study hard'),\n", + " ('The firemen arrived after the police because [they] were coming from so far away.',\n", + " 'after/before:far away'),\n", + " (\"Frank was upset with Tom because the toaster [he] had bought from him didn't work.\",\n", + " 'be upset with:buy from not work/sell not work'),\n", + " ('The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first.',\n", + " 'above/below:moved first'),\n", + " ('Pete envies Martin although [he] is very successful.', 'although/because'),\n", + " ('I poured water from the bottle into the cup until [it] was empty.',\n", + " 'pour:empty/full'),\n", + " (\"Sid explained his theory to Mark but [he] couldn't convince him.\",\n", + " 'explain:convince/understand'),\n", + " (\"Susan knew that Ann's son had been in a car accident, so [she] told her about it.\",\n", + " '?know 
tell:so/because'),\n", + " (\"Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.\",\n", + " 'beat:younger/older'),\n", + " ('In the middle of the outdoor concert, the rain started falling, but [it] continued until 10.',\n", + " 'but/and'),\n", + " ('Ann asked Mary what time the library closes, because [she] had forgotten.',\n", + " 'because/but'),\n", + " ('If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.',\n", + " 'fool:get/lose'),\n", + " ('Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.',\n", + " '?stop normal/stop abnormal:strange'),\n", + " (\"I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.\",\n", + " 'put ... into filled with ... :get in/get out'),\n", + " ('The dog chased the cat, which ran up a tree. [It] waited at the bottom.',\n", + " 'up:at the bottom/at the top'),\n", + " ('John was doing research in the library when he heard a man humming and whistling. [He] was very annoyed.',\n", + " 'hear ... humming and whistling:annoyed/annoying'),\n", + " ('John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.',\n", + " 'see ... juggling watermelons:impressed/impressive'),\n", + " ('Jane knocked on the door, and Susan answered it. [She] invited her to come out.',\n", + " 'visit:invite come out/invite come in'),\n", + " ('Jackson was greatly influenced by Arnold, though [he] lived two centuries later.',\n", + " 'influence:later/earlier'),\n", + " ('The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.',\n", + " 'change:hard/easy'),\n", + " ('Fred is the only man still alive who remembers my great-grandfather. 
[He] is a remarkable man.',\n", + " 'alive:is/was'),\n", + " (\"In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.\",\n", + " 'better equipped and large:defeated/victorious'),\n", + " ('When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority.',\n", + " 'be full of:minority/majority'),\n", + " ('Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .',\n", + " 'like over:more/fewer'),\n", + " ('We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .',\n", + " 'place on all:not enough/too many'),\n", + " (\"Steve follows Fred's example in everything. [He] admires him hugely.\",\n", + " 'follow:admire/influence'),\n", + " (\"The table won't fit through the doorway because [it] is too wide.\",\n", + " 'fit through:wide/narrow'),\n", + " ('Grace was happy to trade me her sweater for my jacket. 
She thinks [it] looks dowdy on her.',\n", + " 'trade:dowdy/great'),\n", + " ('John hired Bill to take care of [him] .',\n", + " 'hire/hire oneself to:take care of'),\n", + " ('John promised Bill to leave, so an hour later [he] left.', 'promise/order'),\n", + " (\"Jane knocked on Susan's door but [she] did not get an answer.\",\n", + " 'knock:get an answer/answer'),\n", + " ('Joe paid the detective after [he] received the final report on the case.',\n", + " 'pay:receive/deliver'),\n", + " ('Bill passed the half-empty plate to John because [he] was full.',\n", + " 'pass the plate:full/hungry'),\n", + " ('George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.',\n", + " 'even though/because/not'),\n", + " (\"Jane gave Joan candy because [she] wasn't hungry.\",\n", + " 'give:not hungry/hungry'),\n", + " ('James asked Robert for a favor but [he] was refused.',\n", + " 'ask for a favor:refuse/be refused`'),\n", + " ('Kirilov ceded the presidency to Shatov because [he] was less popular.',\n", + " 'cede:less popular/more popular'),\n", + " ('Emma did not pass the ball to Janie although [she] saw that she was open.',\n", + " 'not pass although:see open/open')]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "# [(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in 
examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])\n", + "[(eg[0]['sentence'], eg[0]['relational_word']) for index, eg in groups.items() if '/' in eg[0]['relational_word']]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. 
/ (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = 
plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config.num" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Untitled_zeoliao.ipynb b/Untitled_zeoliao.ipynb new file mode 100644 index 00000000000000..104e29a6c09ee9 --- /dev/null +++ b/Untitled_zeoliao.ipynb @@ -0,0 +1,1501 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + 
"source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 336, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import itertools\n", + "from itertools import product, chain\n", + "import numpy as np\n", + "\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def reverse(l):\n", + " return list(reversed(l))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def mask(ent_str):\n", + " tokens = ent_str.strip().split()\n", + " if len(tokens) == 1:\n", + " return '[%s]' % tokens[0]\n", + " elif len(tokens) == 2:\n", + " assert tokens[0] == 'the', ent_str\n", + " return '%s [%s]' % (tokens[0], tokens[1])\n", + " else:\n", + " assert False, ent_str" + ] + }, + { + "cell_type": "code", + "execution_count": 276, + "metadata": {}, + "outputs": [], + "source": [ + "A_template = \"{rel_prefix} {dt} {ent0} {rel} {dt} {ent1} {rel_suffix}\"\n", + "B_template = [\"{pred_prefix} {dt} {ent} {pred}\", \"{pred_prefix} {pred} {dt} {ent}\"]\n", + "\n", + "# causal_templates = [[\"{A} because {B}.\"],# \"{B} so {A}.\"], \n", + "# [\"{A} so {B}.\"],# \"{B} because {A}.\"]\n", + "# ]\n", + "# turning_templates = [[\"{A} although {B}.\"],# \"{B} but {A}.\"], \n", + "# [\"{A} but {B}.\"],# \"{B} although 
{A}.\"]\n", + "# ]\n", + "\n", + "causal_templates = [[\"{A} {conj} {B}.\"],# \"{B} so {A}.\"], \n", + " [\"{A} {conj} {B}.\"],# \"{B} because {A}.\"]\n", + " ]\n", + "turning_templates = [[\"{A} {conj} {B}.\"],# \"{B} but {A}.\"], \n", + " [\"{A} {conj} {B}.\"],# \"{B} although {A}.\"]\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 405, + "metadata": {}, + "outputs": [], + "source": [ + "def make_sentences(A_template, B_template, causal_templates, turning_templates,\n", + " index=-1, orig_sentence='', entities=[\"John\", \"Mary\"], entity_substitutes=None, determiner=\"\", \n", + " packed_relations=[\"rel/~rel\", \"rev_rel/~rev_rel\"], packed_relation_substitutes=None,\n", + " relation_prefix=\"\", relation_suffix=\"\",\n", + " packed_predicates=[\"pred0/~pred0\", \"pred1/~pred1\"], predicate_prefix=\"\", prepositive_pred=False,\n", + " predicate_dichotomy=True, reverse_causal=False, conjunctions=[[\"because\", \"so\"], [\"although\", \"but\"]]):\n", + " assert entities[0].lower() in tokenizer.vocab , entities[0]\n", + " assert entities[1].lower() in tokenizer.vocab , entities[1]\n", + " \n", + " def form_As(packed_rels):\n", + " relations, neg_relations = zip(*[rel.split(\"/\") for rel in packed_rels])\n", + " relations, neg_relations = list(relations), list(neg_relations)\n", + "\n", + " As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]\n", + " negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + neg_relations[:1], reverse(entities) + reverse(neg_relations)[:1]]]\n", + " return As, negAs\n", + " \n", + " As, negAs = form_As(packed_relations)\n", + " \n", + " substituted_As, substituted_negAs = [], []\n", + " for packed_rel_subs in 
zip(*packed_relation_substitutes):\n", + " subs_As, subs_negAs = form_As(packed_rel_subs)\n", + " substituted_As += subs_As\n", + " substituted_negAs += subs_negAs\n", + " \n", + " if \"/\" in packed_predicates[0]:\n", + " predicates, neg_predicates = zip(*[pred.split(\"/\") for pred in packed_predicates])\n", + " predicates, neg_predicates = list(predicates), list(neg_predicates)\n", + " else:\n", + " predicates, neg_predicates = packed_predicates, []\n", + " \n", + " B_template = B_template[int(prepositive_pred)]\n", + " Bs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, predicates)]\n", + " negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, neg_predicates)]\n", + " if predicate_dichotomy:\n", + " Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, reversed(neg_predicates))]\n", + " negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, reversed(predicates))]\n", + "\n", + " def form_sentences(sentence_template, As, Bs, conj):\n", + " return [\" \".join(sentence_template.format(A=A, B=B, conj=conj).split()) for A, B in product(As, Bs)]\n", + "\n", + " def form_all_sentences(As, negAs, Bs, negBs):\n", + " causal_sentences = []\n", + " causal_conj = conjunctions[0][int(reverse_causal)]\n", + " for causal_template in causal_templates[int(reverse_causal)]:\n", + " for A, B in [(As, Bs), (negAs, negBs)]:\n", + " causal_sentences += form_sentences(causal_template, A, B, causal_conj)\n", + "\n", + " turning_sentences = []\n", + " turning_conj = conjunctions[1][int(reverse_causal)]\n", + " for turning_template in turning_templates[int(reverse_causal)]:\n", + " for A, B in [(As, negBs), (negAs, Bs)]:\n", + " turning_sentences += 
form_sentences(turning_template, A, B, turning_conj)\n", + "\n", + " sentences = causal_sentences + turning_sentences\n", + " return sentences, causal_sentences, turning_sentences\n", + " \n", + " sentences, causal_sentences, turning_sentences = form_all_sentences(As, negAs, Bs, negBs)\n", + "# substituted_sentences = sentences\n", + "\n", + " if packed_relation_substitutes is not None:\n", + " substituted_sentences = form_all_sentences(substituted_As, substituted_negAs, Bs, negBs)[0]\n", + " \n", + " substituted_sent_groups = list(zip(sentences, substituted_sentences))\n", + "\n", + " if entity_substitutes is not None:\n", + " for sub in entity_substitutes:\n", + " for ent in sub:\n", + " assert ent.lower() in tokenizer.vocab , ent + \" not in BERT vocab\"\n", + " assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes\n", + " assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6\n", + "\n", + " entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1]))\n", + " substituted_sent_groups = [[sent.replace(entities[0], sub[0]).replace(entities[1], sub[1]) \n", + " for sent in sent_group for sub in entity_substitutes] for sent_group in substituted_sent_groups]\n", + " return causal_sentences, turning_sentences, substituted_sent_groups\n", + "\n", + "# if entity_substitutes is not None:\n", + "# for sub in entity_substitutes:\n", + "# for ent in sub:\n", + "# assert ent.lower() in tokenizer.vocab , ent + \" not in BERT vocab\"\n", + "# assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes\n", + "# assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6 \n", + " \n", + "# entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1]))\n", + "# substituted_sentences = [sent.replace(entities[0], sub[0]).replace(entities[1], sub[1]) \n", + "# for 
sent in substituted_sentences for sub in entity_substitutes]\n", + "# return causal_sentences, turning_sentences, substituted_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 394, + "metadata": {}, + "outputs": [], + "source": [ + "def make_sentences(A_template, B_template, causal_templates, turning_templates,\n", + " index=-1, orig_sentence='', entities=[\"John\", \"Mary\"], entity_substitutes=None, determiner=\"\", \n", + " packed_relations=[\"rel/~rel\", \"rev_rel/~rev_rel\"], packed_relation_substitutes=None,\n", + " relation_prefix=\"\", relation_suffix=\"\",\n", + " packed_predicates=[\"pred0/~pred0\", \"pred1/~pred1\"], predicate_prefix=\"\", prepositive_pred=False,\n", + " predicate_dichotomy=True, reverse_causal=False, conjunctions=[[\"because\", \"so\"], [\"although\", \"but\"]]):\n", + " assert entities[0].lower() in tokenizer.vocab , entities[0]\n", + " assert entities[1].lower() in tokenizer.vocab , entities[1]\n", + " \n", + " relations, neg_relations = zip(*[rel.split(\"/\") for rel in packed_relations])\n", + " relations, neg_relations = list(relations), list(neg_relations)\n", + " \n", + " As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]\n", + " negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) \n", + " for ent0, ent1, rel in [entities + neg_relations[:1], reverse(entities) + reverse(neg_relations)[:1]]]\n", + " \n", + " if \"/\" in packed_predicates[0]:\n", + " predicates, neg_predicates = zip(*[pred.split(\"/\") for pred in packed_predicates])\n", + " predicates, neg_predicates = list(predicates), list(neg_predicates)\n", + " else:\n", + " predicates, neg_predicates = packed_predicates, []\n", + " \n", + " B_template = B_template[int(prepositive_pred)]\n", + " Bs = 
[B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, predicates)]\n", + " negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, neg_predicates)]\n", + " if predicate_dichotomy:\n", + " Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, reversed(neg_predicates))]\n", + " negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) \n", + " for ent, pred in zip(entities, reversed(predicates))]\n", + "\n", + " def form_sentences(sentence_template, As, Bs, conj):\n", + " return [\" \".join(sentence_template.format(A=A, B=B, conj=conj).split()) for A, B in product(As, Bs)]\n", + "\n", + " causal_sentences = []\n", + " causal_conj = conjunctions[0][int(reverse_causal)]\n", + " for causal_template in causal_templates[int(reverse_causal)]:\n", + " for A, B in [(As, Bs), (negAs, negBs)]:\n", + " causal_sentences.extend(form_sentences(causal_template, A, B, causal_conj))\n", + "\n", + " turning_sentences = []\n", + " turning_conj = conjunctions[1][int(reverse_causal)]\n", + " for turning_template in turning_templates[int(reverse_causal)]:\n", + " for A, B in [(As, negBs), (negAs, Bs)]:\n", + " turning_sentences.extend(form_sentences(turning_template, A, B, turning_conj))\n", + " \n", + " sentences = causal_sentences + turning_sentences\n", + " substituted_sentences = sentences\n", + "\n", + " if packed_relation_substitutes is not None:\n", + " packed_relation_substitutes = list(itertools.product(packed_relations[:1] + packed_relation_substitutes[0], \n", + " packed_relations[1:] + packed_relation_substitutes[1]))\n", + "\n", + " substituted_sentences = []\n", + " for packed_sub_relations in packed_relation_substitutes:\n", + " sub_relations, sub_neg_relations = zip(*[rel.split(\"/\") for rel in 
packed_sub_relations])\n", + " \n", + " # neg_relations should be replaced first (maximum matching), otherwise there will be wrong sentences\n", + " substituted_sentences += [sent.replace(neg_relations[0], sub_neg_relations[0])\n", + " .replace(neg_relations[1], sub_neg_relations[1])\n", + " .replace(relations[0], sub_relations[0])\n", + " .replace(relations[1], sub_relations[1]) \n", + " for sent in sentences]\n", + "# print(relations[0] + \" -> \" + sub_relations[0])\n", + "# print(relations[1] + \" -> \" + sub_relations[1])\n", + "# print(neg_relations[0] + \" -> \" + sub_neg_relations[0])\n", + "# print(neg_relations[1] + \" -> \" + sub_neg_relations[1])\n", + "# for sent, subs_sent in zip(sentences, substituted_sentences):\n", + "# print(sent + \" -> \" + subs_sent)\n", + " \n", + " substituted_sentences = list(set(substituted_sentences))\n", + " \n", + "# if entity_substitutes is not None:\n", + "# for sub in entity_substitutes:\n", + "# for ent in sub:\n", + "# assert ent.lower() in tokenizer.vocab , ent + \" not in BERT vocab\"\n", + "# assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes\n", + "# assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6 \n", + " \n", + "# entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1]))\n", + "# substituted_sentences = [sent.replace(entities[0], sub[0]).replace(entities[1], sub[1]) \n", + "# for sent in substituted_sentences for sub in entity_substitutes]\n", + " return causal_sentences, turning_sentences, substituted_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 443, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n", + "import torch\n", + "import random\n", + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 435, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "tensor([[8., 9., 0., 0.],\n", + " [6., 3., 5., 6.],\n", + " [2., 5., 0., 9.],\n", + " [2., 9., 1., 5.],\n", + " [3., 0., 8., 8.],\n", + " [6., 5., 8., 2.],\n", + " [3., 0., 6., 0.],\n", + " [4., 7., 1., 1.]])" + ] + }, + "execution_count": 435, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A = torch.randint(10, size=(8, 2))\n", + "B = torch.randint(10, size=(8, 2))\n", + "d = torch.cat([A, B], dim=-1)\n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 427, + "metadata": {}, + "outputs": [], + "source": [ + "sampler = RandomSampler(dataset)\n", + "dataloader = DataLoader(dataset, sampler=sampler, batch_size=3)" + ] + }, + { + "cell_type": "code", + "execution_count": 446, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]" + ] + }, + "execution_count": 446, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_size = 4\n", + "idx_list = list(range(10))\n", + "# random.shuffle(idx_list)\n", + "n_batches = math.ceil(len(idx_list) / batch_size)\n", + "[idx_list[i * batch_size: (i + 1) * batch_size] for i in range(n_batches)]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def get_frame(frames, index):\n", + " for frame in frames:\n", + " if frame['index'] == index:\n", + " return frame\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 389, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "30" + ] + }, + "execution_count": 389, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frames = \\\n", + "[\n", + " {\n", + " \"index\": 2,\n", + " \"orig_sentence\": \"The trophy doesn't fit into the brown suitcase because [it] is too large/small.\",\n", + " \"entities\": [\"trophy\", \"suitcase\"],\n", + " \"entity_substitutes\": [[\"ball\", \"toy\"], [\"bag\", 
\"box\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"doesn't fit into/can fit into\", \"doesn't hold/can hold\"],\n", + " \"packed_relation_substitutes\": [[\"can't be put into/can be put into\"], [\"doesn't have enough room for/has enough room for\"]],\n", + " \"packed_predicates\": [\"is large/isn't large\", \"is small/isn't small\"],\n", + " },\n", + " {\n", + " \"index\": 4,\n", + " \"orig_sentence\": \"Joan made sure to thank Susan for all the help [she] had recieved/given.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Tiffany\"]],\n", + " \"packed_relations\": [\"thanked/didn't thank\", \"took good care of/didn't good care of\"],\n", + " \"packed_relation_substitutes\": [[\"felt grateful to/didn't feel grateful to\"], [\"was appreciated by/wasn't appreciated by\"]],\n", + " \"packed_predicates\": [\"had received a lot of help/hadn't received a lot of help\", \"had given a lot of help/hadn't given a lot of help\"],\n", + " \"predicate_dichotomy\": False,\n", + " },\n", + " {\n", + " \"index\": 4000,\n", + " \"orig_sentence\": \"John gave a lot of money to Susan because [he] was very rich/poor.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Linda\"]],\n", + " \"packed_relations\": [\"gave a lot of money to/didn't give a lot of money to\", \"received a lot of money from/didn't receive a lot of money from\"],\n", + " \"packed_relation_substitutes\": [[\"subsidized/didn't subsidize\"], [\"borrowed a lot of money from/didn't borrow any money from\"]],\n", + " \"packed_predicates\": [\"was rich/wasn't rich\", \"was poor/wasn't poor\"],\n", + " },\n", + " {\n", + " \"index\": 10,\n", + " \"orig_sentence\": \"The delivery truck zoomed by the school bus because [it] was going so fast/slow.\",\n", + " \"entities\": [\"truck\", \"bus\"],\n", + " \"entity_substitutes\": [[\"car\", \"ambulance\"], 
[\"bicycle\", \"tram\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"overtook/couldn't overtake\", \"fell far behind/didn't fall far behind\"],\n", + " \"packed_relation_substitutes\": [[\"zoomed by/didn't pass\"], [\"was left behind/wasn't left far behind\"]],\n", + " \"packed_predicates\": [\"was going fast/wasn't going fast\", \"was going slow/wasn't going slow\"],\n", + " },\n", + " ## didn't defeated, replace error: didn't defeat -> defeated\n", + " {\n", + " \"index\": 12,\n", + " \"orig_sentence\": \"Frank felt vindicated/crushed when his longtime rival Bill revealed that [he] was the winner of the competition.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " \"entity_substitutes\": [[\"David\", \"Michael\"], [\"Mary\", \"Linda\"]],\n", + " \"packed_relations\": [\"beat/didn't beat\", \"lost to/didn't lose to\"],\n", + " \"packed_relation_substitutes\": [[\"defeated/didn't defeat\"], [\"was defeated by/wasn't defeated by\"]],\n", + " \"relation_suffix\": \"in the game\",\n", + " \"packed_predicates\": [\"was happy/wasn't happy\", \"was sad/wasn't sad\"],\n", + " \"reverse_causal\": True\n", + " },\n", + " {\n", + " \"index\": 16,\n", + " \"orig_sentence\": \"The large ball crashed right through the table because [it] was made of steel/styrofoam.\",\n", + " \"entities\": [\"ball\", \"board\"],\n", + " \"entity_substitutes\": [[\"bullet\", \"arrow\"], [\"shield\", \"disk\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"crashed right through/didn't crash through\", \"failed to block/blocked\"],\n", + " \"packed_relation_substitutes\": [[\"penetrated through/didn't penetrate through\"], [\"failed to stop/stopped\"]],\n", + " \"packed_predicates\": [\"was hard/wasn't hard\", \"was soft/wasn't soft\"],\n", + " },\n", + " {\n", + " \"index\": 18,\n", + " \"orig_sentence\": \"John couldn't see the stage with Billy in front of him because [he] is so short.\",\n", + " \"entities\": [\"John\", \"Susan\"],\n", + " 
\"entity_substitutes\": [[\"David\", \"Edward\"], [\"Betty\", \"Donna\"]],\n", + " \"packed_relations\": [\"couldn't see the stage behind/could see the stage behind\", \"blocked the view of/didn't block the view of\"],\n", + " \"packed_relation_substitutes\": [[\"couldn't catch sight of the stage behind/could catch sight of the stage behind\"], [\"obstructed the sight of/didn't obstruct the sight of\"]],\n", + " \"packed_predicates\": [\"is short/isn't short\", \"is tall/isn't tall\"],\n", + " },\n", + " {\n", + " \"index\": 20,\n", + " \"orig_sentence\": \"Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.\",\n", + " \"entities\": [\"Brian\", \"Amy\"],\n", + " \"entity_substitutes\": [[\"Charles\", \"Paul\"], [\"Emma\", \"Linda\"]],\n", + " \"packed_relations\": [\"threw the schoolbag down to/threw the schoolbag up to\", \"caught the schoolbag thrown down by/caught the schoolbag thrown up by\"],\n", + " \"packed_relation_substitutes\": [[\"cast the schoolbag down to/cast the schoolbag up to\"], [\"took the schoolbag thrown down by/took the schoolbag thrown up by\"]],\n", + " \"packed_predicates\": [\"reached the top of the stairs\", \"reached the bottom of the stairs\"],\n", + " \"conjunctions\": [[\"after\", ], [\"before\", ]]\n", + " },\n", + " ## didn't defeated, replace error: didn't defeat -> defeated\n", + " {\n", + " \"index\": 22,\n", + " \"orig_sentence\": \"Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.\",\n", + " \"entities\": [\"Tom\", \"Sue\"],\n", + " \"entity_substitutes\": [[\"John\", \"David\"], [\"Sally\", \"Susan\"]],\n", + " \"packed_relations\": [\"beat/didn't beat\", \"lost to/didn't lose to\"],\n", + " \"packed_relation_substitutes\": [[\"defeated/didn't defeat\"], [\"was defeated by/wasn't defeated by\"]],\n", + " \"relation_prefix\": \"Running at about the same speed,\",\n", + " \"relation_suffix\": \"in the running race\",\n", + " \"packed_predicates\": [\"had a 
good start/didn't have a good start\", \"had a bad start/didn't have a bad start\"],\n", + " },\n", + "# {\n", + "# \"index\": 26000,\n", + "# \"orig_sentence\": \"Sam's drawing was hung just above Tina's and [it] did look much better with another one below it\",\n", + "# \"entities\": [\"Bob\", \"Wendy\"],\n", + "# \"entity_substitutes\": [[\"Bush\", \"Tim\"], [\"Sandy\", \"Helen\"]],\n", + "# \"packed_relations\": [\"could reach higher than/couldn't reach higher than\", \"reached lower than/didn't reach lower than\"],\n", + "# \"packed_relation_substitutes\": [[\"could jump higher than/couldn't jump higher than\"], [\"jumped lower than/didn't jump lower than\"]],\n", + "# \"packed_predicates\": [\"is tall/is not tall\", \"is short/is not short\"],\n", + "# },\n", + " {\n", + " \"index\": 28,\n", + " \"orig_sentence\": \"Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.\",\n", + " \"entities\": [\"Anna\", \"Andy\"],\n", + " \"entity_substitutes\": [[\"Lucy\", \"Nancy\"], [\"George\", \"Frank\"]],\n", + " \"packed_relations\": [\"did better than/didn't do better than\", \"did worse than/didn't do worse than\"],\n", + " \"packed_relation_substitutes\": [[\"performed better than/didn't perform better than\"], [\"performed worse than/didn't perform worse than\"]],\n", + " \"relation_suffix\": \"on the test\",\n", + " \"packed_predicates\": [\"had studied hard/hadn't studied hard\", \"was lazy in doing homework/wasn't lazy in doing homework\"],\n", + " },\n", + " {\n", + " \"index\": 30,\n", + " \"orig_sentence\": \"The firemen arrived after the police because [they] were coming from so far away.\",\n", + " \"entities\": [\"doctor\", \"police\"],\n", + " \"entity_substitutes\": [[\"worker\", \"employee\"], [\"boss\", \"administrator\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"arrived after/didn't arrive after\", \"arrived before/didn't arrive before\"],\n", + " \"packed_relation_substitutes\": 
[[\"reached here after/didn't reach here after\"], [\"reached here before/didn't reach here before\"]],\n", + " \"packed_predicates\": [\"came from far away/didn't come from far away\", \"came from a close place/didn't come from a close place\"],\n", + " },\n", + " {\n", + " \"index\": 32000,\n", + " \"orig_sentence\": \"Frank was upset with Tom because the toaster [he] had bought from him didn't work.\",\n", + " \"entities\": [\"Betty\", \"Henry\"],\n", + " \"entity_substitutes\": [[\"Amy\", \"Linda\"], [\"Bush\", \"Frank\"]],\n", + " \"packed_relations\": [\"was upset with/was pleased with\", \"was hated by/was loved by\"],\n", + " \"packed_relation_substitutes\": [[\"hated/liked\"], [\"was disliked by/was liked by\"]],\n", + " \"packed_predicates\": [\"had bought didn't work/had bought worked well\", \"had sold didn't work/had sold worked well\"],\n", + " \"predicate_prefix\": \"the toaster\",\n", + " \"predicate_dichotomy\": False,\n", + " },\n", + " {\n", + " \"index\": 36,\n", + " \"orig_sentence\": \"The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first\",\n", + " \"entities\": [\"potatoes\", \"flour\"],\n", + " \"entity_substitutes\": [[\"candy\", \"rice\"], [\"beans\", \"noodles\"]],\n", + " \"determiner\": \"the bag of\",\n", + " \"packed_relations\": [\"had been placed above/hadn't been placed above\", \"had been placed below/hadn't been placed below\"],\n", + " \"packed_relation_substitutes\": [[\"had been put above/hadn't been put above\"], [\"had been put below/hadn't been put below\"]],\n", + " \"packed_predicates\": [\"had to be moved first/couldn't be moved first\", \"had to be moved later/couldn't be moved later\"],\n", + " \"reverse_causal\": True\n", + " },\n", + " {\n", + " \"index\": 38,\n", + " \"orig_sentence\": \"Pete envies Martin although [he] is very successful.\",\n", + " \"entities\": [\"Peter\", \"Mandy\"],\n", + " \"entity_substitutes\": [[\"Martin\", \"Paul\"], [\"Cindy\", \"Emma\"]],\n", + " 
\"packed_relations\": [\"envied/didn't envy\", \"was envied by/wasn't envied by\"],\n", + " \"packed_relation_substitutes\": [[\"was jealous of/wasn't jealous of\"], [\"was admired by/wasn't admired by\"]],\n", + " \"packed_predicates\": [\"failed/didn't fail\", \"was successful/wasn't successful\"],\n", + " },\n", + "# {\n", + "# \"index\": 420000,\n", + "# \"orig_sentence\": \"I poured water from the bottle into the cup until [it] was empty.\",\n", + "# \"entities\": [\"bottle\", \"cup\"],\n", + "# \"entity_substitutes\": [[\"bow\", \"bucket\"], [\"tube\", \"container\"]],\n", + "# \"determiner\": \"the\",\n", + "# \"packed_relations\": [\"is filled with the water from/isn't filled with the water from\", \"leakes the water into/doesn't leakes the water into\"],\n", + "# \"packed_relation_substitutes\": [[\"is full of the water from/isn't full of the water from\"], [\"drains the water into/doesn't drain the water into\"]],\n", + "# \"packed_predicates\": [\"was lower/wasn't lower\", \"was higher/wasn't higher\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# }, \n", + " {\n", + " \"index\": 42,\n", + " \"orig_sentence\": \"I poured water from the bottle into the cup until [it] was empty.\",\n", + " \"entities\": [\"bottle\", \"cup\"],\n", + " \"entity_substitutes\": [[\"bowl\", \"bucket\"], [\"tube\", \"container\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"was filled with water from/leaked into\", \"leaked into/was filled with water from\"],\n", + " \"packed_relation_substitutes\": [[\"was suffused with water from/dripped water into\"], [\"dripped water into/was suffused with water from\"]],\n", + " \"packed_predicates\": [\"was empty\", \"was full\"],\n", + " \"conjunctions\": [[\"after\", ], [\"before\", ]]\n", + " },\n", + "# {\n", + "# \"index\": 46000,\n", + "# \"orig_sentence\": \"Sid explained his theory to Mark but [he] couldn't convince him.\",\n", + "# \"entities\": [\"Susan\", \"Mark\"],\n", + "# \"entity_substitutes\": 
[[\"Amy\", \"Linda\"], [\"David\", \"Michael\"]],\n", + "# \"packed_relations\": [\"is explaining the theory to/doesn't explain the theory\", \"is listening to the explanation of/doesn't listen to the explanation of\"],\n", + "# \"packed_relation_substitutes\": [[\"is illustrating the theory to/doesn't illustrate the theory\"], [\"is paying attention to the explanation of/doesn't pay attention to the explanation of\"]],\n", + "# \"packed_predicates\": [\"has already proved it/doesn't prove it\", \"doesn't understand it/understands it\"],\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 48,\n", + "# \"orig_sentence\": \"Susan knew that Ann's son had been in a car accident, so [she] told her about it.\",\n", + "# \"entities\": [\"Sandy\", \"Mark\"],\n", + "# \"entity_substitutes\": [[\"Mandy\", \"Betty\"], [\"Bob\", \"Charles\"]],\n", + "# \"packed_relations\": [\"found/didn't find\", \"was found by/wasn't found by\"],\n", + "# \"packed_relation_substitutes\": [[\"caught/didn't catch\"], [\"was caught by/wasn't caught by\"]],\n", + "# \"relation_suffix\": \"cheating on the test\",\n", + "# \"packed_predicates\": [\"felt unfair/didn't feel unfair\", \"felt nervous/didn't feel nervous\"],\n", + "# },\n", + " {\n", + " \"index\": 50,\n", + " \"orig_sentence\": \"Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.\",\n", + " \"entities\": [\"Joe\", \"Amy\"],\n", + " \"entity_substitutes\": [[\"David\", \"Charles\"], [\"Betty\", \"Cindy\"]],\n", + " \"packed_relations\": [\"can beat/can't beat\", \"often loses to/seldom loses to\"],\n", + " \"packed_relation_substitutes\": [[\"can defeat/can't defeat\"], [\"is often defeated by/is seldom defeated by\"]],\n", + " \"relation_suffix\": \"at tennis\",\n", + " \"packed_predicates\": [\"is older/isn't older\", \"is younger/isn't younger\"],\n", + " },\n", + "# {\n", + "# \"index\": 64000,\n", + "# \"orig_sentence\": \"In the middle of the outdoor concert, the rain started 
falling, but [it] continued until 10.\",\n", + "# \"entities\": [\"concert\", \"rain\"],\n", + "# \"entity_substitutes\": [[\"event\", \"race\"], [\"storm\", \"shower\"]],\n", + "# \"determiner\": \"the\",\n", + "# \"packed_relations\": [\"was interrupted by/wasn't interrupted by\", \"interrupted/didn't interrupt\"],\n", + "# \"packed_relation_substitutes\": [[\"was affected by/wasn't affected by\"], [\"affected/didn't affected\"]],\n", + "# \"packed_predicates\": [\"ended early/ended on time\", \"was heavy/stopped soon\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# },\n", + "# {\n", + "# \"index\": 680,\n", + "# \"orig_sentence\": \"Ann asked Mary what time the library closes, because [she] had forgotten.\",\n", + "# \"entities\": [\"Ann\", \"Henry\"],\n", + "# \"entity_substitutes\": [[\"Mary\", \"Linda\"], [\"Brian\", \"Michael\"]],\n", + "# \"packed_relations\": [\"asked/didn't ask\", \"was asked by/wasn't asked by\"],\n", + "# \"packed_relation_substitutes\": [[\"querid/didn't query\"], [\"was querid by/wasn't querid by\"]],\n", + "# \"relation_suffix\": \"what time the library closes\",\n", + "# \"packed_predicates\": [\"forgot/didn't forget\", \"remembered/didn't remember\"],\n", + "# \"reverse_causal\": True\n", + "# },\n", + " {\n", + " \"index\": 68,\n", + " \"orig_sentence\": \"Ann asked Mary what time the library closes, because [she] had forgotten.\",\n", + " \"entities\": [\"Ann\", \"Henry\"],\n", + " \"entity_substitutes\": [[\"Mary\", \"Linda\"], [\"Brian\", \"Michael\"]],\n", + " \"packed_relations\": [\"asked/didn't ask\", \"told/didn't tell\"],\n", + " \"packed_relation_substitutes\": [[\"was told by/wasn't told by\"], [\"was asked by/wasn't asked by\"]],\n", + " \"relation_suffix\": \"what time the library closes\",\n", + " \"packed_predicates\": [\"had forgotten/hadn't forgotten\", \"remembered/didn't remember\"],\n", + " },\n", + "# {\n", + "# \"index\": 840,\n", + "# \"orig_sentence\": \"If the con artist has succeeded in fooling Sam, 
[he] would have gotten a lot of money.\",\n", + "# \"entities\": [\"Sam\", \"Emma\"],\n", + "# \"entity_substitutes\": [[\"Paul\", \"Bush\"], [\"Susan\", \"Lucy\"]],\n", + "# \"packed_relations\": [\"succeeded in fooling/failed to fool\", \"was fooled by/wasn't fooled by\"],\n", + "# \"packed_relation_substitutes\": [[\"succeeded in cheating/failed to cheat\"], [\"was cheated by/wasn't cheated by\"]],\n", + "# \"packed_predicates\": [\"got the prize/didn't get the prize\", \"lost the prize/didn't lose the prize\"],\n", + "# \"predicate_dichotomy\": True,\n", + "# \"reverse_causal\": True\n", + "# }, \n", + " {\n", + " \"index\": 84,\n", + " \"orig_sentence\": \"If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.\",\n", + " \"entities\": [\"Sam\", \"Emma\"],\n", + " \"entity_substitutes\": [[\"Paul\", \"Bush\"], [\"Susan\", \"Lucy\"]],\n", + " \"packed_relations\": [\"succeeded in fooling/failed to fool\", \"was fooled by/wasn't fooled by\"],\n", + " \"packed_relation_substitutes\": [[\"succeeded in cheating/failed to cheat\"], [\"was cheated by/wasn't cheated by\"]],\n", + " \"packed_predicates\": [\"got a lot of money/didn't get a lot of money\", \"lost a lot of money/didn't lose a lot of money\"],\n", + " \"predicate_dichotomy\": False,\n", + " \"reverse_causal\": True\n", + " }, \n", + "# {\n", + "# \"index\": 92000,\n", + "# \"orig_sentence\": \"Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.\",\n", + "# \"entities\": [\"Alice\", \"Emma\"],\n", + "# \"entity_substitutes\": [[\"Paul\", \"Bush\"], [\"Susan\", \"Lucy\"]],\n", + "# \"packed_relations\": [\"didn't allow her daughter/allowed her daughter\", \"wasn't allowed by her father/was allowed by her father\"],\n", + "# \"packed_relation_substitutes\": [[\"didn't approve her daughter/approved her daughter\"], [\"wasn't approved by/was approved by\"]],\n", + "# \"relation_suffix\": \"to go to the 
party\",\n", + "# \"packed_predicates\": [\"was severe/wasn't severe\", \"was naughty/behaved well\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# }, \n", + "# {\n", + "# \"index\": 98,\n", + "# \"orig_sentence\": \"I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.\",\n", + "# \"entities\": [\"hole\", \"gum\"],\n", + "# \"entity_substitutes\": [[\"can\", \"box\"], [\"clay\", \"soil\"]],\n", + "# \"determiner\": \"the\",\n", + "# \"packed_relations\": [\"was filled with/wasn't filled with\", \"clogged/didn't clog\"],\n", + "# \"packed_relation_substitutes\": [[\"was full of/wasn't full of\"], [\"stucked/didn't stuck\"]],\n", + "# \"packed_predicates\": [\"couldn't be gotten in/could be gotten in\", \"should be cleaned up/should't be cleaned up\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 100,\n", + "# \"orig_sentence\": \"The dog chased the cat, which ran up a tree. [It] waited at the bottom.\",\n", + "# \"entities\": [\"tigher\", \"cat\"],\n", + "# \"entity_substitutes\": [[\"fox\", \"weasel\"], [\"hen\", \"rooster\"]],\n", + "# \"determiner\": \"the\",\n", + "# \"packed_relations\": [\"chased/didn't chase\", \"was chased by/wasn't chased by\"],\n", + "# \"packed_relation_substitutes\": [[\"hunted for/didn't hunt for\"], [\"was hunted by/wasn't hunted by\"]],\n", + "# \"relation_suffix\": \"until the tree\",\n", + "# \"packed_predicates\": [\"waited at the bottom of it/didn't waited at the bottom of it\", \"stayed at the top of it/didn't stay at the top of it\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 106,\n", + "# \"orig_sentence\": \"John was doing research in the library when he heard a man humming and whistling. 
[He] was very annoyed.\",\n", + "# \"entities\": [\"Bob\", \"Tiffany\"],\n", + "# \"entity_substitutes\": [[\"Jack\", \"Ted\"], [\"Mary\", \"Lucy\"]],\n", + "# \"packed_relations\": [\"heard/didn't hear\", \"was heard by/didn't heard by\"],\n", + "# \"packed_relation_substitutes\": [[\"noticed/didn't notice\"], [\"was noticed by/wasn't noticed by\"]],\n", + "# \"relation_suffix\": \"whistle in the library\",\n", + "# \"packed_predicates\": [\"was annoyed/wasn't annoyed\", \"was annoying/wasn't annoying\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 108000,\n", + "# \"orig_sentence\": \"John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.\",\n", + "# \"entities\": [\"John\", \"Amy\"],\n", + "# \"entity_substitutes\": [[\"Alice\", \"Bush\"], [\"Nancy\", \"Cindy\"]],\n", + "# \"packed_relations\": [\"accompanied with/didn't accompany with\", \"was accompanied by/wasn't accompanied by\"],\n", + "# \"packed_relation_substitutes\": [[\"stayed with/didn't stay with\"], [\"was't left alone by/was left alone by\"]],\n", + "# \"packed_predicates\": [\"is nice/isn't nice\", \"didn't feel lonely/felt lonely\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 132000,\n", + "# \"orig_sentence\": \"Jane knocked on the door, and Susan answered it. 
[She] invited her to come out.\",\n", + "# \"entities\": [\"Jane\", \"Wendy\"],\n", + "# \"entity_substitutes\": [[\"Bob\", \"Tony\"], [\"Lily\", \"Lucy\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"knocked on the door and heard the answer from/didn't knock on the door and didn't hear the answer from\", \"answered the knocked from/didn't answer the knocked from\"],\n", + "# \"packed_relation_substitutes\": [[\"rang at the door and heard the answer from/didn't ring at the door and didn't hear the answer from\"], [\"answered the rang from/didn't answer the rang from\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"went in/didn't go in\", \"unlocked the door/didn't unlock the door\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 150,\n", + "# \"orig_sentence\": \"Jackson was greatly influenced by Arnold, though [he] lived two centuries later.\",\n", + "# \"entities\": [\"Jack\", \"Betty\"],\n", + "# \"entity_substitutes\": [[\"Tom\", \"Jay\"], [\"Emily\", \"Helen\"]],\n", + "# \"packed_relations\": [\"was influenced by/wasn't influenced by\", \"influenced/didn't influence\"],\n", + "# \"packed_relation_substitutes\": [[\"was inspired by/didn't inspired by\"], [\"inspired/didn't inspire\"]],\n", + "# \"packed_predicates\": [\"lived two centuries later/didn't live two centuries later\", \"lived two centuries early/didn't live two centuries early\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# },\n", + " {\n", + " \"index\": 15000,\n", + " \"orig_sentence\": \"Jackson was greatly influenced by Arnold, though [he] lived two centuries later.\",\n", + " \"entities\": [\"Jack\", \"Betty\"],\n", + " \"entity_substitutes\": [[\"Tom\", \"Jay\"], [\"Emily\", \"Helen\"]],\n", + " \"packed_relations\": [\"always takes care of/dosen't take care of\", \"is always taken care of by/isn't taken care of by\"],\n", + " \"packed_relation_substitutes\": 
[[\"always looks after/dosen't look after\"], [\"always needs the help of/didn't need the help of\"]],\n", + " \"packed_predicates\": [\"is older/isn't older\", \"is younger/isn't younger\"],\n", + " },\n", + " {\n", + " \"index\": 160,\n", + " \"orig_sentence\": \"The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.\",\n", + " \"entities\": [\"Betty\", \"Adele\"],\n", + " \"entity_substitutes\": [[\"Amy\", \"Cindy\"], [\"Alberta\", \"Caroline\"]],\n", + " \"packed_relations\": [\"replaced/didn't replace\", \"was changed to/wasn't changed to\"],\n", + " \"packed_relation_substitutes\": [[\"was substituted for/wasn't substituted for\"], [\"was replaced by/wasn't replaced by\"]],\n", + " \"relation_suffix\": \"as the actress's new name\",\n", + " \"packed_predicates\": [\"is easy to pronounce/isn't easy to pronounce\", \"is hard to pronounce/isn't hard to pronounce\"],\n", + " },\n", + "# {\n", + "# \"index\": 1660000,\n", + "# \"orig_sentence\": \"Fred is the only man still alive who remembers my great-grandfather. [He] is a remarkable man.\",\n", + "# \"entities\": [\"Tom\", \"grandmother\"],\n", + "# \"entity_substitutes\": [[\"Tim\", \"Mark\"], [\"grandma\", \"mother\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"still remembers/doesn't remember\", \"is remembered by/isn't remembered by\"],\n", + "# \"packed_relation_substitutes\": [[\"still recollect/doesn't recollect\"], [\"is recollected by/isn't recollected by\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"has good memory/doesn't have good memory\", \"was remarkable/wasn't remarkable\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + " {\n", + " \"index\": 1700000,\n", + " \"orig_sentence\": \"In July, Kamtchatka declared war on Yakutsk. 
Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.\",\n", + " \"entities\": [\"Germany\", \"Italy\"],\n", + " \"entity_substitutes\": [[\"Australia\", \"Japan\"], [\"Argentina\", \"Canada\"]],\n", + " \"packed_relations\": [\"defeated/didn't defeat\", \"was defeated by/wasn't defeated by\"],\n", + " \"packed_relation_substitutes\": [[\"conquered/didn't conquer\"], [\"was conquered by/wasn't conquered by\"]],\n", + " \"packed_predicates\": [\"was more powerful/wasn't more powerful\", \"was less powerful/wasn't less powerful\"],\n", + " },\n", + " {\n", + " \"index\": 186,\n", + " \"orig_sentence\": \"When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority\",\n", + " \"entities\": [\"sponsors\", \"opponents\"],\n", + " \"entity_substitutes\": [[\"workers\", \"customers\"], [\"teachers\", \"students\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"were less in number than/were not less in number than\", \"were more in number than/were not more in number than\"],\n", + " \"packed_relation_substitutes\": [[\"were outnumbered by/were not outnumbered by\"], [\"outnumbered/didn't outnumber\"]],\n", + " \"packed_predicates\": [\"were in the minority/were not in the minority\", \"were in the majority/were not in the majority\"],\n", + " \"reverse_causal\": True\n", + " },\n", + " {\n", + " \"index\": 188,\n", + " \"orig_sentence\": \"Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. 
Next time, we should make more of [them] .\",\n", + " \"entities\": [\"cookies\", \"chips\"],\n", + " \"entity_substitutes\": [[\"apples\", \"bananas\"], [\"grapes\", \"sandwiches\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"are more popular than/are less popular than\", \"lose to/don't lose to\"],\n", + " \"packed_relation_substitutes\": [[\"are sold more than/are sold less than\"], [\"are not as popular as/are as popular as\"]],\n", + " \"packed_predicates\": [\"should be made more next time/shouldn't be made more next time\", \"should be made less next time/shouldn't be made less next time\"],\n", + " \"reverse_causal\": True\n", + " },\n", + "# {\n", + "# \"index\": 1900,\n", + "# \"orig_sentence\": \"We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .\",\n", + "# \"entities\": [\"newspapers\", \"chairs\"],\n", + "# \"entity_substitutes\": [[\"cups\", \"pictures\"], [\"tables\", \"benches\"]],\n", + "# \"determiner\": \"the\",\n", + "# \"packed_relations\": [\"could be placed on all/couldn't be placed on all\", \"could hold all/couldn't hold all\"],\n", + "# \"packed_relation_substitutes\": [[\"could be put on all/couldn't be put on all\"], [\"could carry all/couldn't carry all\"]],\n", + "# \"relation_suffix\": \"in the auditorium\",\n", + "# \"packed_predicates\": [\"isn't too many/is too many\", \"is enough/isn't enough\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + " {\n", + " \"index\": 190,\n", + " \"orig_sentence\": \"We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .\",\n", + " \"entities\": [\"newspapers\", \"chairs\"],\n", + " \"entity_substitutes\": [[\"cups\", \"pictures\"], [\"tables\", \"benches\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"could be placed on all/couldn't be placed on all\", 
\"could all be covered by/couldn't all be covered by\"],\n", + " \"packed_relation_substitutes\": [[\"could be put on all/couldn't be put on all\"], [\"could carry all/couldn't carry all\"]],\n", + " \"packed_predicates\": [\"there were many of/there were not many of\", \"there were few of/there were not few of\"],\n", + " \"prepositive_pred\": True,\n", + " },\n", + " {\n", + " \"index\": 19600,\n", + " \"orig_sentence\": \"Steve follows Fred's example in everything. [He] admires him hugely.\",\n", + " \"entities\": [\"Steve\", \"Lucy\"],\n", + " \"entity_substitutes\": [[\"Fred\", \"George\"], [\"Lily\", \"Wendy\"]],\n", + " \"packed_relations\": [\"follows/doesn't follow\", \"is followed by/isn't followed by\"],\n", + " \"packed_relation_substitutes\": [[\"imitates/doesn't imitate\"], [\"is imitated by/isn't imitated by\"]],\n", + " \"relation_suffix\": \"in everything\",\n", + " \"packed_predicates\": [\"is bad at making decisions/isn't bad at making decisions\", \"is good at making decisions/isn't good at making decisions\"],\n", + " },\n", + " {\n", + " \"index\": 198,\n", + " \"orig_sentence\": \"The table won't fit through the doorway because [it] is too wide.\",\n", + " \"entities\": [\"table\", \"doorway\"],\n", + " \"entity_substitutes\": [[\"desk\", \"sofa\"], [\"corridor\", \"hallway\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"will fit through/won't fit through\", \"will be fitted through by/won't be fitted through by\"],\n", + " \"packed_relation_substitutes\": [[\"will pass through/won't pass through\"], [\"will be passed through by/won't be passed through by\"]],\n", + " \"packed_predicates\": [\"is narrow/isn't narrow\", \"is wide/isn't wide\"],\n", + " },\n", + "# {\n", + "# \"index\": 2000,\n", + "# \"orig_sentence\": \"Grace was happy to trade me her sweater for my jacket. 
She thinks [it] looks dowdy on her.\",\n", + "# \"entities\": [\"Mandy\", \"Tim\"],\n", + "# \"entity_substitutes\": [[\"Betty\", \"Nancy\"], [\"Bob\", \"John\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"traded the sweater with/didn't trade the sweater with\", \"traded the T-shirt for the sweater with/didn't trade the T-shirt for the sweater with\"],\n", + "# \"packed_relation_substitutes\": [[\"exchanged the sweater with/didn't exchange the sweater with\"], [\"exchanged the T-shirt for the sweater with/didn't exchange the T-shirt for the sweater with\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"thinks it looks bad/thinks it doesn't look bad\", \"thinks it looks great/thinks it doesn't look great\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# },\n", + " {\n", + " \"index\": 2000000,\n", + " \"orig_sentence\": \"Grace was happy to trade me her sweater for my jacket. She thinks [it] looks dowdy on her.\",\n", + " \"entities\": [\"sweater\", \"jacket\"],\n", + " \"entity_substitutes\": [[\"skirt\", \"cap\"], [\"hat\", \"short\"]],\n", + " \"determiner\": \"the\",\n", + " \"packed_relations\": [\"is traded by Grace for/isn't traded by Grace for\", \"is substituted by Grace for/isn't substituted by Grace for\"],\n", + " \"packed_relation_substitutes\": [[\"is replaced by Grace with/isn't replaced by Grace with\"], [\"is preferred by Grace to/isn't preferred by Grace to\"]],\n", + " \"packed_predicates\": [\"looks bad/looks not bad\", \"looks good/looks not good\"],\n", + " \"predicate_prefix\": \"she thinks\",\n", + " },\n", + "# {\n", + "# \"index\": 2020000,\n", + "# \"orig_sentence\": \"John hired Bill to take care of [him] .\",\n", + "# \"entities\": [\"Bill\", \"Mandy\"],\n", + "# \"entity_substitutes\": [[\"Ted\", \"Jackson\"], [\"Lily\", \"Peggy\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"hired/didn't hire\", \"was hired by/wasn't hired 
by\"],\n", + "# \"packed_relation_substitutes\": [[\"asked/didn't ask\"], [\"was asked by/wasn't asked by\"]],\n", + "# \"relation_suffix\": \"to take care of him\",\n", + "# \"packed_predicates\": [\"is sick/isn't sick\", \"is nice/isn't nice\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# },\n", + "# {\n", + "# \"index\": 204,\n", + "# \"orig_sentence\": \"John promised Bill to leave, so an hour later [he] left.\",\n", + "# \"entities\": [\"Bill\", \"Mandy\"],\n", + "# \"entity_substitutes\": [[\"Ted\", \"Jackson\"], [\"Lily\", \"Peggy\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"promised/didn't promise\", \"was promised by/wasn't promised by\"],\n", + "# \"packed_relation_substitutes\": [[\"guaranteed/didn't guarantee\"], [\"was guaranteed by/wasn't guaranteed by\"]],\n", + "# \"relation_suffix\": \"to leave\",\n", + "# \"packed_predicates\": [\"left/didn't leave\", \"was left alone/wasn't left alone\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 2100000,\n", + "# \"orig_sentence\": \"Jane knocked on Susan's door but [she] did not get an answer.\",\n", + "# \"entities\": [\"Jane\", \"Frank\"],\n", + "# \"entity_substitutes\": [[\"Susan\", \"Sandy\"], [\"Tony\", \"Paul\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"called/didn't call\", \"was called by/wasn't called by\"],\n", + "# \"packed_relation_substitutes\": [[\"contected/didn't contect\"], [\"was contected by/wasn't contected by\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"got an answer/didn't get an answer\", \"answered the phone/didn't answer the phone\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + "# {\n", + "# \"index\": 212,\n", + "# \"orig_sentence\": \"Joe paid the detective after [he] received the final report on the case\",\n", + "# \"entities\": 
[\"Betty\", \"Peter\"],\n", + "# \"entity_substitutes\": [[\"Tina\", \"Donna\"], [\"Bush\", \"Billy\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"paid/didn't pay\", \"was paied by/wasn't paied by\"],\n", + "# \"packed_relation_substitutes\": [[\"gave money to/didn't give money to\"], [\"received money from/didn't receive money from\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"received the final report on the case/didn't receive the final report on the caser\", \"delivered the final report on the case/didn't delivered the final report on the caser\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# },\n", + " {\n", + " \"index\": 226,\n", + " \"orig_sentence\": \"Bill passed the half-empty plate to John because [he] was full.\",\n", + " \"entities\": [\"Bill\", \"Amy\"],\n", + " \"entity_substitutes\": [[\"Brian\", \"David\"], [\"Emma\", \"Helen\"]],\n", + " \"packed_relations\": [\"passed the half-empty plate to/didn't pass the half-empty plate to\", \"received the half-empty plate from/didn't received the half-empty plate from\"],\n", + " \"packed_relation_substitutes\": [[\"gave the half-empty plate to/didn't give the half-empty plate to\"], [\"took the half-empty plate from/didn't take the half-empty plate from\"]],\n", + " \"packed_predicates\": [\"was full/wasn't full\", \"was hungry/wasn't hungry\"],\n", + " },\n", + " {\n", + " \"index\": 252,\n", + " \"orig_sentence\": \"George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.\",\n", + " \"entities\": [\"George\", \"Linda\"],\n", + " \"entity_substitutes\": [[\"Eric\", \"Ted\"], [\"Cindy\", \"Lucy\"]],\n", + " \"packed_relations\": [\"gave the tickets of the play to/didn't give the tickets of the play to\", \"received the tickets of the play from/didn't receive the tickets of the play from\"],\n", + " \"packed_relation_substitutes\": [[\"sent the tickets of the 
play to/didn't send the tickets of the play to\"], [\"took the tickets of the play from/didn't take the tickets of the play from\"]],\n", + " \"packed_predicates\": [\"wasn't interested in it/was interested in it\", \"was eager to see it/wasn't eager to see it\"],\n", + " },\n", + "# {\n", + "# \"index\": 255,\n", + "# \"orig_sentence\": \"Jane gave Joan candy because [she] wasn't hungry.\",\n", + "# \"entities\": [\"Helen\", \"Ted\"],\n", + "# \"entity_substitutes\": [[\"Wendy\", \"Lucy\"], [\"Charles\", \"Billy\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"gave candy to/didn't give candy to\", \"received candy from/didn't receive candy from\"],\n", + "# \"packed_relation_substitutes\": [[\"delivered candy to/didn't deliver candy to\"], [\"accepted candy from/didn't accept candy from\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"was full/wasn't full\", \"was hungry/wasn't hungry\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# },\n", + "# {\n", + "# \"index\": 259,\n", + "# \"orig_sentence\": \"James asked Robert for a favor but [he] was refused.\",\n", + "# \"entities\": [\"James\", \"Amy\"],\n", + "# \"entity_substitutes\": [[\"Robert\", \"Jack\"], [\"Donna\", \"Emily\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"asked/didn't asked\", \"was asked by/wasn't asked by\"],\n", + "# \"packed_relation_substitutes\": [[\"queried/didn't query\"], [\"was queried by/wasn't queried by\"]],\n", + "# \"relation_suffix\": \"for a favor\",\n", + "# \"packed_predicates\": [\"was refused/wasn't refused\", \"refused/didn't refuse\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": True\n", + "# },\n", + " {\n", + " \"index\": 261,\n", + " \"orig_sentence\": \"Kirilov ceded the presidency to Shatov because [he] was less popular.\",\n", + " \"entities\": [\"James\", \"Amy\"],\n", + " \"entity_substitutes\": [[\"Robert\", \"Jack\"], [\"Donna\", 
\"Emily\"]],\n", + " \"packed_relations\": [\"ceded the presidency to/didn't cede the presidency to\", \"took over the presidency from/didn't take over the presidency from\"],\n", + " \"packed_relation_substitutes\": [[\"gave the presidency to/didn't give the presidency to\"], [\"got the presidency from/didn't get the presidency from\"]],\n", + " \"packed_predicates\": [\"was notorious/was not notorious\", \"was popular/wasn't popular\"],\n", + " },\n", + "# {\n", + "# \"index\": 2630000,\n", + "# \"orig_sentence\": \"Emma did not pass the ball to Janie although [she] saw that she was open.\",\n", + "# \"entities\": [\"Emma\", \"Alberta\"],\n", + "# \"entity_substitutes\": [[\"Lily\", \"Nancy\"], [\"George\", \"Henry\"]],\n", + "# \"determiner\": \"\",\n", + "# \"packed_relations\": [\"passed the ball to/didn't pass the ball to\", \"was passed the ball by/wasn't passed the ball by\"],\n", + "# \"packed_relation_substitutes\": [[\"gave the ball to/didn't give the ball to\"], [\"was given the ball by/wasn't given the ball by\"]],\n", + "# \"relation_suffix\": \"\",\n", + "# \"packed_predicates\": [\"had enough strength/didn't have enough strength\", \"was open/wasn't open\"],\n", + "# \"predicate_dichotomy\": False,\n", + "# \"reverse_causal\": False\n", + "# },\n", + "]\n", + "len(frames)" + ] + }, + { + "cell_type": "code", + "execution_count": 406, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "causal_sentences, turning_sentences, substituted_sent_groups = \\\n", + " make_sentences(A_template, B_template, causal_templates, turning_templates, **get_frame(frames, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": 407, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[\"the trophy doesn't fit into the suitcase because the [trophy] is large.\",\n", + " \"the trophy doesn't fit into the suitcase because the [suitcase] is small.\",\n", + " \"the trophy doesn't fit into the suitcase because 
the [trophy] isn't small.\",\n", + " \"the trophy doesn't fit into the suitcase because the [suitcase] isn't large.\",\n", + " \"the suitcase doesn't hold the trophy because the [trophy] is large.\",\n", + " \"the suitcase doesn't hold the trophy because the [suitcase] is small.\",\n", + " \"the suitcase doesn't hold the trophy because the [trophy] isn't small.\",\n", + " \"the suitcase doesn't hold the trophy because the [suitcase] isn't large.\",\n", + " \"the trophy can fit into the suitcase because the [trophy] isn't large.\",\n", + " \"the trophy can fit into the suitcase because the [suitcase] isn't small.\",\n", + " 'the trophy can fit into the suitcase because the [trophy] is small.',\n", + " 'the trophy can fit into the suitcase because the [suitcase] is large.',\n", + " \"the suitcase can hold the trophy because the [trophy] isn't large.\",\n", + " \"the suitcase can hold the trophy because the [suitcase] isn't small.\",\n", + " 'the suitcase can hold the trophy because the [trophy] is small.',\n", + " 'the suitcase can hold the trophy because the [suitcase] is large.']" + ] + }, + "execution_count": 407, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[\"the trophy doesn't fit into the suitcase although the [trophy] isn't large.\",\n", + " \"the trophy doesn't fit into the suitcase although the [suitcase] isn't small.\",\n", + " \"the trophy doesn't fit into the suitcase although the [trophy] is small.\",\n", + " \"the trophy doesn't fit into the suitcase although the [suitcase] is large.\",\n", + " \"the suitcase doesn't hold the trophy although the [trophy] isn't large.\",\n", + " \"the suitcase doesn't hold the trophy although the [suitcase] isn't small.\",\n", + " \"the suitcase doesn't hold the trophy although the [trophy] is small.\",\n", + " \"the suitcase doesn't hold the trophy although the [suitcase] is large.\",\n", + " 'the trophy can fit into the suitcase although the [trophy] is large.',\n", + " 
'the trophy can fit into the suitcase although the [suitcase] is small.',\n", + " \"the trophy can fit into the suitcase although the [trophy] isn't small.\",\n", + " \"the trophy can fit into the suitcase although the [suitcase] isn't large.\",\n", + " 'the suitcase can hold the trophy although the [trophy] is large.',\n", + " 'the suitcase can hold the trophy although the [suitcase] is small.',\n", + " \"the suitcase can hold the trophy although the [trophy] isn't small.\",\n", + " \"the suitcase can hold the trophy although the [suitcase] isn't large.\"]" + ] + }, + "execution_count": 407, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 407, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": [ + "[[\"the ball doesn't fit into the bag because the [ball] is large.\",\n", + " \"the ball doesn't fit into the box because the [ball] is large.\",\n", + " \"the toy doesn't fit into the bag because the [toy] is large.\",\n", + " \"the toy doesn't fit into the box because the [toy] is large.\",\n", + " \"the ball can't be put into the bag because the [ball] is large.\",\n", + " \"the ball can't be put into the box because the [ball] is large.\",\n", + " \"the toy can't be put into the bag because the [toy] is large.\",\n", + " \"the toy can't be put into the box because the [toy] is large.\"],\n", + " [\"the ball doesn't fit into the bag because the [bag] is small.\",\n", + " \"the ball doesn't fit into the box because the [box] is small.\",\n", + " \"the toy doesn't fit into the bag because the [bag] is small.\",\n", + " \"the toy doesn't fit into the box because the [box] is small.\",\n", + " \"the ball can't be put into the bag because the [bag] is small.\",\n", + " \"the ball can't be put into the box because the [box] is small.\",\n", + " \"the toy can't be put into the bag because the [bag] is small.\",\n", + " \"the toy can't be put into the box 
because the [box] is small.\"],\n", + " [\"the ball doesn't fit into the bag because the [ball] isn't small.\",\n", + " \"the ball doesn't fit into the box because the [ball] isn't small.\",\n", + " \"the toy doesn't fit into the bag because the [toy] isn't small.\",\n", + " \"the toy doesn't fit into the box because the [toy] isn't small.\",\n", + " \"the ball can't be put into the bag because the [ball] isn't small.\",\n", + " \"the ball can't be put into the box because the [ball] isn't small.\",\n", + " \"the toy can't be put into the bag because the [toy] isn't small.\",\n", + " \"the toy can't be put into the box because the [toy] isn't small.\"],\n", + " [\"the ball doesn't fit into the bag because the [bag] isn't large.\",\n", + " \"the ball doesn't fit into the box because the [box] isn't large.\",\n", + " \"the toy doesn't fit into the bag because the [bag] isn't large.\",\n", + " \"the toy doesn't fit into the box because the [box] isn't large.\",\n", + " \"the ball can't be put into the bag because the [bag] isn't large.\",\n", + " \"the ball can't be put into the box because the [box] isn't large.\",\n", + " \"the toy can't be put into the bag because the [bag] isn't large.\",\n", + " \"the toy can't be put into the box because the [box] isn't large.\"],\n", + " [\"the bag doesn't hold the ball because the [ball] is large.\",\n", + " \"the box doesn't hold the ball because the [ball] is large.\",\n", + " \"the bag doesn't hold the toy because the [toy] is large.\",\n", + " \"the box doesn't hold the toy because the [toy] is large.\",\n", + " \"the bag doesn't have enough room for the ball because the [ball] is large.\",\n", + " \"the box doesn't have enough room for the ball because the [ball] is large.\",\n", + " \"the bag doesn't have enough room for the toy because the [toy] is large.\",\n", + " \"the box doesn't have enough room for the toy because the [toy] is large.\"],\n", + " [\"the bag doesn't hold the ball because the [bag] is small.\",\n", + 
" \"the box doesn't hold the ball because the [box] is small.\",\n", + " \"the bag doesn't hold the toy because the [bag] is small.\",\n", + " \"the box doesn't hold the toy because the [box] is small.\",\n", + " \"the bag doesn't have enough room for the ball because the [bag] is small.\",\n", + " \"the box doesn't have enough room for the ball because the [box] is small.\",\n", + " \"the bag doesn't have enough room for the toy because the [bag] is small.\",\n", + " \"the box doesn't have enough room for the toy because the [box] is small.\"],\n", + " [\"the bag doesn't hold the ball because the [ball] isn't small.\",\n", + " \"the box doesn't hold the ball because the [ball] isn't small.\",\n", + " \"the bag doesn't hold the toy because the [toy] isn't small.\",\n", + " \"the box doesn't hold the toy because the [toy] isn't small.\",\n", + " \"the bag doesn't have enough room for the ball because the [ball] isn't small.\",\n", + " \"the box doesn't have enough room for the ball because the [ball] isn't small.\",\n", + " \"the bag doesn't have enough room for the toy because the [toy] isn't small.\",\n", + " \"the box doesn't have enough room for the toy because the [toy] isn't small.\"],\n", + " [\"the bag doesn't hold the ball because the [bag] isn't large.\",\n", + " \"the box doesn't hold the ball because the [box] isn't large.\",\n", + " \"the bag doesn't hold the toy because the [bag] isn't large.\",\n", + " \"the box doesn't hold the toy because the [box] isn't large.\",\n", + " \"the bag doesn't have enough room for the ball because the [bag] isn't large.\",\n", + " \"the box doesn't have enough room for the ball because the [box] isn't large.\",\n", + " \"the bag doesn't have enough room for the toy because the [bag] isn't large.\",\n", + " \"the box doesn't have enough room for the toy because the [box] isn't large.\"],\n", + " [\"the ball can fit into the bag because the [ball] isn't large.\",\n", + " \"the ball can fit into the box because the [ball] 
isn't large.\",\n", + " \"the toy can fit into the bag because the [toy] isn't large.\",\n", + " \"the toy can fit into the box because the [toy] isn't large.\",\n", + " \"the ball can be put into the bag because the [ball] isn't large.\",\n", + " \"the ball can be put into the box because the [ball] isn't large.\",\n", + " \"the toy can be put into the bag because the [toy] isn't large.\",\n", + " \"the toy can be put into the box because the [toy] isn't large.\"],\n", + " [\"the ball can fit into the bag because the [bag] isn't small.\",\n", + " \"the ball can fit into the box because the [box] isn't small.\",\n", + " \"the toy can fit into the bag because the [bag] isn't small.\",\n", + " \"the toy can fit into the box because the [box] isn't small.\",\n", + " \"the ball can be put into the bag because the [bag] isn't small.\",\n", + " \"the ball can be put into the box because the [box] isn't small.\",\n", + " \"the toy can be put into the bag because the [bag] isn't small.\",\n", + " \"the toy can be put into the box because the [box] isn't small.\"],\n", + " ['the ball can fit into the bag because the [ball] is small.',\n", + " 'the ball can fit into the box because the [ball] is small.',\n", + " 'the toy can fit into the bag because the [toy] is small.',\n", + " 'the toy can fit into the box because the [toy] is small.',\n", + " 'the ball can be put into the bag because the [ball] is small.',\n", + " 'the ball can be put into the box because the [ball] is small.',\n", + " 'the toy can be put into the bag because the [toy] is small.',\n", + " 'the toy can be put into the box because the [toy] is small.'],\n", + " ['the ball can fit into the bag because the [bag] is large.',\n", + " 'the ball can fit into the box because the [box] is large.',\n", + " 'the toy can fit into the bag because the [bag] is large.',\n", + " 'the toy can fit into the box because the [box] is large.',\n", + " 'the ball can be put into the bag because the [bag] is large.',\n", + " 'the 
ball can be put into the box because the [box] is large.',\n", + " 'the toy can be put into the bag because the [bag] is large.',\n", + " 'the toy can be put into the box because the [box] is large.'],\n", + " [\"the bag can hold the ball because the [ball] isn't large.\",\n", + " \"the box can hold the ball because the [ball] isn't large.\",\n", + " \"the bag can hold the toy because the [toy] isn't large.\",\n", + " \"the box can hold the toy because the [toy] isn't large.\",\n", + " \"the bag has enough room for the ball because the [ball] isn't large.\",\n", + " \"the box has enough room for the ball because the [ball] isn't large.\",\n", + " \"the bag has enough room for the toy because the [toy] isn't large.\",\n", + " \"the box has enough room for the toy because the [toy] isn't large.\"],\n", + " [\"the bag can hold the ball because the [bag] isn't small.\",\n", + " \"the box can hold the ball because the [box] isn't small.\",\n", + " \"the bag can hold the toy because the [bag] isn't small.\",\n", + " \"the box can hold the toy because the [box] isn't small.\",\n", + " \"the bag has enough room for the ball because the [bag] isn't small.\",\n", + " \"the box has enough room for the ball because the [box] isn't small.\",\n", + " \"the bag has enough room for the toy because the [bag] isn't small.\",\n", + " \"the box has enough room for the toy because the [box] isn't small.\"],\n", + " ['the bag can hold the ball because the [ball] is small.',\n", + " 'the box can hold the ball because the [ball] is small.',\n", + " 'the bag can hold the toy because the [toy] is small.',\n", + " 'the box can hold the toy because the [toy] is small.',\n", + " 'the bag has enough room for the ball because the [ball] is small.',\n", + " 'the box has enough room for the ball because the [ball] is small.',\n", + " 'the bag has enough room for the toy because the [toy] is small.',\n", + " 'the box has enough room for the toy because the [toy] is small.'],\n", + " ['the bag can 
hold the ball because the [bag] is large.',\n", + " 'the box can hold the ball because the [box] is large.',\n", + " 'the bag can hold the toy because the [bag] is large.',\n", + " 'the box can hold the toy because the [box] is large.',\n", + " 'the bag has enough room for the ball because the [bag] is large.',\n", + " 'the box has enough room for the ball because the [box] is large.',\n", + " 'the bag has enough room for the toy because the [bag] is large.',\n", + " 'the box has enough room for the toy because the [box] is large.'],\n", + " [\"the ball doesn't fit into the bag although the [ball] isn't large.\",\n", + " \"the ball doesn't fit into the box although the [ball] isn't large.\",\n", + " \"the toy doesn't fit into the bag although the [toy] isn't large.\",\n", + " \"the toy doesn't fit into the box although the [toy] isn't large.\",\n", + " \"the ball can't be put into the bag although the [ball] isn't large.\",\n", + " \"the ball can't be put into the box although the [ball] isn't large.\",\n", + " \"the toy can't be put into the bag although the [toy] isn't large.\",\n", + " \"the toy can't be put into the box although the [toy] isn't large.\"],\n", + " [\"the ball doesn't fit into the bag although the [bag] isn't small.\",\n", + " \"the ball doesn't fit into the box although the [box] isn't small.\",\n", + " \"the toy doesn't fit into the bag although the [bag] isn't small.\",\n", + " \"the toy doesn't fit into the box although the [box] isn't small.\",\n", + " \"the ball can't be put into the bag although the [bag] isn't small.\",\n", + " \"the ball can't be put into the box although the [box] isn't small.\",\n", + " \"the toy can't be put into the bag although the [bag] isn't small.\",\n", + " \"the toy can't be put into the box although the [box] isn't small.\"],\n", + " [\"the ball doesn't fit into the bag although the [ball] is small.\",\n", + " \"the ball doesn't fit into the box although the [ball] is small.\",\n", + " \"the toy doesn't fit 
into the bag although the [toy] is small.\",\n", + " \"the toy doesn't fit into the box although the [toy] is small.\",\n", + " \"the ball can't be put into the bag although the [ball] is small.\",\n", + " \"the ball can't be put into the box although the [ball] is small.\",\n", + " \"the toy can't be put into the bag although the [toy] is small.\",\n", + " \"the toy can't be put into the box although the [toy] is small.\"],\n", + " [\"the ball doesn't fit into the bag although the [bag] is large.\",\n", + " \"the ball doesn't fit into the box although the [box] is large.\",\n", + " \"the toy doesn't fit into the bag although the [bag] is large.\",\n", + " \"the toy doesn't fit into the box although the [box] is large.\",\n", + " \"the ball can't be put into the bag although the [bag] is large.\",\n", + " \"the ball can't be put into the box although the [box] is large.\",\n", + " \"the toy can't be put into the bag although the [bag] is large.\",\n", + " \"the toy can't be put into the box although the [box] is large.\"],\n", + " [\"the bag doesn't hold the ball although the [ball] isn't large.\",\n", + " \"the box doesn't hold the ball although the [ball] isn't large.\",\n", + " \"the bag doesn't hold the toy although the [toy] isn't large.\",\n", + " \"the box doesn't hold the toy although the [toy] isn't large.\",\n", + " \"the bag doesn't have enough room for the ball although the [ball] isn't large.\",\n", + " \"the box doesn't have enough room for the ball although the [ball] isn't large.\",\n", + " \"the bag doesn't have enough room for the toy although the [toy] isn't large.\",\n", + " \"the box doesn't have enough room for the toy although the [toy] isn't large.\"],\n", + " [\"the bag doesn't hold the ball although the [bag] isn't small.\",\n", + " \"the box doesn't hold the ball although the [box] isn't small.\",\n", + " \"the bag doesn't hold the toy although the [bag] isn't small.\",\n", + " \"the box doesn't hold the toy although the [box] isn't 
small.\",\n", + " \"the bag doesn't have enough room for the ball although the [bag] isn't small.\",\n", + " \"the box doesn't have enough room for the ball although the [box] isn't small.\",\n", + " \"the bag doesn't have enough room for the toy although the [bag] isn't small.\",\n", + " \"the box doesn't have enough room for the toy although the [box] isn't small.\"],\n", + " [\"the bag doesn't hold the ball although the [ball] is small.\",\n", + " \"the box doesn't hold the ball although the [ball] is small.\",\n", + " \"the bag doesn't hold the toy although the [toy] is small.\",\n", + " \"the box doesn't hold the toy although the [toy] is small.\",\n", + " \"the bag doesn't have enough room for the ball although the [ball] is small.\",\n", + " \"the box doesn't have enough room for the ball although the [ball] is small.\",\n", + " \"the bag doesn't have enough room for the toy although the [toy] is small.\",\n", + " \"the box doesn't have enough room for the toy although the [toy] is small.\"],\n", + " [\"the bag doesn't hold the ball although the [bag] is large.\",\n", + " \"the box doesn't hold the ball although the [box] is large.\",\n", + " \"the bag doesn't hold the toy although the [bag] is large.\",\n", + " \"the box doesn't hold the toy although the [box] is large.\",\n", + " \"the bag doesn't have enough room for the ball although the [bag] is large.\",\n", + " \"the box doesn't have enough room for the ball although the [box] is large.\",\n", + " \"the bag doesn't have enough room for the toy although the [bag] is large.\",\n", + " \"the box doesn't have enough room for the toy although the [box] is large.\"],\n", + " ['the ball can fit into the bag although the [ball] is large.',\n", + " 'the ball can fit into the box although the [ball] is large.',\n", + " 'the toy can fit into the bag although the [toy] is large.',\n", + " 'the toy can fit into the box although the [toy] is large.',\n", + " 'the ball can be put into the bag although the [ball] is 
large.',\n", + " 'the ball can be put into the box although the [ball] is large.',\n", + " 'the toy can be put into the bag although the [toy] is large.',\n", + " 'the toy can be put into the box although the [toy] is large.'],\n", + " ['the ball can fit into the bag although the [bag] is small.',\n", + " 'the ball can fit into the box although the [box] is small.',\n", + " 'the toy can fit into the bag although the [bag] is small.',\n", + " 'the toy can fit into the box although the [box] is small.',\n", + " 'the ball can be put into the bag although the [bag] is small.',\n", + " 'the ball can be put into the box although the [box] is small.',\n", + " 'the toy can be put into the bag although the [bag] is small.',\n", + " 'the toy can be put into the box although the [box] is small.'],\n", + " [\"the ball can fit into the bag although the [ball] isn't small.\",\n", + " \"the ball can fit into the box although the [ball] isn't small.\",\n", + " \"the toy can fit into the bag although the [toy] isn't small.\",\n", + " \"the toy can fit into the box although the [toy] isn't small.\",\n", + " \"the ball can be put into the bag although the [ball] isn't small.\",\n", + " \"the ball can be put into the box although the [ball] isn't small.\",\n", + " \"the toy can be put into the bag although the [toy] isn't small.\",\n", + " \"the toy can be put into the box although the [toy] isn't small.\"],\n", + " [\"the ball can fit into the bag although the [bag] isn't large.\",\n", + " \"the ball can fit into the box although the [box] isn't large.\",\n", + " \"the toy can fit into the bag although the [bag] isn't large.\",\n", + " \"the toy can fit into the box although the [box] isn't large.\",\n", + " \"the ball can be put into the bag although the [bag] isn't large.\",\n", + " \"the ball can be put into the box although the [box] isn't large.\",\n", + " \"the toy can be put into the bag although the [bag] isn't large.\",\n", + " \"the toy can be put into the box although the 
[box] isn't large.\"],\n", + " ['the bag can hold the ball although the [ball] is large.',\n", + " 'the box can hold the ball although the [ball] is large.',\n", + " 'the bag can hold the toy although the [toy] is large.',\n", + " 'the box can hold the toy although the [toy] is large.',\n", + " 'the bag has enough room for the ball although the [ball] is large.',\n", + " 'the box has enough room for the ball although the [ball] is large.',\n", + " 'the bag has enough room for the toy although the [toy] is large.',\n", + " 'the box has enough room for the toy although the [toy] is large.'],\n", + " ['the bag can hold the ball although the [bag] is small.',\n", + " 'the box can hold the ball although the [box] is small.',\n", + " 'the bag can hold the toy although the [bag] is small.',\n", + " 'the box can hold the toy although the [box] is small.',\n", + " 'the bag has enough room for the ball although the [bag] is small.',\n", + " 'the box has enough room for the ball although the [box] is small.',\n", + " 'the bag has enough room for the toy although the [bag] is small.',\n", + " 'the box has enough room for the toy although the [box] is small.'],\n", + " [\"the bag can hold the ball although the [ball] isn't small.\",\n", + " \"the box can hold the ball although the [ball] isn't small.\",\n", + " \"the bag can hold the toy although the [toy] isn't small.\",\n", + " \"the box can hold the toy although the [toy] isn't small.\",\n", + " \"the bag has enough room for the ball although the [ball] isn't small.\",\n", + " \"the box has enough room for the ball although the [ball] isn't small.\",\n", + " \"the bag has enough room for the toy although the [toy] isn't small.\",\n", + " \"the box has enough room for the toy although the [toy] isn't small.\"],\n", + " [\"the bag can hold the ball although the [bag] isn't large.\",\n", + " \"the box can hold the ball although the [box] isn't large.\",\n", + " \"the bag can hold the toy although the [bag] isn't large.\",\n", + 
" \"the box can hold the toy although the [box] isn't large.\",\n", + " \"the bag has enough room for the ball although the [bag] isn't large.\",\n", + " \"the box has enough room for the ball although the [box] isn't large.\",\n", + " \"the bag has enough room for the toy although the [bag] isn't large.\",\n", + " \"the box has enough room for the toy although the [box] isn't large.\"]]" + ] + }, + "execution_count": 407, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "causal_sentences\n", + "turning_sentences\n", + "len(substituted_sent_groups)\n", + "substituted_sent_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 275, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "index: 190\n", + "\n", + "\n", + "1-1, The newspapers could be placed on all the chairs, because there were many of the newspapers. The newspapers could be placed on all the chairs, because there were few of the chairs. The newspapers could be placed on all the chairs, because there were not few of the newspapers. The newspapers could be placed on all the chairs, because there were not many of the chairs. The chairs could all be covered by the newspapers, because there were many of the newspapers. The chairs could all be covered by the newspapers, because there were few of the chairs. The chairs could all be covered by the newspapers, because there were not few of the newspapers. The chairs could all be covered by the newspapers, because there were not many of the chairs. The newspapers couldn't be placed on all the chairs, because there were not many of the newspapers. The newspapers couldn't be placed on all the chairs, because there were not few of the chairs. The newspapers couldn't be placed on all the chairs, because there were few of the newspapers. The newspapers couldn't be placed on all the chairs, because there were many of the chairs. 
The chairs couldn't all be covered by the newspapers, because there were not many of the newspapers. The chairs couldn't all be covered by the newspapers, because there were not few of the chairs. The chairs couldn't all be covered by the newspapers, because there were few of the newspapers. The chairs couldn't all be covered by the newspapers, because there were many of the chairs. \n", + "\n", + "\n", + "2-1, The newspapers could be placed on all the chairs, although there were not many of the newspapers. The newspapers could be placed on all the chairs, although there were not few of the chairs. The newspapers could be placed on all the chairs, although there were few of the newspapers. The newspapers could be placed on all the chairs, although there were many of the chairs. The chairs could all be covered by the newspapers, although there were not many of the newspapers. The chairs could all be covered by the newspapers, although there were not few of the chairs. The chairs could all be covered by the newspapers, although there were few of the newspapers. The chairs could all be covered by the newspapers, although there were many of the chairs. The newspapers couldn't be placed on all the chairs, although there were many of the newspapers. The newspapers couldn't be placed on all the chairs, although there were few of the chairs. The newspapers couldn't be placed on all the chairs, although there were not few of the newspapers. The newspapers couldn't be placed on all the chairs, although there were not many of the chairs. The chairs couldn't all be covered by the newspapers, although there were many of the newspapers. The chairs couldn't all be covered by the newspapers, although there were few of the chairs. The chairs couldn't all be covered by the newspapers, although there were not few of the newspapers. The chairs couldn't all be covered by the newspapers, although there were not many of the chairs. 
\n", + "\n", + "\n", + "******************************************\n", + "\n", + "\n", + "3-1, The chairs could all be covered by the newspapers, because there were many of the newspapers. The tables could all be covered by the newspapers, because there were many of the newspapers. The benches could all be covered by the newspapers, because there were many of the newspapers. The chairs could all be covered by the cups, because there were many of the cups. The tables could all be covered by the cups, because there were many of the cups. The benches could all be covered by the cups, because there were many of the cups. The chairs could all be covered by the pictures, because there were many of the pictures. The tables could all be covered by the pictures, because there were many of the pictures. The benches could all be covered by the pictures, because there were many of the pictures. The chairs couldn't all be covered by the newspapers, because there were not many of the newspapers. The tables couldn't all be covered by the newspapers, because there were not many of the newspapers. The benches couldn't all be covered by the newspapers, because there were not many of the newspapers. The chairs couldn't all be covered by the cups, because there were not many of the cups. The tables couldn't all be covered by the cups, because there were not many of the cups. The benches couldn't all be covered by the cups, because there were not many of the cups. The chairs couldn't all be covered by the pictures, because there were not many of the pictures. The tables couldn't all be covered by the pictures, because there were not many of the pictures. The benches couldn't all be covered by the pictures, because there were not many of the pictures. The chairs couldn't carry all the newspapers, although there were not few of the newspapers. The tables couldn't carry all the newspapers, although there were not few of the newspapers. 
The benches couldn't carry all the newspapers, although there were not few of the newspapers. The chairs couldn't carry all the cups, although there were not few of the cups. The tables couldn't carry all the cups, although there were not few of the cups. The benches couldn't carry all the cups, although there were not few of the cups. The chairs couldn't carry all the pictures, although there were not few of the pictures. The tables couldn't carry all the pictures, although there were not few of the pictures. The benches couldn't carry all the pictures, although there were not few of the pictures. The newspapers could be placed on all the chairs, although there were not many of the newspapers. The newspapers could be placed on all the tables, although there were not many of the newspapers. The newspapers could be placed on all the benches, although there were not many of the newspapers. The cups could be placed on all the chairs, although there were not many of the cups. The cups could be placed on all the tables, although there were not many of the cups. The cups could be placed on all the benches, although there were not many of the cups. The pictures could be placed on all the chairs, although there were not many of the pictures. The pictures could be placed on all the tables, although there were not many of the pictures. The pictures could be placed on all the benches, although there were not many of the pictures. The newspapers couldn't be put on all the chairs, because there were not many of the newspapers. The newspapers couldn't be put on all the tables, because there were not many of the newspapers. The newspapers couldn't be put on all the benches, because there were not many of the newspapers. The cups couldn't be put on all the chairs, because there were not many of the cups. The cups couldn't be put on all the tables, because there were not many of the cups. The cups couldn't be put on all the benches, because there were not many of the cups. 
The pictures couldn't be put on all the chairs, because there were not many of the pictures. The pictures couldn't be put on all the tables, because there were not many of the pictures. The pictures couldn't be put on all the benches, because there were not many of the pictures. The newspapers could be put on all the chairs, although there were many of the chairs. The newspapers could be put on all the tables, although there were many of the tables. The newspapers could be put on all the benches, although there were many of the benches. The cups could be put on all the chairs, although there were many of the chairs. The cups could be put on all the tables, although there were many of the tables. \n", + "\n", + "\n", + "3-2, The cups could be put on all the benches, although there were many of the benches. The pictures could be put on all the chairs, although there were many of the chairs. The pictures could be put on all the tables, although there were many of the tables. The pictures could be put on all the benches, although there were many of the benches. The chairs couldn't all be covered by the newspapers, because there were many of the chairs. The tables couldn't all be covered by the newspapers, because there were many of the tables. The benches couldn't all be covered by the newspapers, because there were many of the benches. The chairs couldn't all be covered by the cups, because there were many of the chairs. The tables couldn't all be covered by the cups, because there were many of the tables. The benches couldn't all be covered by the cups, because there were many of the benches. The chairs couldn't all be covered by the pictures, because there were many of the chairs. The tables couldn't all be covered by the pictures, because there were many of the tables. The benches couldn't all be covered by the pictures, because there were many of the benches. The newspapers could be placed on all the chairs, although there were many of the chairs. 
The newspapers could be placed on all the tables, although there were many of the tables. The newspapers could be placed on all the benches, although there were many of the benches. The cups could be placed on all the chairs, although there were many of the chairs. The cups could be placed on all the tables, although there were many of the tables. The cups could be placed on all the benches, although there were many of the benches. The pictures could be placed on all the chairs, although there were many of the chairs. The pictures could be placed on all the tables, although there were many of the tables. The pictures could be placed on all the benches, although there were many of the benches. The chairs couldn't all be covered by the newspapers, although there were many of the newspapers. The tables couldn't all be covered by the newspapers, although there were many of the newspapers. The benches couldn't all be covered by the newspapers, although there were many of the newspapers. The chairs couldn't all be covered by the cups, although there were many of the cups. The tables couldn't all be covered by the cups, although there were many of the cups. The benches couldn't all be covered by the cups, although there were many of the cups. The chairs couldn't all be covered by the pictures, although there were many of the pictures. The tables couldn't all be covered by the pictures, although there were many of the pictures. The benches couldn't all be covered by the pictures, although there were many of the pictures. The newspapers could be put on all the chairs, although there were not few of the chairs. The newspapers could be put on all the tables, although there were not few of the tables. The newspapers could be put on all the benches, although there were not few of the benches. The cups could be put on all the chairs, although there were not few of the chairs. The cups could be put on all the tables, although there were not few of the tables. 
The cups could be put on all the benches, although there were not few of the benches. The pictures could be put on all the chairs, although there were not few of the chairs. The pictures could be put on all the tables, although there were not few of the tables. The pictures could be put on all the benches, although there were not few of the benches. The newspapers couldn't be put on all the chairs, because there were many of the chairs. The newspapers couldn't be put on all the tables, because there were many of the tables. The newspapers couldn't be put on all the benches, because there were many of the benches. The cups couldn't be put on all the chairs, because there were many of the chairs. The cups couldn't be put on all the tables, because there were many of the tables. The cups couldn't be put on all the benches, because there were many of the benches. The pictures couldn't be put on all the chairs, because there were many of the chairs. The pictures couldn't be put on all the tables, because there were many of the tables. The pictures couldn't be put on all the benches, because there were many of the benches. The chairs couldn't carry all the newspapers, because there were many of the chairs. \n", + "\n", + "\n", + "******************************************\n", + "\n", + "\n", + "3-3, The tables couldn't carry all the newspapers, because there were many of the tables. The benches couldn't carry all the newspapers, because there were many of the benches. The chairs couldn't carry all the cups, because there were many of the chairs. The tables couldn't carry all the cups, because there were many of the tables. The benches couldn't carry all the cups, because there were many of the benches. The chairs couldn't carry all the pictures, because there were many of the chairs. The tables couldn't carry all the pictures, because there were many of the tables. The benches couldn't carry all the pictures, because there were many of the benches. 
The newspapers could be put on all the chairs, because there were few of the chairs. The newspapers could be put on all the tables, because there were few of the tables. The newspapers could be put on all the benches, because there were few of the benches. The cups could be put on all the chairs, because there were few of the chairs. The cups could be put on all the tables, because there were few of the tables. The cups could be put on all the benches, because there were few of the benches. The pictures could be put on all the chairs, because there were few of the chairs. The pictures could be put on all the tables, because there were few of the tables. The pictures could be put on all the benches, because there were few of the benches. The newspapers couldn't be placed on all the chairs, although there were few of the chairs. The newspapers couldn't be placed on all the tables, although there were few of the tables. The newspapers couldn't be placed on all the benches, although there were few of the benches. The cups couldn't be placed on all the chairs, although there were few of the chairs. The cups couldn't be placed on all the tables, although there were few of the tables. The cups couldn't be placed on all the benches, although there were few of the benches. The pictures couldn't be placed on all the chairs, although there were few of the chairs. The pictures couldn't be placed on all the tables, although there were few of the tables. The pictures couldn't be placed on all the benches, although there were few of the benches. The newspapers could be placed on all the chairs, because there were many of the newspapers. The newspapers could be placed on all the tables, because there were many of the newspapers. The newspapers could be placed on all the benches, because there were many of the newspapers. The cups could be placed on all the chairs, because there were many of the cups. The cups could be placed on all the tables, because there were many of the cups. 
The cups could be placed on all the benches, because there were many of the cups. The pictures could be placed on all the chairs, because there were many of the pictures. The pictures could be placed on all the tables, because there were many of the pictures. The pictures could be placed on all the benches, because there were many of the pictures. The chairs couldn't carry all the newspapers, because there were few of the newspapers. The tables couldn't carry all the newspapers, because there were few of the newspapers. The benches couldn't carry all the newspapers, because there were few of the newspapers. The chairs couldn't carry all the cups, because there were few of the cups. The tables couldn't carry all the cups, because there were few of the cups. The benches couldn't carry all the cups, because there were few of the cups. The chairs couldn't carry all the pictures, because there were few of the pictures. The tables couldn't carry all the pictures, because there were few of the pictures. The benches couldn't carry all the pictures, because there were few of the pictures. The chairs could all be covered by the newspapers, because there were not many of the chairs. The tables could all be covered by the newspapers, because there were not many of the tables. The benches could all be covered by the newspapers, because there were not many of the benches. The chairs could all be covered by the cups, because there were not many of the chairs. The tables could all be covered by the cups, because there were not many of the tables. The benches could all be covered by the cups, because there were not many of the benches. 
\n" + ] + } + ], + "source": [ + "num = next(i for i in range(len(frames)) if frames[i][\"index\"] == 190)\n", + "\n", + "print(\"index:\", frames[num][\"index\"])\n", + "print(\"\\n\")\n", + "\n", + "\n", + "def add_sentence(article, sentence):\n", + " if sentence[:3] == \"the\":\n", + " sentence = sentence.replace(\"the\", \"The\", 1)\n", + " sentence = sentence.replace(\"[\", \"\")\n", + " sentence = sentence.replace(\"]\", \"\")\n", + " sentence = sentence.replace(\" <\", \", \")\n", + " sentence = sentence.replace(\">\", \"\")\n", + " if article.count(\".\") < 50:\n", + " article += \"{} \".format(sentence)\n", + " return True, article\n", + " return False, article\n", + "\n", + "\n", + "article_c = \"\"\n", + "article_t = \"\"\n", + "\n", + "article_s1 = \"\"\n", + "article_s2 = \"\"\n", + "article_s3 = \"\"\n", + "\n", + "c, t, s = make_sentences(A_template, B_template, causal_templates, turning_templates, **frames[num])\n", + "for j in c:\n", + " _, article_c = add_sentence(article_c, j)\n", + "for j in t:\n", + " _, article_t = add_sentence(article_t, j)\n", + "for j in s:\n", + " success, article_s1 = add_sentence(article_s1, j)\n", + " if not success:\n", + " success, article_s2 = add_sentence(article_s2, j)\n", + " if not success:\n", + " success, article_s3 = add_sentence(article_s3, j)\n", + "\n", + " \n", + "\n", + "print(\"1-1,\", article_c)\n", + "print(\"\\n\")\n", + "print(\"2-1,\", article_t)\n", + "print(\"\\n\")\n", + "print(\"******************************************\")\n", + "print(\"\\n\")\n", + "print(\"3-1,\", article_s1) \n", + "print(\"\\n\")\n", + "print(\"3-2,\", article_s2)\n", + "print(\"\\n\")\n", + "print(\"******************************************\")\n", + "print(\"\\n\")\n", + "print(\"3-3,\", article_s3) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 267, + "metadata": {}, + "outputs": [], + "source": [ + "error_index: {\n", + " \"replcae_error\": {\n", + " \"index\": [12, 22, 38, 188, 226],\n", + " \"example\": 
[\n", + " \"John didn't defeated Sue in the running race although Sue had a bad start.\",\n", + " \"The cakes don't are not as popular as the apples so the cakes shouldn't be made less next time.\"\n", + " ],\n", + " \"error\": [\n", + " \"didn't defeated\",\n", + " \"don't are not\"\n", + " ]\n", + " },\n", + " \"add ',' before 'but' and 'so'\": {}\n", + "}\n", + "TODO: {\"index\":[190, \"few of -> lack of\"]} " + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = \"[mask]\"\n", + "s.endswith(\"]\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/WSC_associative_label.json b/WSC_associative_label.json new file mode 100644 index 00000000000000..4fd2fd1015ae5d --- /dev/null +++ b/WSC_associative_label.json @@ -0,0 +1 @@ +[{"index": 0, "sentence": "The city councilmen refused the demonstrators a permit because [they] feared violence.", "answer1": "The demonstrators", "answer0": "The city councilmen", "is_associative": 0, "correct_answer": "The city councilmen"}, {"index": 1, "sentence": "The city councilmen refused the demonstrators a permit because [they] advocated violence.", "answer1": "The demonstrators", "answer0": "The city councilmen", "is_associative": 0, "correct_answer": "The demonstrators"}, {"index": 2, "sentence": "The trophy doesn't fit into the brown suitcase because [it] is too large.", "answer1": "the suitcase", "answer0": "the trophy", "is_associative": 0, 
"correct_answer": "the trophy"}, {"index": 3, "sentence": "The trophy doesn't fit into the brown suitcase because [it] is too small.", "answer1": "the suitcase", "answer0": "the trophy", "is_associative": 0, "correct_answer": "the suitcase"}, {"index": 4, "sentence": "Joan made sure to thank Susan for all the help [she] had recieved.", "answer1": "Susan", "answer0": "Joan", "is_associative": 0, "correct_answer": "Joan"}, {"index": 5, "sentence": "Joan made sure to thank Susan for all the help [she] had given.", "answer1": "Susan", "answer0": "Joan", "is_associative": 0, "correct_answer": "Susan"}, {"index": 6, "sentence": "Paul tried to call George on the phone, but [he] wasn't successful.", "answer1": "George", "answer0": "Paul", "is_associative": 0, "correct_answer": "Paul"}, {"index": 7, "sentence": "Paul tried to call George on the phone, but [he] wasn't available.", "answer1": "George", "answer0": "Paul", "is_associative": 0, "correct_answer": "George"}, {"index": 8, "sentence": "The lawyer asked the witness a question, but [he] was reluctant to repeat it.", "answer1": "the witness", "answer0": "the lawyer", "is_associative": 0, "correct_answer": "the lawyer"}, {"index": 9, "sentence": "The lawyer asked the witness a question, but [he] was reluctant to answer it.", "answer1": "the witness", "answer0": "the lawyer", "is_associative": 0, "correct_answer": "the witness"}, {"index": 10, "sentence": "The delivery truck zoomed by the school bus because [it] was going so fast.", "answer1": "the school bus", "answer0": "the delivery truck", "is_associative": 0, "correct_answer": "the delivery truck"}, {"index": 11, "sentence": "The delivery truck zoomed by the school bus because [it] was going so slow.", "answer1": "the school bus", "answer0": "the delivery truck", "is_associative": 0, "correct_answer": "the school bus"}, {"index": 12, "sentence": "Frank felt vindicated when his longtime rival Bill revealed that [he] was the winner of the competition.", "answer1": 
"Bill", "answer0": "Frank", "is_associative": 0, "correct_answer": "Frank"}, {"index": 13, "sentence": "Frank felt crushed when his longtime rival Bill revealed that [he] was the winner of the competition.", "answer1": "Bill", "answer0": "Frank", "is_associative": 0, "correct_answer": "Bill"}, {"index": 14, "sentence": "The man couldn't lift his son because [he] was so weak.", "answer1": "The son", "answer0": "The man", "is_associative": 0, "correct_answer": "The man"}, {"index": 15, "sentence": "The man couldn't lift his son because [he] was so heavy.", "answer1": "The son", "answer0": "The man", "is_associative": 0, "correct_answer": "The son"}, {"index": 16, "sentence": "The large ball crashed right through the table because [it] was made of steel.", "answer1": "The table", "answer0": "The large ball", "is_associative": 0, "correct_answer": "The large ball"}, {"index": 17, "sentence": "The large ball crashed right through the table because [it] was made of styrofoam.", "answer1": "The table", "answer0": "The large ball", "is_associative": 0, "correct_answer": "The table"}, {"index": 18, "sentence": "John couldn't see the stage with Billy in front of him because [he] is so short.", "answer1": "Billy", "answer0": "John", "is_associative": 0, "correct_answer": "John"}, {"index": 19, "sentence": "John couldn't see the stage with Billy in front of him because [he] is so tall.", "answer1": "Billy", "answer0": "John", "is_associative": 0, "correct_answer": "Billy"}, {"index": 20, "sentence": "Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.", "answer1": "Ray", "answer0": "Tom", "is_associative": 0, "correct_answer": "Tom"}, {"index": 21, "sentence": "Tom threw his schoolbag down to Ray after [he] reached the bottom of the stairs.", "answer1": "Ray", "answer0": "Tom", "is_associative": 0, "correct_answer": "Ray"}, {"index": 22, "sentence": "Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.", 
"answer1": "Sally", "answer0": "Sue", "is_associative": 0, "correct_answer": "Sue"}, {"index": 23, "sentence": "Although they ran at about the same speed, Sue beat Sally because [she] had such a bad start.", "answer1": "Sally", "answer0": "Sue", "is_associative": 0, "correct_answer": "Sally"}, {"index": 24, "sentence": "The sculpture rolled off the shelf because [it] wasn't anchored.", "answer1": "The shelf", "answer0": "The sculpture", "is_associative": 0, "correct_answer": "The sculpture"}, {"index": 25, "sentence": "The sculpture rolled off the shelf because [it] wasn't level.", "answer1": "The shelf", "answer0": "The sculpture", "is_associative": 0, "correct_answer": "The shelf"}, {"index": 26, "sentence": "Sam's drawing was hung just above Tina's and [it] did look much better with another one below it.", "answer1": "Tina's drawing", "answer0": "Sam's drawing", "is_associative": 0, "correct_answer": "Sam's drawing"}, {"index": 27, "sentence": "Sam's drawing was hung just above Tina's and [it] did look much better with another one above it.", "answer1": "Tina's drawing", "answer0": "Sam's drawing", "is_associative": 0, "correct_answer": "Tina's drawing"}, {"index": 28, "sentence": "Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.", "answer1": "Lucy", "answer0": "Anna", "is_associative": 0, "correct_answer": "Anna"}, {"index": 29, "sentence": "Anna did a lot worse than her good friend Lucy on the test because [she] had studied so hard.", "answer1": "Lucy", "answer0": "Anna", "is_associative": 0, "correct_answer": "Lucy"}, {"index": 30, "sentence": "The firemen arrived after the police because [they] were coming from so far away.", "answer1": "The police", "answer0": "The firemen", "is_associative": 0, "correct_answer": "The firemen"}, {"index": 31, "sentence": "The firemen arrived before the police because [they] were coming from so far away.", "answer1": "The police", "answer0": "The firemen", "is_associative": 0, 
"correct_answer": "The police"}, {"index": 32, "sentence": "Frank was upset with Tom because the toaster [he] had bought from him didn't work.", "answer1": "Tom", "answer0": "Frank", "is_associative": 0, "correct_answer": "Frank"}, {"index": 33, "sentence": "Frank was upset with Tom because the toaster [he] had sold him didn't work.", "answer1": "Tom", "answer0": "Frank", "is_associative": 0, "correct_answer": "Tom"}, {"index": 34, "sentence": "Jim yelled at Kevin because [he] was so upset.", "answer1": "Kevin", "answer0": "Jim", "is_associative": 0, "correct_answer": "Jim"}, {"index": 35, "sentence": "Jim comforted Kevin because [he] was so upset.", "answer1": "Kevin", "answer0": "Jim", "is_associative": 0, "correct_answer": "Kevin"}, {"index": 36, "sentence": "The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first.", "answer1": "The bag of flour", "answer0": "The sack of potatoes", "is_associative": 0, "correct_answer": "The sack of potatoes"}, {"index": 37, "sentence": "The sack of potatoes had been placed below the bag of flour, so [it] had to be moved first.", "answer1": "The bag of flour", "answer0": "The sack of potatoes", "is_associative": 0, "correct_answer": "The bag of flour"}, {"index": 38, "sentence": "Pete envies Martin although [he] is very successful.", "answer1": "Martin", "answer0": "Pete", "is_associative": 0, "correct_answer": "Pete"}, {"index": 39, "sentence": "Pete envies Martin because [he] is very successful.", "answer1": "Martin", "answer0": "Pete", "is_associative": 0, "correct_answer": "Martin"}, {"index": 40, "sentence": "The older students were bullying the younger ones, so we punished [them] .", "answer1": "The younger students", "answer0": "The older students", "is_associative": 0, "correct_answer": "The older students"}, {"index": 41, "sentence": "The older students were bullying the younger ones, so we rescued [them] .", "answer1": "The younger students", "answer0": "The older students", 
"is_associative": 0, "correct_answer": "The younger students"}, {"index": 42, "sentence": "I poured water from the bottle into the cup until [it] was empty.", "answer1": "the cup", "answer0": "the bottle", "is_associative": 0, "correct_answer": "the bottle"}, {"index": 43, "sentence": "I poured water from the bottle into the cup until [it] was full.", "answer1": "the cup", "answer0": "the bottle", "is_associative": 0, "correct_answer": "the cup"}, {"index": 44, "sentence": "Susan knows all about Ann's personal problems because [she] is nosy.", "answer1": "Ann", "answer0": "Susan", "is_associative": 0, "correct_answer": "Susan"}, {"index": 45, "sentence": "Susan knows all about Ann's personal problems because [she] is indiscreet.", "answer1": "Ann", "answer0": "Susan", "is_associative": 0, "correct_answer": "Ann"}, {"index": 46, "sentence": "Sid explained his theory to Mark but [he] couldn't convince him.", "answer1": "Mark", "answer0": "Sid", "is_associative": 0, "correct_answer": "Sid"}, {"index": 47, "sentence": "Sid explained his theory to Mark but [he] couldn't understand him.", "answer1": "Mark", "answer0": "Sid", "is_associative": 0, "correct_answer": "Mark"}, {"index": 48, "sentence": "Susan knew that Ann's son had been in a car accident, so [she] told her about it.", "answer1": "Ann", "answer0": "Susan", "is_associative": 0, "correct_answer": "Susan"}, {"index": 49, "sentence": "Susan knew that Ann's son had been in a car accident, because [she] told her about it.", "answer1": "Ann", "answer0": "Susan", "is_associative": 0, "correct_answer": "Ann"}, {"index": 50, "sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.", "answer1": "Joe's uncle", "answer0": "Joe", "is_associative": 0, "correct_answer": "Joe"}, {"index": 51, "sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years older.", "answer1": "Joe's uncle", "answer0": "Joe", "is_associative": 0, "correct_answer": "Joe's uncle"}, {"index": 
52, "sentence": "The painting in Mark's living room shows an oak tree. [It] is to the right of the bookcase.", "answer1": "The oak tree", "answer0": "The painting", "is_associative": 0, "correct_answer": "The painting"}, {"index": 56, "sentence": "The drain is clogged with hair. [It] has to be cleaned.", "answer1": "The hair", "answer0": "The drain", "is_associative": 0, "correct_answer": "The drain"}, {"index": 57, "sentence": "The drain is clogged with hair. [It] has to be removed.", "answer1": "The hair", "answer0": "The drain", "is_associative": 0, "correct_answer": "The hair"}, {"index": 59, "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. Luckily, [it] was delayed, so it worked out.", "answer1": "The train", "answer0": "The meeting", "is_associative": 0, "correct_answer": "The train"}, {"index": 60, "sentence": "There is a pillar between me and the stage, and I can't see around [it] .", "answer1": "The stage", "answer0": "The pillar", "is_associative": 0, "correct_answer": "The pillar"}, {"index": 61, "sentence": "There is a pillar between me and the stage, and I can't see [it] .", "answer1": "The stage", "answer0": "The pillar", "is_associative": 0, "correct_answer": "The stage"}, {"index": 62, "sentence": "They broadcast an announcement, but a subway came into the station and I couldn't hear [it] .", "answer1": "The subway", "answer0": "The announcement", "is_associative": 0, "correct_answer": "The announcement"}, {"index": 63, "sentence": "They broadcast an announcement, but a subway came into the station and I couldn't hear over [it] .", "answer1": "The subway", "answer0": "The announcement", "is_associative": 0, "correct_answer": "The subway"}, {"index": 64, "sentence": "In the middle of the outdoor concert, the rain started falling, but [it] continued until 10.", "answer1": "The rain", "answer0": "The concert", "is_associative": 0, "correct_answer": "The concert"}, {"index": 65, "sentence": "In 
the middle of the outdoor concert, the rain started falling, and [it] continued until 10.", "answer1": "The rain", "answer0": "The concert", "is_associative": 0, "correct_answer": "The rain"}, {"index": 66, "sentence": "I used an old rag to clean the knife, and then I put [it] in the trash.", "answer1": "The knife", "answer0": "The rag", "is_associative": 0, "correct_answer": "The rag"}, {"index": 67, "sentence": "I used an old rag to clean the knife, and then I put [it] in the drawer.", "answer1": "The knife", "answer0": "The rag", "is_associative": 0, "correct_answer": "The knife"}, {"index": 68, "sentence": "Ann asked Mary what time the library closes, because [she] had forgotten.", "answer1": "Mary", "answer0": "Ann", "is_associative": 0, "correct_answer": "Ann"}, {"index": 69, "sentence": "Ann asked Mary what time the library closes, but [she] had forgotten.", "answer1": "Mary", "answer0": "Ann", "is_associative": 0, "correct_answer": "Mary"}, {"index": 70, "sentence": "I took the water bottle out of the backpack so that [it] would be handy.", "answer1": "The backpack", "answer0": "The water bottle", "is_associative": 0, "correct_answer": "The water bottle"}, {"index": 71, "sentence": "I took the water bottle out of the backpack so that [it] would be lighter.", "answer1": "The backpack", "answer0": "The water bottle", "is_associative": 0, "correct_answer": "The backpack"}, {"index": 73, "sentence": "I couldn't put the pot on the shelf because [it] was too high.", "answer1": "The shelf", "answer0": "The pot", "is_associative": 0, "correct_answer": "The shelf"}, {"index": 76, "sentence": "Bob paid for Charlie's college education. [He] is very generous.", "answer1": "Charlie", "answer0": "Bob", "is_associative": 0, "correct_answer": "Bob"}, {"index": 77, "sentence": "Bob paid for Charlie's college education. 
[He] is very grateful.", "answer1": "Charlie", "answer0": "Bob", "is_associative": 0, "correct_answer": "Charlie"}, {"index": 78, "sentence": "Bob paid for Charlie's college education, but now Charlie acts as though it never happened. [He] is very hurt.", "answer1": "Charlie", "answer0": "Bob", "is_associative": 0, "correct_answer": "Bob"}, {"index": 79, "sentence": "Bob paid for Charlie's college education, but now Charlie acts as though it never happened. [He] is very ungrateful.", "answer1": "Charlie", "answer0": "Bob", "is_associative": 0, "correct_answer": "Charlie"}, {"index": 80, "sentence": "Bob was playing cards with Adam and was way ahead. If Adam hadn't had a sudden run of good luck, [he] would have won.", "answer1": "Adam", "answer0": "Bob", "is_associative": 0, "correct_answer": "Bob"}, {"index": 81, "sentence": "Bob was playing cards with Adam and was way ahead. If Adam hadn't had a sudden run of good luck, [he] would have lost.", "answer1": "Adam", "answer0": "Bob", "is_associative": 0, "correct_answer": "Adam"}, {"index": 82, "sentence": "Adam can't leave work here until Bob arrives to replace him. If Bob had left home for work on time, [he] would be gone by this time.", "answer1": "Bob", "answer0": "Adam", "is_associative": 0, "correct_answer": "Adam"}, {"index": 83, "sentence": "Adam can't leave work here until Bob arrives to replace him. 
If Bob had left home for work on time, [he] would be here by this time.", "answer1": "Bob", "answer0": "Adam", "is_associative": 0, "correct_answer": "Bob"}, {"index": 84, "sentence": "If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.", "answer1": "Sam", "answer0": "The con artist", "is_associative": 0, "correct_answer": "The con artist"}, {"index": 85, "sentence": "If the con artist has succeeded in fooling Sam, [he] would have lost a lot of money.", "answer1": "Sam", "answer0": "The con artist", "is_associative": 0, "correct_answer": "Sam"}, {"index": 86, "sentence": "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because [it] was hot.", "answer1": "The spot under the tree", "answer0": "The dog", "is_associative": 0, "correct_answer": "The dog"}, {"index": 88, "sentence": "The cat was lying by the mouse hole waiting for the mouse, but [it] was too impatient.", "answer1": "The mouse", "answer0": "The cat", "is_associative": 0, "correct_answer": "The cat"}, {"index": 89, "sentence": "The cat was lying by the mouse hole waiting for the mouse, but [it] was too cautious.", "answer1": "The mouse", "answer0": "The cat", "is_associative": 0, "correct_answer": "The mouse"}, {"index": 90, "sentence": "Anne gave birth to a daughter last month. [She] is a very charming woman.", "answer1": "Anne's daughter", "answer0": "Anne", "is_associative": 0, "correct_answer": "Anne"}, {"index": 91, "sentence": "Anne gave birth to a daughter last month. 
[She] is a very charming baby.", "answer1": "Anne's daughter", "answer0": "Anne", "is_associative": 0, "correct_answer": "Anne's daughter"}, {"index": 92, "sentence": "Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.", "answer1": "Alice's daughter", "answer0": "Alice", "is_associative": 0, "correct_answer": "Alice"}, {"index": 93, "sentence": "Alice tried frantically to stop her daughter from barking at the party, leaving us to wonder why [she] was behaving so strangely.", "answer1": "Alice's daughter", "answer0": "Alice", "is_associative": 0, "correct_answer": "Alice's daughter"}, {"index": 94, "sentence": "I saw Jim yelling at some guy in a military uniform with a huge red beard. I don't know why [he] was, but he looked very unhappy.", "answer1": "the guy in uniform", "answer0": "Jim", "is_associative": 0, "correct_answer": "Jim"}, {"index": 95, "sentence": "I saw Jim yelling at some guy in a military uniform with a huge red beard. I don't know who [he] was, but he looked very unhappy.", "answer1": "the guy in uniform", "answer0": "Jim", "is_associative": 0, "correct_answer": "the guy in uniform"}, {"index": 96, "sentence": "The fish ate the worm. [It] was hungry.", "answer1": "The worm", "answer0": "The fish", "is_associative": 0, "correct_answer": "The fish"}, {"index": 97, "sentence": "The fish ate the worm. [It] was tasty.", "answer1": "The worm", "answer0": "The fish", "is_associative": 0, "correct_answer": "The worm"}, {"index": 99, "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] out.", "answer1": "The chewing gum", "answer0": "The key", "is_associative": 0, "correct_answer": "The chewing gum"}, {"index": 100, "sentence": "The dog chased the cat, which ran up a tree. 
[It] waited at the bottom.", "answer1": "The cat", "answer0": "The dog", "is_associative": 0, "correct_answer": "The dog"}, {"index": 101, "sentence": "The dog chased the cat, which ran up a tree. [It] waited at the top.", "answer1": "The cat", "answer0": "The dog", "is_associative": 0, "correct_answer": "The cat"}, {"index": 102, "sentence": "In the storm, the tree fell down and crashed through the roof of my house. Now, I have to get [it] removed.", "answer1": "The roof", "answer0": "The tree", "is_associative": 0, "correct_answer": "The tree"}, {"index": 104, "sentence": "The customer walked into the bank and stabbed one of the tellers. [He] was immediately taken to the police station.", "answer1": "The teller", "answer0": "The customer", "is_associative": 0, "correct_answer": "The customer"}, {"index": 105, "sentence": "The customer walked into the bank and stabbed one of the tellers. [He] was immediately taken to the hospital.", "answer1": "The teller", "answer0": "The customer", "is_associative": 0, "correct_answer": "The teller"}, {"index": 106, "sentence": "John was doing research in the library when he heard a man humming and whistling. [He] was very annoyed.", "answer1": "The man", "answer0": "John", "is_associative": 0, "correct_answer": "John"}, {"index": 107, "sentence": "John was doing research in the library when he heard a man humming and whistling. [He] was very annoying.", "answer1": "The man", "answer0": "John", "is_associative": 0, "correct_answer": "The man"}, {"index": 108, "sentence": "John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.", "answer1": "The juggler", "answer0": "John", "is_associative": 0, "correct_answer": "John"}, {"index": 110, "sentence": "Bob collapsed on the sidewalk. Soon he saw Carl coming to help. [He] was very ill.", "answer1": "Carl", "answer0": "Bob", "is_associative": 0, "correct_answer": "Bob"}, {"index": 111, "sentence": "Bob collapsed on the sidewalk. 
Soon he saw Carl coming to help. [He] was very concerned.", "answer1": "Carl", "answer0": "Bob", "is_associative": 0, "correct_answer": "Carl"}, {"index": 113, "sentence": "Sam and Amy are passionately in love, but Amy's parents are unhappy about it, because [they] are snobs.", "answer1": "Amy's parents", "answer0": "Sam and Amy", "is_associative": 0, "correct_answer": "Amy's parents"}, {"index": 114, "sentence": "Mark told Pete many lies about himself, which Pete included in his book. [He] should have been more truthful.", "answer1": "Pete", "answer0": "Mark", "is_associative": 0, "correct_answer": "Mark"}, {"index": 115, "sentence": "Mark told Pete many lies about himself, which Pete included in his book. [He] should have been more skeptical.", "answer1": "Pete", "answer0": "Mark", "is_associative": 0, "correct_answer": "Pete"}, {"index": 121, "sentence": "Mary took out her flute and played one of her favorite pieces. She has loved [it] since she was a child.", "answer1": "The piece", "answer0": "The flute", "is_associative": 0, "correct_answer": "The piece"}, {"index": 122, "sentence": "Sam pulled up a chair to the piano, but [it] was broken, so he had to stand instead.", "answer1": "The piano", "answer0": "The chair", "is_associative": 0, "correct_answer": "The chair"}, {"index": 123, "sentence": "Sam pulled up a chair to the piano, but [it] was broken, so he had to sing instead.", "answer1": "The piano", "answer0": "The chair", "is_associative": 0, "correct_answer": "The piano"}, {"index": 124, "sentence": "Since it was raining, I carried the newspaper in my backpack to keep [it] dry.", "answer1": "The backpack", "answer0": "The newspaper", "is_associative": 0, "correct_answer": "The newspaper"}, {"index": 125, "sentence": "Since it was raining, I carried the newspaper over my backpack to keep [it] dry.", "answer1": "The backpack", "answer0": "The newspaper", "is_associative": 0, "correct_answer": "The backpack"}, {"index": 126, "sentence": "Sara borrowed the 
book from the library because she needs it for an article she is working on. She reads [it] when she gets home from work.", "answer1": "The article", "answer0": "The book", "is_associative": 0, "correct_answer": "The book"}, {"index": 127, "sentence": "Sara borrowed the book from the library because she needs it for an article she is working on. She writes [it] when she gets home from work.", "answer1": "The article", "answer0": "The book", "is_associative": 0, "correct_answer": "The article"}, {"index": 128, "sentence": "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the tide knocked [it] down.", "answer1": "The flag", "answer0": "The sand castle", "is_associative": 0, "correct_answer": "The sand castle"}, {"index": 129, "sentence": "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the wind knocked [it] down.", "answer1": "The flag", "answer0": "The sand castle", "is_associative": 0, "correct_answer": "The flag"}, {"index": 130, "sentence": "Jane knocked on Susan's door, but there was no answer. [She] was disappointed.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Jane"}, {"index": 131, "sentence": "Jane knocked on Susan's door, but there was no answer. [She] was out.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Susan"}, {"index": 132, "sentence": "Jane knocked on the door, and Susan answered it. [She] invited her to come out.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Jane"}, {"index": 133, "sentence": "Jane knocked on the door, and Susan answered it. 
[She] invited her to come in.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Susan"}, {"index": 134, "sentence": "Sam took French classes from Adam, because [he] was eager to speak it fluently.", "answer1": "Adam", "answer0": "Sam", "is_associative": 0, "correct_answer": "Sam"}, {"index": 135, "sentence": "Sam took French classes from Adam, because [he] was known to speak it fluently.", "answer1": "Adam", "answer0": "Sam", "is_associative": 0, "correct_answer": "Adam"}, {"index": 139, "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, [it] was gone.", "answer1": "The cloud", "answer0": "The sun", "is_associative": 0, "correct_answer": "The cloud"}, {"index": 140, "sentence": "We went to the lake, because a shark had been seen at the ocean beach, so [it] was a safer place to swim.", "answer1": "The ocean beach", "answer0": "The lake", "is_associative": 0, "correct_answer": "The lake"}, {"index": 141, "sentence": "We went to the lake, because a shark had been seen at the ocean beach, so [it] was a dangerous place to swim.", "answer1": "The ocean beach", "answer0": "The lake", "is_associative": 0, "correct_answer": "The ocean beach"}, {"index": 142, "sentence": "Sam tried to paint a picture of shepherds with sheep, but [they] ended up looking more like golfers.", "answer1": "The sheep", "answer0": "The shepherds", "is_associative": 0, "correct_answer": "The shepherds"}, {"index": 143, "sentence": "Sam tried to paint a picture of shepherds with sheep, but [they] ended up looking more like dogs.", "answer1": "The sheep", "answer0": "The shepherds", "is_associative": 0, "correct_answer": "The sheep"}, {"index": 144, "sentence": "Mary tucked her daughter Anne into bed, so that [she] could work.", "answer1": "Mary's daughter", "answer0": "Mary", "is_associative": 0, "correct_answer": "Mary"}, {"index": 145, "sentence": "Mary tucked her daughter Anne into bed, so that [she] could 
sleep.", "answer1": "Mary's daughter", "answer0": "Mary", "is_associative": 0, "correct_answer": "Mary's daughter"}, {"index": 148, "sentence": "Thomson visited Cooper's grave in 1765. At that date [he] had been travelling for five years.", "answer1": "Cooper", "answer0": "Thomson", "is_associative": 0, "correct_answer": "Thomson"}, {"index": 149, "sentence": "Thomson visited Cooper's grave in 1765. At that date [he] had been dead for five years.", "answer1": "Cooper", "answer0": "Thomson", "is_associative": 0, "correct_answer": "Cooper"}, {"index": 150, "sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries later.", "answer1": "Arnold", "answer0": "Jackson", "is_associative": 0, "correct_answer": "Jackson"}, {"index": 151, "sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries earlier.", "answer1": "Arnold", "answer0": "Jackson", "is_associative": 0, "correct_answer": "Arnold"}, {"index": 152, "sentence": "I can't cut that tree down with that axe; [it] is too thick.", "answer1": "The axe", "answer0": "The tree", "is_associative": 0, "correct_answer": "The tree"}, {"index": 153, "sentence": "I can't cut that tree down with that axe; [it] is too small.", "answer1": "The axe", "answer0": "The tree", "is_associative": 0, "correct_answer": "The axe"}, {"index": 154, "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to kill [them] .", "answer1": "The chickens", "answer0": "The foxes", "is_associative": 0, "correct_answer": "The foxes"}, {"index": 156, "sentence": "The foxes are getting in at night and attacking the chickens. [They] have gotten very bold.", "answer1": "The chickens", "answer0": "The foxes", "is_associative": 0, "correct_answer": "The foxes"}, {"index": 157, "sentence": "The foxes are getting in at night and attacking the chickens. 
[They] have gotten very nervous.", "answer1": "The chickens", "answer0": "The foxes", "is_associative": 0, "correct_answer": "The chickens"}, {"index": 159, "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. He lowered [them] when the wind stopped.", "answer1": "His hands", "answer0": "His eyes", "is_associative": 0, "correct_answer": "His hands"}, {"index": 160, "sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.", "answer1": "Tina", "answer0": "Terpsichore", "is_associative": 0, "correct_answer": "Terpsichore"}, {"index": 161, "sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was easier to pronounce.", "answer1": "Tina", "answer0": "Terpsichore", "is_associative": 0, "correct_answer": "Tina"}, {"index": 162, "sentence": "Fred watched TV while George went out to buy groceries. After an hour [he] got up.", "answer1": "George", "answer0": "Fred", "is_associative": 0, "correct_answer": "Fred"}, {"index": 163, "sentence": "Fred watched TV while George went out to buy groceries. After an hour [he] got back.", "answer1": "George", "answer0": "Fred", "is_associative": 0, "correct_answer": "George"}, {"index": 164, "sentence": "Fred was supposed to run the dishwasher, but he put it off, because he wanted to watch TV. But the show turned out to be boring, so he changed his mind and turned [it] on.", "answer1": "The TV", "answer0": "The dishwasher", "is_associative": 0, "correct_answer": "The dishwasher"}, {"index": 165, "sentence": "Fred was supposed to run the dishwasher, but he put it off, because he wanted to watch TV. 
But the show turned out to be boring, so he changed his mind and turned [it] off.", "answer1": "The TV", "answer0": "The dishwasher", "is_associative": 0, "correct_answer": "The TV"}, {"index": 166, "sentence": "Fred is the only man still alive who remembers my great-grandfather. [He] is a remarkable man.", "answer1": "My great-grandfather", "answer0": "Fred", "is_associative": 0, "correct_answer": "Fred"}, {"index": 167, "sentence": "Fred is the only man still alive who remembers my great-grandfather. [He] was a remarkable man.", "answer1": "My great-grandfather", "answer0": "Fred", "is_associative": 0, "correct_answer": "My great-grandfather"}, {"index": 168, "sentence": "Fred is the only man alive who still remembers my father as an infant. When Fred first saw my father, [he] was twelve years old.", "answer1": "My father", "answer0": "Fred", "is_associative": 0, "correct_answer": "Fred"}, {"index": 169, "sentence": "Fred is the only man alive who still remembers my father as an infant. When Fred first saw my father, [he] was twelve months old.", "answer1": "My father", "answer0": "Fred", "is_associative": 0, "correct_answer": "My father"}, {"index": 170, "sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.", "answer1": "Yakutsk", "answer0": "Kamchatka", "is_associative": 0, "correct_answer": "Kamchatka"}, {"index": 171, "sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were victorious within weeks.", "answer1": "Yakutsk", "answer0": "Kamchatka", "is_associative": 0, "correct_answer": "Yakutsk"}, {"index": 172, "sentence": "Look! There is a minnow swimming right below that duck! [It] had better get away to safety fast!", "answer1": "The duck", "answer0": "The minnow", "is_associative": 0, "correct_answer": "The minnow"}, {"index": 173, "sentence": "Look! 
There is a shark swimming right below that duck! [It] had better get away to safety fast!", "answer1": "The duck", "answer0": "The shark", "is_associative": 0, "correct_answer": "The duck"}, {"index": 178, "sentence": "The journalists interviewed the stars of the new movie. [They] were very persistent, so the interview lasted for a long time.", "answer1": "The stars", "answer0": "The journalists", "is_associative": 0, "correct_answer": "The journalists"}, {"index": 179, "sentence": "The journalists interviewed the stars of the new movie. [They] were very cooperative, so the interview lasted for a long time.", "answer1": "The stars", "answer0": "The journalists", "is_associative": 0, "correct_answer": "The stars"}, {"index": 186, "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority.", "answer1": "The opponents", "answer0": "The sponsors", "is_associative": 0, "correct_answer": "The sponsors"}, {"index": 187, "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the majority.", "answer1": "The opponents", "answer0": "The sponsors", "is_associative": 0, "correct_answer": "The opponents"}, {"index": 188, "sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .", "answer1": "The chocolate chip cookies", "answer0": "The oatmeal cookies", "is_associative": 0, "correct_answer": "The oatmeal cookies"}, {"index": 189, "sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. 
Next time, we should make fewer of [them] .", "answer1": "The chocolate chip cookies", "answer0": "The oatmeal cookies", "is_associative": 0, "correct_answer": "The chocolate chip cookies"}, {"index": 190, "sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .", "answer1": "chairs", "answer0": "copies of the newsletter", "is_associative": 0, "correct_answer": "copies of the newsletter"}, {"index": 191, "sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply too many of [them] .", "answer1": "chairs", "answer0": "copies of the newsletter", "is_associative": 0, "correct_answer": "chairs"}, {"index": 193, "sentence": "I stuck a pin through a carrot. When I pulled the pin out, [it] had a hole.", "answer1": "The carrot", "answer0": "The pin", "is_associative": 0, "correct_answer": "The carrot"}, {"index": 194, "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because [it] got full of coffee.", "answer1": "The coffee", "answer0": "The pen", "is_associative": 0, "correct_answer": "The pen"}, {"index": 195, "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because [it] got full of ink.", "answer1": "The coffee", "answer0": "The pen", "is_associative": 0, "correct_answer": "The coffee"}, {"index": 196, "sentence": "Steve follows Fred's example in everything. [He] admires him hugely.", "answer1": "Fred", "answer0": "Steve", "is_associative": 0, "correct_answer": "Steve"}, {"index": 197, "sentence": "Steve follows Fred's example in everything. 
[He] influences him hugely.", "answer1": "Fred", "answer0": "Steve", "is_associative": 0, "correct_answer": "Fred"}, {"index": 198, "sentence": "The table won't fit through the doorway because [it] is too wide.", "answer1": "The doorway", "answer0": "The table", "is_associative": 0, "correct_answer": "The table"}, {"index": 199, "sentence": "The table won't fit through the doorway because [it] is too narrow.", "answer1": "The doorway", "answer0": "The table", "is_associative": 0, "correct_answer": "The doorway"}, {"index": 200, "sentence": "Grace was happy to trade me her sweater for my jacket. She thinks [it] looks dowdy on her.", "answer1": "The jacket", "answer0": "The sweater", "is_associative": 0, "correct_answer": "The sweater"}, {"index": 201, "sentence": "Grace was happy to trade me her sweater for my jacket. She thinks [it] looks great on her.", "answer1": "The jacket", "answer0": "The sweater", "is_associative": 0, "correct_answer": "The jacket"}, {"index": 202, "sentence": "John hired Bill to take care of [him] .", "answer1": "Bill", "answer0": "John", "is_associative": 0, "correct_answer": "John"}, {"index": 203, "sentence": "John hired himself out to Bill to take care of [him] .", "answer1": "Bill", "answer0": "John", "is_associative": 0, "correct_answer": "Bill"}, {"index": 204, "sentence": "John promised Bill to leave, so an hour later [he] left.", "answer1": "Bill", "answer0": "John", "is_associative": 0, "correct_answer": "John"}, {"index": 205, "sentence": "John ordered Bill to leave, so an hour later [he] left.", "answer1": "Bill", "answer0": "John", "is_associative": 0, "correct_answer": "Bill"}, {"index": 206, "sentence": "Sam Goodman's biography of the Spartan general Xenophanes conveys a vivid sense of the difficulties [he] faced in his research.", "answer1": "Xenophanes", "answer0": "Goodman", "is_associative": 0, "correct_answer": "Goodman"}, {"index": 207, "sentence": "Sam Goodman's biography of the Spartan general Xenophanes conveys a 
vivid sense of the difficulties [he] faced in his childhood.", "answer1": "Xenophanes", "answer0": "Goodman", "is_associative": 0, "correct_answer": "Xenophanes"}, {"index": 208, "sentence": "Emma's mother had died long ago, and [her] education had been managed by an excellent woman as governess.", "answer1": "Emma's mother", "answer0": "Emma", "is_associative": 0, "correct_answer": "Emma"}, {"index": 209, "sentence": "Emma's mother had died long ago, and [her] place had been taken by an excellent woman as governess.", "answer1": "Emma's mother", "answer0": "Emma", "is_associative": 0, "correct_answer": "Emma's mother"}, {"index": 210, "sentence": "Jane knocked on Susan's door but [she] did not get an answer.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Jane"}, {"index": 211, "sentence": "Jane knocked on Susan's door but [she] did not answer.", "answer1": "Susan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Susan"}, {"index": 212, "sentence": "Joe paid the detective after [he] received the final report on the case.", "answer1": "the detective", "answer0": "Joe", "is_associative": 0, "correct_answer": "Joe"}, {"index": 213, "sentence": "Joe paid the detective after [he] delivered the final report on the case.", "answer1": "the detective", "answer0": "Joe", "is_associative": 0, "correct_answer": "the detective"}, {"index": 214, "sentence": "Beth didn't get angry with Sally, who had cut her off, because [she] stopped and counted to ten.", "answer1": "Sally", "answer0": "Beth", "is_associative": 0, "correct_answer": "Beth"}, {"index": 215, "sentence": "Beth didn't get angry with Sally, who had cut her off, because [she] stopped and apologized.", "answer1": "Sally", "answer0": "Beth", "is_associative": 0, "correct_answer": "Sally"}, {"index": 216, "sentence": "Jim signaled the barman and gestured toward [his] empty glass", "answer1": "The barman", "answer0": "Jim", "is_associative": 0, "correct_answer": "Jim"}, 
{"index": 217, "sentence": "Jim signaled the barman and gestured toward [his] bathroom key.", "answer1": "The barman", "answer0": "Jim", "is_associative": 0, "correct_answer": "The barman"}, {"index": 218, "sentence": "Dan took the rear seat while Bill claimed the front because [his] \"Dibs!\" was slow.", "answer1": "Bill", "answer0": "Dan", "is_associative": 0, "correct_answer": "Dan"}, {"index": 219, "sentence": "Dan took the rear seat while Bill claimed the front because [his] \"Dibs!\" was quicker.", "answer1": "Bill", "answer0": "Dan", "is_associative": 0, "correct_answer": "Bill"}, {"index": 220, "sentence": "Tom said \"Check\" to Ralph as he moved [his] bishop.", "answer1": "Ralph", "answer0": "Tom", "is_associative": 0, "correct_answer": "Tom"}, {"index": 221, "sentence": "Tom said \"Check\" to Ralph as he took [his] bishop.", "answer1": "Ralph", "answer0": "Tom", "is_associative": 0, "correct_answer": "Ralph"}, {"index": 222, "sentence": "As Andrea in the crop duster passed over Susan, [she] could see the landing strip.", "answer1": "Susan", "answer0": "Andrea", "is_associative": 0, "correct_answer": "Andrea"}, {"index": 223, "sentence": "As Andrea in the crop duster passed over Susan, [she] could see the landing gear.", "answer1": "Susan", "answer0": "Andrea", "is_associative": 0, "correct_answer": "Susan"}, {"index": 224, "sentence": "Tom gave Ralph a lift to school so [he] wouldn't have to drive alone.", "answer1": "Ralph", "answer0": "Tom", "is_associative": 0, "correct_answer": "Tom"}, {"index": 225, "sentence": "Tom gave Ralph a lift to school so [he] wouldn't have to walk.", "answer1": "Ralph", "answer0": "Tom", "is_associative": 0, "correct_answer": "Ralph"}, {"index": 226, "sentence": "Bill passed the half-empty plate to John because [he] was full.", "answer1": "John", "answer0": "Bill", "is_associative": 0, "correct_answer": "Bill"}, {"index": 227, "sentence": "Bill passed the half-empty plate to John because [he] was hungry.", "answer1": "John", 
"answer0": "Bill", "is_associative": 0, "correct_answer": "John"}, {"index": 228, "sentence": "Bill passed the gameboy to John because [his] turn was over.", "answer1": "John", "answer0": "Bill", "is_associative": 0, "correct_answer": "Bill"}, {"index": 229, "sentence": "Bill passed the gameboy to John because [his] turn was next.", "answer1": "John", "answer0": "Bill", "is_associative": 0, "correct_answer": "John"}, {"index": 230, "sentence": "The man lifted the boy onto [his] shoulders.", "answer1": "The boy", "answer0": "The man", "is_associative": 0, "correct_answer": "The man"}, {"index": 232, "sentence": "Stretching [her] back, the woman smiled at the girl.", "answer1": "The girl", "answer0": "The woman", "is_associative": 0, "correct_answer": "The woman"}, {"index": 233, "sentence": "Patting [her] back, the woman smiled at the girl.", "answer1": "The girl", "answer0": "The woman", "is_associative": 0, "correct_answer": "The girl"}, {"index": 234, "sentence": "Billy cried because Toby wouldn't accept [his] toy.", "answer1": "Toby", "answer0": "Billy", "is_associative": 0, "correct_answer": "Billy"}, {"index": 235, "sentence": "Billy cried because Toby wouldn't share [his] toy.", "answer1": "Toby", "answer0": "Billy", "is_associative": 0, "correct_answer": "Toby"}, {"index": 236, "sentence": "Lily spoke to Donna, breaking [her] silence.", "answer1": "Donna", "answer0": "Lily", "is_associative": 0, "correct_answer": "Lily"}, {"index": 237, "sentence": "Lily spoke to Donna, breaking [her] concentration.", "answer1": "Donna", "answer0": "Lily", "is_associative": 0, "correct_answer": "Donna"}, {"index": 238, "sentence": "When Tommy dropped his ice cream, Timmy giggled, so father gave [him] a sympathetic look.", "answer1": "Timmy", "answer0": "Tommy", "is_associative": 0, "correct_answer": "Tommy"}, {"index": 239, "sentence": "When Tommy dropped his ice cream, Timmy giggled, so father gave [him] a stern look.", "answer1": "Timmy", "answer0": "Tommy", 
"is_associative": 0, "correct_answer": "Timmy"}, {"index": 240, "sentence": "As Ollie carried Tommy up the long winding steps, [his] legs ached.", "answer1": "Tommy", "answer0": "Ollie", "is_associative": 0, "correct_answer": "Ollie"}, {"index": 241, "sentence": "As Ollie carried Tommy up the long winding steps, [his] legs dangled.", "answer1": "Tommy", "answer0": "Ollie", "is_associative": 0, "correct_answer": "Tommy"}, {"index": 242, "sentence": "The father carried the sleeping boy in [his] arms", "answer1": "The boy", "answer0": "The father", "is_associative": 0, "correct_answer": "The father"}, {"index": 243, "sentence": "The father carried the sleeping boy in [his] bassinet.", "answer1": "The boy", "answer0": "The father", "is_associative": 0, "correct_answer": "The boy"}, {"index": 244, "sentence": "The woman held the girl against [her] chest", "answer1": "The girl", "answer0": "The woman", "is_associative": 0, "correct_answer": "The woman"}, {"index": 245, "sentence": "The woman held the girl against [her] will.", "answer1": "The girl", "answer0": "The woman", "is_associative": 0, "correct_answer": "The girl"}, {"index": 246, "sentence": "Pam's parents came home and found her having sex with her boyfriend, Paul. [They] were furious about it.", "answer1": "Pam and Paul", "answer0": "Pam's parents", "is_associative": 0, "correct_answer": "Pam's parents"}, {"index": 247, "sentence": "Pam's parents came home and found her having sex with her boyfriend, Paul. [They] were embarrassed about it.", "answer1": "Pam and Paul", "answer0": "Pam's parents", "is_associative": 0, "correct_answer": "Pam and Paul"}, {"index": 248, "sentence": "Dr. Adams informed Kate that [she] had retired and presented several options for future treatment.", "answer1": "Kate", "answer0": "Dr. Adams", "is_associative": 0, "correct_answer": "Dr. Adams"}, {"index": 249, "sentence": "Dr. 
Adams informed Kate that [she] had cancer and presented several options for future treatment.", "answer1": "Kate", "answer0": "Dr. Adams", "is_associative": 0, "correct_answer": "Kate"}, {"index": 250, "sentence": "Dan had to stop Bill from toying with the injured bird. [He] is very compassionate.", "answer1": "Bill", "answer0": "Dan", "is_associative": 0, "correct_answer": "Dan"}, {"index": 251, "sentence": "Dan had to stop Bill from toying with the injured bird. [He] is very cruel.", "answer1": "Bill", "answer0": "Dan", "is_associative": 0, "correct_answer": "Bill"}, {"index": 252, "sentence": "George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.", "answer1": "Eric", "answer0": "George", "is_associative": 0, "correct_answer": "George"}, {"index": 253, "sentence": "George got free tickets to the play, but he gave them to Eric, because [he] was particularly eager to see it.", "answer1": "Eric", "answer0": "George", "is_associative": 0, "correct_answer": "Eric"}, {"index": 254, "sentence": "George got free tickets to the play, but he gave them to Eric, because [he] was not particularly eager to see it.", "answer1": "Eric", "answer0": "George", "is_associative": 0, "correct_answer": "George"}, {"index": 255, "sentence": "Jane gave Joan candy because [she] wasn't hungry.", "answer1": "Joan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Jane"}, {"index": 256, "sentence": "Jane gave Joan candy because [she] was hungry.", "answer1": "Joan", "answer0": "Jane", "is_associative": 0, "correct_answer": "Joan"}, {"index": 257, "sentence": "I tried to paint a picture of an orchard, with lemons in the lemon trees, but [they] came out looking more like light bulbs.", "answer1": "lemon trees", "answer0": "lemons", "is_associative": 0, "correct_answer": "lemons"}, {"index": 258, "sentence": "I tried to paint a picture of an orchard, with lemons in the lemon trees, but [they] came out looking more like 
telephone poles.", "answer1": "lemon trees", "answer0": "lemons", "is_associative": 0, "correct_answer": "lemon trees"}, {"index": 259, "sentence": "James asked Robert for a favor but [he] was refused.", "answer1": "Robert", "answer0": "James", "is_associative": 0, "correct_answer": "James"}, {"index": 260, "sentence": "James asked Robert for a favor but [he] refused.", "answer1": "Robert", "answer0": "James", "is_associative": 0, "correct_answer": "Robert"}, {"index": 261, "sentence": "Kirilov ceded the presidency to Shatov because [he] was less popular.", "answer1": "Shatov", "answer0": "Kirilov", "is_associative": 0, "correct_answer": "Kirilov"}, {"index": 262, "sentence": "Kirilov ceded the presidency to Shatov because [he] was more popular.", "answer1": "Shatov", "answer0": "Kirilov", "is_associative": 0, "correct_answer": "Shatov"}, {"index": 263, "sentence": "Emma did not pass the ball to Janie although [she] saw that she was open.", "answer1": "Janie", "answer0": "Emma", "is_associative": 0, "correct_answer": "Emma"}, {"index": 264, "sentence": "Emma did not pass the ball to Janie although [she] was open.", "answer1": "Janie", "answer0": "Emma", "is_associative": 0, "correct_answer": "Janie"}, {"index": 265, "sentence": "I put the butterfly wing on the table and [it] broke.", "answer1": "The table", "answer0": "The butterfly wing", "is_associative": 0, "correct_answer": "The butterfly wing"}, {"index": 267, "sentence": "Madonna fired her trainer because [she] couldn't stand her boyfriend.", "answer1": "The trainer", "answer0": "Madonna", "is_associative": 0, "correct_answer": "Madonna"}, {"index": 268, "sentence": "Madonna fired her trainer because [she] slept with her boyfriend.", "answer1": "The trainer", "answer0": "Madonna", "is_associative": 0, "correct_answer": "The trainer"}, {"index": 269, "sentence": "Madonna fired her trainer because she slept with [her] boyfriend.", "answer1": "The trainer", "answer0": "Madonna", "is_associative": 0, 
"correct_answer": "Madonna"}, {"index": 270, "sentence": "Madonna fired her trainer because she couldn't stand [her] boyfriend.", "answer1": "The trainer", "answer0": "Madonna", "is_associative": 0, "correct_answer": "The trainer"}, {"index": 271, "sentence": "Carol believed that Rebecca suspected that [she] had stolen the watch.", "answer1": "Rebecca", "answer0": "Carol", "is_associative": 0, "correct_answer": "Carol"}, {"index": 272, "sentence": "Carol believed that Rebecca regretted that [she] had stolen the watch.", "answer1": "Rebecca", "answer0": "Carol", "is_associative": 0, "correct_answer": "Rebecca"}, {"index": 53, "sentence": "The painting in Mark's living room shows an oak tree. [It] is to the right of a house.", "answer1": "The oak tree", "answer0": "The painting", "is_associative": 1, "correct_answer": "The oak tree"}, {"index": 54, "sentence": "There is a gap in the wall. You can see the garden through [it] .", "answer1": "The wall", "answer0": "The gap", "is_associative": 1, "correct_answer": "The gap"}, {"index": 55, "sentence": "There is a gap in the wall. You can see the garden behind [it] .", "answer1": "The wall", "answer0": "The gap", "is_associative": 1, "correct_answer": "The wall"}, {"index": 58, "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. 
Luckily, [it] was short, so it worked out.", "answer1": "The train", "answer0": "The meeting", "is_associative": 1, "correct_answer": "The meeting"}, {"index": 72, "sentence": "I couldn't put the pot on the shelf because [it] was too tall.", "answer1": "The shelf", "answer0": "The pot", "is_associative": 1, "correct_answer": "The pot"}, {"index": 74, "sentence": "I'm sure that my map will show this building; [it] is very good.", "answer1": "The building", "answer0": "The map", "is_associative": 1, "correct_answer": "The map"}, {"index": 75, "sentence": "I'm sure that my map will show this building; [it] is very famous.", "answer1": "The building", "answer0": "The map", "is_associative": 1, "correct_answer": "The building"}, {"index": 87, "sentence": "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because [it] was cooler.", "answer1": "The spot under the tree", "answer0": "The dog", "is_associative": 1, "correct_answer": "The spot under the tree"}, {"index": 98, "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.", "answer1": "The chewing gum", "answer0": "The key", "is_associative": 1, "correct_answer": "The key"}, {"index": 103, "sentence": "In the storm, the tree fell down and crashed through the roof of my house. Now, I have to get [it] repaired.", "answer1": "The roof", "answer0": "The tree", "is_associative": 1, "correct_answer": "The roof"}, {"index": 109, "sentence": "John was jogging through the park when he saw a man juggling watermelons. 
[He] was very impressive.", "answer1": "The juggler", "answer0": "John", "is_associative": 1, "correct_answer": "The juggler"}, {"index": 112, "sentence": "Sam and Amy are passionately in love, but Amy's parents are unhappy about it, because [they] are fifteen.", "answer1": "Amy's parents", "answer0": "Sam and Amy", "is_associative": 1, "correct_answer": "Sam and Amy"}, {"index": 116, "sentence": "Joe has sold his house and bought a new one a few miles away. He will be moving out of [it] on Thursday.", "answer1": "The new house", "answer0": "The old house", "is_associative": 1, "correct_answer": "The old house"}, {"index": 117, "sentence": "Joe has sold his house and bought a new one a few miles away. He will be moving into [it] on Thursday.", "answer1": "The new house", "answer0": "The old house", "is_associative": 1, "correct_answer": "The new house"}, {"index": 118, "sentence": "Many people start to read Paul's books and can't put them down. [They] are gripped because Paul writes so well.", "answer1": "Paul's books", "answer0": "People", "is_associative": 1, "correct_answer": "People"}, {"index": 119, "sentence": "Many people start to read Paul's books and can't put them down. [They] are popular because Paul writes so well.", "answer1": "Paul's books", "answer0": "People", "is_associative": 1, "correct_answer": "Paul's books"}, {"index": 120, "sentence": "Mary took out her flute and played one of her favorite pieces. 
She has had [it] since she was a child.", "answer1": "The piece", "answer0": "The flute", "is_associative": 1, "correct_answer": "The flute"}, {"index": 136, "sentence": "The path to the lake was blocked, so we couldn't use [it] .", "answer1": "The lake", "answer0": "The path", "is_associative": 1, "correct_answer": "The path"}, {"index": 137, "sentence": "The path to the lake was blocked, so we couldn't reach [it] .", "answer1": "The lake", "answer0": "The path", "is_associative": 1, "correct_answer": "The lake"}, {"index": 138, "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, [it] was out.", "answer1": "The cloud", "answer0": "The sun", "is_associative": 1, "correct_answer": "The sun"}, {"index": 146, "sentence": "Fred and Alice had very warm down coats, but [they] were not prepared for the cold in Alaska.", "answer1": "coats", "answer0": "Fred and Alice", "is_associative": 1, "correct_answer": "Fred and Alice"}, {"index": 147, "sentence": "Fred and Alice had very warm down coats, but [they] were not enough for the cold in Alaska.", "answer1": "coats", "answer0": "Fred and Alice", "is_associative": 1, "correct_answer": "coats"}, {"index": 155, "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to guard [them] .", "answer1": "The chickens", "answer0": "The foxes", "is_associative": 1, "correct_answer": "The chickens"}, {"index": 158, "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. He opened [them] when the wind stopped.", "answer1": "His hands", "answer0": "His eyes", "is_associative": 1, "correct_answer": "His eyes"}, {"index": 174, "sentence": "Archaeologists have concluded that humans lived in Laputa 20,000 years ago. 
[They] hunted for evidence on the river banks.", "answer1": "Prehistoric humans", "answer0": "Archaeologists", "is_associative": 1, "correct_answer": "Archaeologists"}, {"index": 175, "sentence": "Archaeologists have concluded that humans lived in Laputa 20,000 years ago. [They] hunted for deer on the river banks.", "answer1": "Prehistoric humans", "answer0": "Archaeologists", "is_associative": 1, "correct_answer": "Prehistoric humans"}, {"index": 176, "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. [They] began two years ago.", "answer1": "The fish", "answer0": "The scientists", "is_associative": 1, "correct_answer": "The scientists"}, {"index": 177, "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. [They] appeared two years ago.", "answer1": "The fish", "answer0": "The scientists", "is_associative": 1, "correct_answer": "The fish"}, {"index": 180, "sentence": "The police arrested all of the gang members. [They] were trying to stop the drug trade in the neighborhood.", "answer1": "The gang members", "answer0": "The police", "is_associative": 1, "correct_answer": "The police"}, {"index": 181, "sentence": "The police arrested all of the gang members. [They] were trying to run the drug trade in the neighborhood.", "answer1": "The gang members", "answer0": "The police", "is_associative": 1, "correct_answer": "The gang members"}, {"index": 182, "sentence": "I put the cake away in the refrigerator. [It] has a lot of butter in it.", "answer1": "The refrigerator", "answer0": "The cake", "is_associative": 1, "correct_answer": "The cake"}, {"index": 183, "sentence": "I put the cake away in the refrigerator. 
[It] has a lot of leftovers in it.", "answer1": "The refrigerator", "answer0": "The cake", "is_associative": 1, "correct_answer": "The refrigerator"}, {"index": 184, "sentence": "Sam broke both his ankles and he's walking with crutches. But a month or so from now [they] should be better.", "answer1": "The crutches", "answer0": "The ankles", "is_associative": 1, "correct_answer": "The ankles"}, {"index": 185, "sentence": "Sam broke both his ankles and he's walking with crutches. But a month or so from now [they] should be unnecessary.", "answer1": "The crutches", "answer0": "The ankles", "is_associative": 1, "correct_answer": "The crutches"}, {"index": 192, "sentence": "I stuck a pin through a carrot. When I pulled the pin out, [it] left a hole.", "answer1": "The carrot", "answer0": "The pin", "is_associative": 1, "correct_answer": "The pin"}, {"index": 231, "sentence": "The man lifted the boy onto [his] bunk bed.", "answer1": "The boy", "answer0": "The man", "is_associative": 1, "correct_answer": "The boy"}, {"index": 266, "sentence": "I put the heavy book on the table and [it] broke.", "answer1": "The table", "answer0": "The heavy book", "is_associative": 1, "correct_answer": "The table"}] \ No newline at end of file diff --git a/WSC_child_problem.json b/WSC_child_problem.json new file mode 100644 index 00000000000000..33336d7e6cdef0 --- /dev/null +++ b/WSC_child_problem.json @@ -0,0 +1,11443 @@ +[ + { + "index": 0, + "sentences": [ + { + "sentence": "The policemen refused the demonstrators a permit because the _ feared violence.", + "answer1": [ + "demonstrators" + ], + "answer0": [ + "policemen" + ], + "correct_answer": [ + "policemen" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "demonstrators", + 0.4118339717388153 + ] + ], + "score": 0 + } + ] + }, + { + "index": 1, + "sentences": [ + { + "sentence": "The policemen refused the demonstrators a permit because the _ advocated violence.", + "answer1": [ + "demonstrators" + ], + "answer0": [ + 
"policemen" + ], + "correct_answer": [ + "demonstrators" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "demonstrators", + 0.457832932472229 + ] + ], + "score": 1 + } + ] + }, + { + "index": 2, + "sentences": [ + { + "sentence": "The trophy doesn't fit into the brown suitcase because the _ is too large.", + "answer1": [ + "suitcase" + ], + "answer0": [ + "trophy" + ], + "correct_answer": [ + "trophy" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "suitcase", + 0.21058858931064606 + ] + ], + "score": 0 + } + ] + }, + { + "index": 3, + "sentences": [ + { + "sentence": "The trophy doesn't fit into the brown suitcase because the _ is too small.", + "answer1": [ + "suitcase" + ], + "answer0": [ + "trophy" + ], + "correct_answer": [ + "suitcase" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "suitcase", + 0.11347327381372452 + ], + [ + "trophy", + 0.04299129545688629 + ] + ], + "score": 1 + } + ] + }, + { + "index": 4, + "sentences": [ + { + "sentence": "Susan made sure to thank Alan for all the help _ had recieved.", + "answer1": [ + "Alan", + "he" + ], + "answer0": [ + "Susan", + "she" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8059180378913879 + ], + [ + "she", + 0.10031098127365112 + ], + [ + "alan", + 0.006458722520619631 + ] + ], + "score": 0 + }, + { + "sentence": "Anthony made sure to thank Donna for all the help _ had recieved.", + "answer1": [ + "Donna", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.8589648604393005 + ], + [ + "he", + 0.035839419811964035 + ], + [ + "donna", + 0.009961705654859543 + ], + [ + "anthony", + 0.0026284847408533096 + ] + ], + "score": 0 + } + ] + }, + { + "index": 5, + "sentences": [ + { + "sentence": "Wendy made sure to thank David for all the help _ had given.", + "answer1": [ + "David", + "he" + ], + "answer0": [ + 
"Wendy", + "she" + ], + "correct_answer": [ + "David", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.9703136682510376 + ], + [ + "she", + 0.012762513943016529 + ], + [ + "david", + 0.002392230788245797 + ] + ], + "score": 1 + }, + { + "sentence": "Daniel made sure to thank Wendy for all the help _ had given.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9681608080863953 + ], + [ + "he", + 0.01262774970382452 + ], + [ + "wendy", + 0.003302064025774598 + ], + [ + "daniel", + 0.00175810931250453 + ] + ], + "score": 1 + } + ] + }, + { + "index": 6, + "sentences": [ + { + "sentence": "Daniel tried to call Lisa on the phone, but _ wasn't successful.", + "answer1": [ + "Lisa", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.9892734885215759 + ], + [ + "she", + 0.005904461722820997 + ], + [ + "daniel", + 0.0004344168701209128 + ] + ], + "score": 1 + }, + { + "sentence": "Donna tried to call David on the phone, but _ wasn't successful.", + "answer1": [ + "David", + "he" + ], + "answer0": [ + "Donna", + "she" + ], + "correct_answer": [ + "Donna", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.9855420589447021 + ], + [ + "he", + 0.006738866213709116 + ], + [ + "david", + 0.0012104340130463243 + ] + ], + "score": 1 + } + ] + }, + { + "index": 7, + "sentences": [ + { + "sentence": "Barbara tried to call Charles on the phone, but _ wasn't available.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Barbara", + "she" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8587782979011536 + ], + [ + "charles", + 0.1335594803094864 + ], + [ + "she", + 0.000997341237962246 + ] + ], + "score": 1 + }, + { + 
"sentence": "Warren tried to call Laura on the phone, but _ wasn't available.", + "answer1": [ + "Laura", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9781970381736755 + ], + [ + "laura", + 0.01614491641521454 + ], + [ + "he", + 0.00039897472015582025 + ] + ], + "score": 1 + } + ] + }, + { + "index": 8, + "sentences": [ + { + "sentence": "The lawyer asked the witness a question, but the _ was reluctant to repeat it.", + "answer1": [ + "witness" + ], + "answer0": [ + "lawyer" + ], + "correct_answer": [ + "lawyer" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "witness", + 0.6564255952835083 + ], + [ + "lawyer", + 0.012273530475795269 + ] + ], + "score": 0 + } + ] + }, + { + "index": 9, + "sentences": [ + { + "sentence": "The lawyer asked the witness a question, but the _ was reluctant to answer it.", + "answer1": [ + "witness" + ], + "answer0": [ + "lawyer" + ], + "correct_answer": [ + "witness" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "witness", + 0.8160716891288757 + ] + ], + "score": 1 + } + ] + }, + { + "index": 10, + "sentences": [ + { + "sentence": "The truck zoomed by the bus because the _ was going so fast.", + "answer1": [ + "bus" + ], + "answer0": [ + "truck" + ], + "correct_answer": [ + "truck" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "bus", + 0.2043217420578003 + ], + [ + "truck", + 0.18831151723861694 + ] + ], + "score": 0 + } + ] + }, + { + "index": 11, + "sentences": [ + { + "sentence": "The truck zoomed by the bus because the _ was going so slow.", + "answer1": [ + "bus" + ], + "answer0": [ + "truck" + ], + "correct_answer": [ + "bus" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "bus", + 0.239765927195549 + ], + [ + "truck", + 0.16296519339084625 + ] + ], + "score": 1 + } + ] + }, + { + "index": 12, + "sentences": [ + { + "sentence": "Helen felt vindicated when her longtime rival 
George revealed that _ was the winner of the competition.", + "answer1": [ + "George", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Helen", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.7605838775634766 + ], + [ + "she", + 0.1281837671995163 + ], + [ + "helen", + 0.03044680505990982 + ], + [ + "george", + 0.016294078901410103 + ] + ], + "score": 0 + }, + { + "sentence": "George felt vindicated when his longtime rival Sandra revealed that _ was the winner of the competition.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "George", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.6007376313209534 + ], + [ + "he", + 0.27373355627059937 + ], + [ + "george", + 0.06088083237409592 + ], + [ + "sandra", + 0.002873124787583947 + ] + ], + "score": 0 + } + ] + }, + { + "index": 13, + "sentences": [ + { + "sentence": "Edward felt crushed when his longtime rival Barbara revealed that _ was the winner of the competition.", + "answer1": [ + "Barbara", + "she" + ], + "answer0": [ + "Edward", + "he" + ], + "correct_answer": [ + "Barbara", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.7010877132415771 + ], + [ + "he", + 0.1804790198802948 + ], + [ + "edward", + 0.008347589522600174 + ], + [ + "barbara", + 0.0056170192547142506 + ] + ], + "score": 1 + }, + { + "sentence": "Nancy felt crushed when her longtime rival Tony revealed that _ was the winner of the competition.", + "answer1": [ + "Tony", + "he" + ], + "answer0": [ + "Nancy", + "she" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6629055142402649 + ], + [ + "she", + 0.231082022190094 + ], + [ + "tony", + 0.03906760364770889 + ], + [ + "nancy", + 0.013350573368370533 + ] + ], + "score": 1 + } + ] + }, + { + "index": 14, + "sentences": [ + { + "sentence": "The man couldn't lift his daughter 
because _ was so weak.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.966475784778595 + ], + [ + "he", + 0.012883448973298073 + ] + ], + "score": 0 + } + ] + }, + { + "index": 15, + "sentences": [ + { + "sentence": "The man couldn't lift his daughter because _ was so heavy.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9816145300865173 + ], + [ + "he", + 0.0035426830872893333 + ] + ], + "score": 1 + } + ] + }, + { + "index": 16, + "sentences": [ + { + "sentence": "The ball crashed right through the table because the _ was made of steel.", + "answer1": [ + "table" + ], + "answer0": [ + "ball" + ], + "correct_answer": [ + "ball" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "table", + 0.3629739284515381 + ], + [ + "ball", + 0.08679845184087753 + ] + ], + "score": 0 + } + ] + }, + { + "index": 17, + "sentences": [ + { + "sentence": "The ball crashed right through the table because the _ was made of styrofoam.", + "answer1": [ + "table" + ], + "answer0": [ + "ball" + ], + "correct_answer": [ + "table" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "table", + 0.4074442386627197 + ], + [ + "ball", + 0.10976675152778625 + ] + ], + "score": 1 + } + ] + }, + { + "index": 18, + "sentences": [ + { + "sentence": "Tony couldn't see the stage with Jennifer in front of him because _ is so short.", + "answer1": [ + "Jennifer", + "she" + ], + "answer0": [ + "Tony", + "he" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.5132592916488647 + ], + [ + "jennifer", + 0.40244361758232117 + ], + [ + "he", + 0.013166015036404133 + ], + [ + "tony", + 0.003285476006567478 + ] + ], + "score": 0 + }, + { + "sentence": "Mandy couldn't see the stage with Charles in front of her because _ is so 
short.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Mandy", + "she" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.46862319111824036 + ], + [ + "she", + 0.21981821954250336 + ], + [ + "charles", + 0.14085616171360016 + ] + ], + "score": 0 + } + ] + }, + { + "index": 19, + "sentences": [ + { + "sentence": "Sandra couldn't see the stage with Jason in front of her because _ is so tall.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Jason", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.7260995507240295 + ], + [ + "jason", + 0.2526448965072632 + ], + [ + "she", + 0.006947671994566917 + ], + [ + "sandra", + 0.0005759962368756533 + ] + ], + "score": 1 + }, + { + "sentence": "Richard couldn't see the stage with Margaret in front of him because _ is so tall.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Richard", + "he" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8138208389282227 + ], + [ + "margaret", + 0.1297387033700943 + ], + [ + "he", + 0.02292880043387413 + ], + [ + "richard", + 0.0023241350427269936 + ] + ], + "score": 1 + } + ] + }, + { + "index": 20, + "sentences": [ + { + "sentence": "Lisa threw his schoolbag down to Jason after _ reached the top of the stairs.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.2535589635372162 + ], + [ + "she", + 0.0745818242430687 + ], + [ + "jason", + 0.034438613802194595 + ] + ], + "score": 0 + }, + { + "sentence": "Zack threw his schoolbag down to Sarah after _ reached the top of the stairs.", + "answer1": [ + "Sarah", + "she" + ], + "answer0": [ + "Zack", + "he" + ], + "correct_answer": [ + "Zack", + "he" + ], + "adjacent_ref": false, + 
"predict_answer": [ + [ + "she", + 0.2084319144487381 + ], + [ + "he", + 0.10264766216278076 + ], + [ + "sarah", + 0.0038962233811616898 + ] + ], + "score": 0 + } + ] + }, + { + "index": 21, + "sentences": [ + { + "sentence": "Joseph threw his schoolbag down to Emma after _ reached the bottom of the stairs.", + "answer1": [ + "Emma", + "she" + ], + "answer0": [ + "Joseph", + "he" + ], + "correct_answer": [ + "Emma", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.23867610096931458 + ], + [ + "he", + 0.09107589721679688 + ], + [ + "emma", + 0.021801337599754333 + ] + ], + "score": 1 + }, + { + "sentence": "Anna threw his schoolbag down to Anthony after _ reached the bottom of the stairs.", + "answer1": [ + "Anthony", + "he" + ], + "answer0": [ + "Anna", + "she" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.3054933547973633 + ], + [ + "she", + 0.06062417849898338 + ], + [ + "anthony", + 0.00862197671085596 + ] + ], + "score": 1 + } + ] + }, + { + "index": 22, + "sentences": [ + { + "sentence": "Although they ran at about the same speed, John beat Vivian because _ had such a good start.", + "answer1": [ + "Vivian", + "she" + ], + "answer0": [ + "John", + "he" + ], + "correct_answer": [ + "John", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4357670843601227 + ], + [ + "she", + 0.4087676405906677 + ], + [ + "vivian", + 0.028013162314891815 + ], + [ + "john", + 0.007432900369167328 + ] + ], + "score": 1 + }, + { + "sentence": "Although they ran at about the same speed, Barbara beat Tony because _ had such a good start.", + "answer1": [ + "Tony", + "he" + ], + "answer0": [ + "Barbara", + "she" + ], + "correct_answer": [ + "Barbara", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5490840673446655 + ], + [ + "she", + 0.2430657595396042 + ], + [ + "tony", + 0.01997600682079792 + ], + [ + "barbara", + 0.004922997672110796 + 
] + ], + "score": 0 + } + ] + }, + { + "index": 23, + "sentences": [ + { + "sentence": "Although they ran at about the same speed, George beat Nancy because _ had such a bad start.", + "answer1": [ + "Nancy", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "Nancy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.6856573820114136 + ], + [ + "he", + 0.16068877279758453 + ], + [ + "nancy", + 0.027285361662507057 + ], + [ + "george", + 0.019552595913410187 + ] + ], + "score": 1 + }, + { + "sentence": "Although they ran at about the same speed, Sue beat Edward because _ had such a bad start.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Edward", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6439886689186096 + ], + [ + "she", + 0.201904296875 + ], + [ + "edward", + 0.05457816645503044 + ], + [ + "sue", + 0.00406282115727663 + ] + ], + "score": 1 + } + ] + }, + { + "index": 24, + "sentences": [ + { + "sentence": "The sculpture rolled off the shelf because the _ wasn't anchored.", + "answer1": [ + "shelf" + ], + "answer0": [ + "sculpture" + ], + "correct_answer": [ + "sculpture" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "shelf", + 0.049739278852939606 + ] + ], + "score": 0 + } + ] + }, + { + "index": 25, + "sentences": [ + { + "sentence": "The sculpture rolled off the shelf because the _ wasn't level.", + "answer1": [ + "shelf" + ], + "answer0": [ + "sculpture" + ], + "correct_answer": [ + "shelf" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "shelf", + 0.1008094847202301 + ] + ], + "score": 1 + } + ] + }, + { + "index": 26, + "sentences": [ + { + "sentence": "Betty's drawing was hung just above Warren's and _'s drawing did look much better with another one below it.", + "answer1": [ + "Warren" + ], + "answer0": [ + "Betty" + ], + "correct_answer": [ + "Betty" + ], + "adjacent_ref": false, + 
"predict_answer": [ + [ + "warren", + 0.11010950058698654 + ], + [ + "betty", + 0.07763160765171051 + ] + ], + "score": 0 + }, + { + "sentence": "Edward's drawing was hung just above Wendy's and _'s drawing did look much better with another one below it.", + "answer1": [ + "Wendy" + ], + "answer0": [ + "Edward" + ], + "correct_answer": [ + "Edward" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "edward", + 0.14753200113773346 + ], + [ + "wendy", + 0.021199282258749008 + ] + ], + "score": 1 + } + ] + }, + { + "index": 27, + "sentences": [ + { + "sentence": "Paul's drawing was hung just above Anna's and _'s drawing did look much better with another one above it.", + "answer1": [ + "Anna" + ], + "answer0": [ + "Paul" + ], + "correct_answer": [ + "Anna" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "anna", + 0.39766207337379456 + ], + [ + "paul", + 0.17563124001026154 + ] + ], + "score": 1 + }, + { + "sentence": "Lisa's drawing was hung just above Anthony's and _'s drawing did look much better with another one above it.", + "answer1": [ + "Anthony" + ], + "answer0": [ + "Lisa" + ], + "correct_answer": [ + "Anthony" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "anthony", + 0.20052659511566162 + ], + [ + "lisa", + 0.06912633776664734 + ] + ], + "score": 1 + } + ] + }, + { + "index": 28, + "sentences": [ + { + "sentence": "Charles did a lot better than his good friend Nancy on the test because _ had studied so hard.", + "answer1": [ + "Nancy", + "she" + ], + "answer0": [ + "Charles", + "he" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4398093521595001 + ], + [ + "she", + 0.15170268714427948 + ], + [ + "charles", + 0.08861381560564041 + ], + [ + "nancy", + 0.08683475106954575 + ] + ], + "score": 1 + }, + { + "sentence": "Anna did a lot better than her good friend Jason on the test because _ had studied so hard.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + 
"Anna", + "she" + ], + "correct_answer": [ + "Anna", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5284304618835449 + ], + [ + "she", + 0.1605355590581894 + ], + [ + "jason", + 0.08630751073360443 + ], + [ + "anna", + 0.051893241703510284 + ] + ], + "score": 0 + } + ] + }, + { + "index": 29, + "sentences": [ + { + "sentence": "Linda did a lot worse than her good friend Thomas on the test because _ had studied so hard.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.4308488368988037 + ], + [ + "he", + 0.23555852472782135 + ], + [ + "linda", + 0.10451061278581619 + ], + [ + "thomas", + 0.00802362896502018 + ] + ], + "score": 0 + }, + { + "sentence": "Daniel did a lot worse than his good friend Wendy on the test because _ had studied so hard.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.6460021734237671 + ], + [ + "he", + 0.13338574767112732 + ], + [ + "wendy", + 0.08289165794849396 + ], + [ + "daniel", + 0.04040371626615524 + ] + ], + "score": 1 + } + ] + }, + { + "index": 30, + "sentences": [ + { + "sentence": "The doctors arrived after the police because the _ were coming from so far away.", + "answer1": [ + "police" + ], + "answer0": [ + "doctors" + ], + "correct_answer": [ + "doctors" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 31, + "sentences": [ + { + "sentence": "The doctors arrived before the police because the _ were coming from so far away.", + "answer1": [ + "police" + ], + "answer0": [ + "doctors" + ], + "correct_answer": [ + "police" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 32, + "sentences": [ + { + "sentence": "Tim was upset with Barbara 
because the toaster _ had bought from her didn't work.", + "answer1": [ + "Barbara", + "she" + ], + "answer0": [ + "Tim", + "he" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5925723910331726 + ], + [ + "barbara", + 0.10750548541545868 + ], + [ + "tim", + 0.036800283938646317 + ], + [ + "she", + 0.034948937594890594 + ] + ], + "score": 1 + }, + { + "sentence": "Betty was upset with Daniel because the toaster _ had bought from him didn't work.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Betty", + "she" + ], + "correct_answer": [ + "Betty", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.5441405177116394 + ], + [ + "betty", + 0.3130474388599396 + ], + [ + "he", + 0.015271657146513462 + ], + [ + "daniel", + 0.014954675920307636 + ] + ], + "score": 1 + } + ] + }, + { + "index": 33, + "sentences": [ + { + "sentence": "Joseph was upset with Sandra because the toaster _ had sold him didn't work.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Joseph", + "he" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.3242930769920349 + ], + [ + "sandra", + 0.06303571909666061 + ], + [ + "joseph", + 0.01616159826517105 + ] + ], + "score": 1 + }, + { + "sentence": "Sue was upset with Brian because the toaster _ had sold her didn't work.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Brian", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.4483914077281952 + ], + [ + "brian", + 0.14752180874347687 + ], + [ + "sue", + 0.04900128394365311 + ], + [ + "she", + 0.014528470113873482 + ] + ], + "score": 1 + } + ] + }, + { + "index": 34, + "sentences": [ + { + "sentence": "Linda yelled at William because _ was so upset.", + "answer1": [ + "William", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Linda", 
+ "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8594950437545776 + ], + [ + "she", + 0.10877512395381927 + ], + [ + "william", + 0.012823620811104774 + ] + ], + "score": 0 + }, + { + "sentence": "Warren yelled at Wendy because _ was so upset.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Warren", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.8749997019767761 + ], + [ + "he", + 0.09160665422677994 + ], + [ + "wendy", + 0.007091708946973085 + ] + ], + "score": 0 + } + ] + }, + { + "index": 35, + "sentences": [ + { + "sentence": "Sandra comforted Daniel because _ was so upset.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.946452796459198 + ], + [ + "she", + 0.029977431520819664 + ], + [ + "daniel", + 0.014466170221567154 + ] + ], + "score": 1 + }, + { + "sentence": "Tim comforted Anna because _ was so upset.", + "answer1": [ + "Anna", + "she" + ], + "answer0": [ + "Tim", + "he" + ], + "correct_answer": [ + "Anna", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9258339405059814 + ], + [ + "anna", + 0.03597695752978325 + ], + [ + "he", + 0.024506375193595886 + ] + ], + "score": 1 + } + ] + }, + { + "index": 36, + "sentences": [ + { + "sentence": "The sack had been placed above the bag, so the _ had to be moved first.", + "answer1": [ + "bag" + ], + "answer0": [ + "sack" + ], + "correct_answer": [ + "sack" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "bag", + 0.14206381142139435 + ], + [ + "sack", + 0.12494820356369019 + ] + ], + "score": 0 + } + ] + }, + { + "index": 37, + "sentences": [ + { + "sentence": "The sack had been placed below the bag, so the _ had to be moved first.", + "answer1": [ + "bag" + ], + "answer0": [ + "sack" + ], + "correct_answer": [ + "bag" + ], + 
"adjacent_ref": true, + "predict_answer": [ + [ + "bag", + 0.12820377945899963 + ], + [ + "sack", + 0.12242510914802551 + ] + ], + "score": 1 + } + ] + }, + { + "index": 38, + "sentences": [ + { + "sentence": "Laura envies Warren although _ is very successful.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Laura", + "she" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5644444227218628 + ], + [ + "warren", + 0.27615460753440857 + ], + [ + "she", + 0.07276368141174316 + ], + [ + "laura", + 0.003620448987931013 + ] + ], + "score": 0 + }, + { + "sentence": "Daniel envies Emma although _ is very successful.", + "answer1": [ + "Emma", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.4373820424079895 + ], + [ + "he", + 0.3135783076286316 + ], + [ + "daniel", + 0.04151993989944458 + ], + [ + "emma", + 0.02725609578192234 + ] + ], + "score": 0 + } + ] + }, + { + "index": 39, + "sentences": [ + { + "sentence": "Jason envies Susan because _ is very successful.", + "answer1": [ + "Susan", + "she" + ], + "answer0": [ + "Jason", + "he" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9335176944732666 + ], + [ + "susan", + 0.04264418035745621 + ], + [ + "he", + 0.009751184843480587 + ] + ], + "score": 1 + }, + { + "sentence": "Sandra envies Zack because _ is very successful.", + "answer1": [ + "Zack", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Zack", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8800366520881653 + ], + [ + "zack", + 0.09258320927619934 + ], + [ + "she", + 0.007650280836969614 + ] + ], + "score": 1 + } + ] + }, + { + "index": 40, + "sentences": [ + { + "sentence": "The older students were bullying the younger ones, so we punished the _ students .", + 
"answer1": [ + "younger" + ], + "answer0": [ + "older" + ], + "correct_answer": [ + "older" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "younger", + 0.5788522362709045 + ], + [ + "older", + 0.3715261220932007 + ] + ], + "score": 0 + } + ] + }, + { + "index": 41, + "sentences": [ + { + "sentence": "The older students were bullying the younger ones, so we rescued the _ students .", + "answer1": [ + "younger" + ], + "answer0": [ + "older" + ], + "correct_answer": [ + "younger" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "younger", + 0.459501713514328 + ], + [ + "older", + 0.3885803818702698 + ] + ], + "score": 1 + } + ] + }, + { + "index": 42, + "sentences": [ + { + "sentence": "I poured water from the bottle into the cup until the _ was empty.", + "answer1": [ + "cup" + ], + "answer0": [ + "bottle" + ], + "correct_answer": [ + "bottle" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "cup", + 0.16094626486301422 + ], + [ + "bottle", + 0.10395961999893188 + ] + ], + "score": 0 + } + ] + }, + { + "index": 43, + "sentences": [ + { + "sentence": "I poured water from the bottle into the cup until the _ was full.", + "answer1": [ + "cup" + ], + "answer0": [ + "bottle" + ], + "correct_answer": [ + "cup" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "cup", + 0.18986396491527557 + ], + [ + "bottle", + 0.03942202031612396 + ] + ], + "score": 1 + } + ] + }, + { + "index": 44, + "sentences": [ + { + "sentence": "William knows all about Sue's personal problems because _ is nosy.", + "answer1": [ + "Sue", + "she" + ], + "answer0": [ + "William", + "he" + ], + "correct_answer": [ + "William", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.4617686867713928 + ], + [ + "he", + 0.31756243109703064 + ], + [ + "sue", + 0.10323674231767654 + ], + [ + "william", + 0.01918354444205761 + ] + ], + "score": 0 + }, + { + "sentence": "Helen knows all about Warren's personal problems because _ is nosy.", + "answer1": [ + 
"Warren", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Helen", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.37865328788757324 + ], + [ + "warren", + 0.3408011496067047 + ], + [ + "she", + 0.21000434458255768 + ], + [ + "helen", + 0.012840399518609047 + ] + ], + "score": 0 + } + ] + }, + { + "index": 45, + "sentences": [ + { + "sentence": "Emma knows all about Warren's personal problems because _ is indiscreet.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Warren", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "warren", + 0.6146198511123657 + ], + [ + "he", + 0.289546400308609 + ], + [ + "she", + 0.02142639271914959 + ] + ], + "score": 1 + }, + { + "sentence": "Alan knows all about Anna's personal problems because _ is indiscreet.", + "answer1": [ + "Anna", + "she" + ], + "answer0": [ + "Alan", + "he" + ], + "correct_answer": [ + "Anna", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.49282529950141907 + ], + [ + "anna", + 0.32425856590270996 + ], + [ + "he", + 0.07345326244831085 + ], + [ + "alan", + 0.017393626272678375 + ] + ], + "score": 1 + } + ] + }, + { + "index": 46, + "sentences": [ + { + "sentence": "John explained his theory to Wendy but _ couldn't convince her.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "John", + "he" + ], + "correct_answer": [ + "John", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.9100954532623291 + ], + [ + "john", + 0.017341775819659233 + ], + [ + "wendy", + 0.004638417158275843 + ] + ], + "score": 1 + }, + { + "sentence": "Wendy explained her theory to Warren but _ couldn't convince him.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Wendy", + "she" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.9247599244117737 + ], + [ + "wendy", + 
0.0049832225777208805 + ] + ], + "score": 1 + } + ] + }, + { + "index": 47, + "sentences": [ + { + "sentence": "Sarah explained her theory to Steven but _ couldn't understand her.", + "answer1": [ + "Steven", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "Steven", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8895952105522156 + ], + [ + "steven", + 0.09927554428577423 + ], + [ + "she", + 0.000530878605786711 + ] + ], + "score": 1 + }, + { + "sentence": "David explained his theory to Susan but _ couldn't understand him.", + "answer1": [ + "Susan", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9552003741264343 + ], + [ + "susan", + 0.025168264284729958 + ] + ], + "score": 1 + } + ] + }, + { + "index": 48, + "sentences": [ + { + "sentence": "Daniel knew that Margaret's son had been in a car accident, so _ told her about it.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.9160689115524292 + ], + [ + "daniel", + 0.05031977966427803 + ] + ], + "score": 1 + }, + { + "sentence": "Amy knew that Brian's son had been in a car accident, so _ told him about it.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Amy", + "she" + ], + "correct_answer": [ + "Amy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.825965404510498 + ], + [ + "amy", + 0.11254064738750458 + ] + ], + "score": 1 + } + ] + }, + { + "index": 49, + "sentences": [ + { + "sentence": "George knew that Sarah's son had been in a car accident, because _ told him about it.", + "answer1": [ + "Sarah", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "Sarah", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 
0.25967302918434143 + ], + [ + "sarah", + 0.1378738284111023 + ], + [ + "george", + 0.1081177145242691 + ], + [ + "he", + 0.07801171392202377 + ] + ], + "score": 1 + }, + { + "sentence": "Susan knew that Tim's son had been in a car accident, because _ told her about it.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Susan", + "she" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.12469684332609177 + ], + [ + "tim", + 0.1100628525018692 + ], + [ + "susan", + 0.08253230154514313 + ], + [ + "she", + 0.044477976858615875 + ] + ], + "score": 1 + } + ] + }, + { + "index": 50, + "sentences": [ + { + "sentence": "Jason's aunt Betty can still beat him at tennis, even though _ is 30 years younger.", + "answer1": [ + "Betty", + "she" + ], + "answer0": [ + "Jason", + "he" + ], + "correct_answer": [ + "Jason", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.4724120795726776 + ], + [ + "he", + 0.3619353473186493 + ], + [ + "jason", + 0.11430627852678299 + ], + [ + "betty", + 0.009248387068510056 + ] + ], + "score": 0 + } + ] + }, + { + "index": 51, + "sentences": [ + { + "sentence": "Alan's aunt Laura can still beat him at tennis, even though _ is 30 years older.", + "answer1": [ + "Laura", + "she" + ], + "answer0": [ + "Alan", + "he" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.410331130027771 + ], + [ + "he", + 0.3431159555912018 + ], + [ + "alan", + 0.2048305720090866 + ] + ], + "score": 1 + } + ] + }, + { + "index": 52, + "sentences": [ + { + "sentence": "The painting in Mark's living room shows a tree. 
the _ is to the right of the bookcase.", + "answer1": [ + "tree" + ], + "answer0": [ + "painting" + ], + "correct_answer": [ + "painting" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "painting", + 0.2549212872982025 + ], + [ + "tree", + 0.03998008370399475 + ] + ], + "score": 1 + } + ] + }, + { + "index": 53, + "sentences": [ + { + "sentence": "The painting in Mark's living room shows a tree. the _ is to the right of a house.", + "answer1": [ + "tree" + ], + "answer0": [ + "painting" + ], + "correct_answer": [ + "tree" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "painting", + 0.3817044198513031 + ], + [ + "tree", + 0.12644894421100616 + ] + ], + "score": 0 + } + ] + }, + { + "index": 54, + "sentences": [ + { + "sentence": "There is a gap in the wall. You can see the garden through the _ .", + "answer1": [ + "wall" + ], + "answer0": [ + "gap" + ], + "correct_answer": [ + "gap" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "wall", + 0.24062421917915344 + ], + [ + "gap", + 0.06364461034536362 + ] + ], + "score": 0 + } + ] + }, + { + "index": 55, + "sentences": [ + { + "sentence": "There is a gap in the wall. You can see the garden behind the _ .", + "answer1": [ + "wall" + ], + "answer0": [ + "gap" + ], + "correct_answer": [ + "wall" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "wall", + 0.1855088472366333 + ] + ], + "score": 1 + } + ] + }, + { + "index": 56, + "sentences": [ + { + "sentence": "The drain is clogged with hair. the _ has to be cleaned.", + "answer1": [ + "hair" + ], + "answer0": [ + "drain" + ], + "correct_answer": [ + "drain" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "drain", + 0.10474126785993576 + ] + ], + "score": 1 + } + ] + }, + { + "index": 57, + "sentences": [ + { + "sentence": "The drain is clogged with hair. 
the _ has to be removed.", + "answer1": [ + "hair" + ], + "answer0": [ + "drain" + ], + "correct_answer": [ + "hair" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "hair", + 0.35679975152015686 + ], + [ + "drain", + 0.06430690735578537 + ] + ], + "score": 1 + } + ] + }, + { + "index": 58, + "sentences": [ + { + "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. Luckily, the _ was short, so it worked out.", + "answer1": [ + "train" + ], + "answer0": [ + "meeting" + ], + "correct_answer": [ + "meeting" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "train", + 0.2332829087972641 + ] + ], + "score": 0 + } + ] + }, + { + "index": 59, + "sentences": [ + { + "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. Luckily, the _ was delayed, so it worked out.", + "answer1": [ + "train" + ], + "answer0": [ + "meeting" + ], + "correct_answer": [ + "train" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "meeting", + 0.574749231338501 + ], + [ + "train", + 0.2624363899230957 + ] + ], + "score": 0 + } + ] + }, + { + "index": 60, + "sentences": [ + { + "sentence": "There is a pillar between me and the stage, and I can't see around the _ .", + "answer1": [ + "stage" + ], + "answer0": [ + "pillar" + ], + "correct_answer": [ + "pillar" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "pillar", + 0.5279375910758972 + ] + ], + "score": 1 + } + ] + }, + { + "index": 61, + "sentences": [ + { + "sentence": "There is a pillar between me and the stage, and I can't see the _ .", + "answer1": [ + "stage" + ], + "answer0": [ + "pillar" + ], + "correct_answer": [ + "stage" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "stage", + 0.07283679395914078 + ] + ], + "score": 1 + } + ] + }, + { + "index": 62, + "sentences": [ + { + "sentence": "They broadcast an announcement, but a subway came into the station and I couldn't hear the _ .", + 
"answer1": [ + "subway" + ], + "answer0": [ + "announcement" + ], + "correct_answer": [ + "announcement" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 63, + "sentences": [ + { + "sentence": "They broadcast an announcement, but a subway came into the station and I couldn't hear over the _ .", + "answer1": [ + "subway" + ], + "answer0": [ + "announcement" + ], + "correct_answer": [ + "subway" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 64, + "sentences": [ + { + "sentence": "In the middle of the outdoor concert, the rain started falling, but the _ continued until 10.", + "answer1": [ + "rain" + ], + "answer0": [ + "concert" + ], + "correct_answer": [ + "concert" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "rain", + 0.3882569670677185 + ], + [ + "concert", + 0.0404825396835804 + ] + ], + "score": 0 + } + ] + }, + { + "index": 65, + "sentences": [ + { + "sentence": "In the middle of the outdoor concert, the rain started falling, and the _ continued until 10.", + "answer1": [ + "rain" + ], + "answer0": [ + "concert" + ], + "correct_answer": [ + "rain" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "rain", + 0.22699223458766937 + ], + [ + "concert", + 0.07272675633430481 + ] + ], + "score": 1 + } + ] + }, + { + "index": 66, + "sentences": [ + { + "sentence": "I used an old rag to clean the knife, and then I put the _ in the trash.", + "answer1": [ + "knife" + ], + "answer0": [ + "rag" + ], + "correct_answer": [ + "rag" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "knife", + 0.7708435654640198 + ] + ], + "score": 0 + } + ] + }, + { + "index": 67, + "sentences": [ + { + "sentence": "I used an old rag to clean the knife, and then I put the _ in the drawer.", + "answer1": [ + "knife" + ], + "answer0": [ + "rag" + ], + "correct_answer": [ + "knife" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "knife", + 0.5724927186965942 + ] + ], 
+ "score": 1 + } + ] + }, + { + "index": 68, + "sentences": [ + { + "sentence": "Warren asked Sandra what time the library closes, because _ had forgotten.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Warren", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4140302836894989 + ], + [ + "she", + 0.08503930270671844 + ], + [ + "warren", + 0.03423245623707771 + ] + ], + "score": 1 + }, + { + "sentence": "Emma asked Thomas what time the library closes, because _ had forgotten.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Emma", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.42612558603286743 + ], + [ + "thomas", + 0.15514008700847626 + ], + [ + "she", + 0.15142855048179626 + ] + ], + "score": 0 + } + ] + }, + { + "index": 69, + "sentences": [ + { + "sentence": "Zack asked Barbara what time the library closes, but _ had forgotten.", + "answer1": [ + "Barbara", + "she" + ], + "answer0": [ + "Zack", + "he" + ], + "correct_answer": [ + "Barbara", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.691577672958374 + ], + [ + "barbara", + 0.11628566682338715 + ], + [ + "he", + 0.09651367366313934 + ] + ], + "score": 1 + }, + { + "sentence": "Sarah asked Joseph what time the library closes, but _ had forgotten.", + "answer1": [ + "Joseph", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "Joseph", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5978205800056458 + ], + [ + "joseph", + 0.30730757117271423 + ], + [ + "she", + 0.0433766171336174 + ] + ], + "score": 1 + } + ] + }, + { + "index": 70, + "sentences": [ + { + "sentence": "I took the bottle out of the backpack so that the _ would be handy.", + "answer1": [ + "backpack" + ], + "answer0": [ + "bottle" + ], + "correct_answer": [ + "bottle" + ], + "adjacent_ref": false, + 
"predict_answer": [ + [ + "bottle", + 0.2538537085056305 + ] + ], + "score": 1 + } + ] + }, + { + "index": 71, + "sentences": [ + { + "sentence": "I took the bottle out of the backpack so that the _ would be lighter.", + "answer1": [ + "backpack" + ], + "answer0": [ + "bottle" + ], + "correct_answer": [ + "backpack" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "bottle", + 0.28798893094062805 + ] + ], + "score": 0 + } + ] + }, + { + "index": 72, + "sentences": [ + { + "sentence": "I couldn't put the pot on the shelf because the _ was too tall.", + "answer1": [ + "shelf" + ], + "answer0": [ + "pot" + ], + "correct_answer": [ + "pot" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "shelf", + 0.05657235160470009 + ], + [ + "pot", + 0.040349528193473816 + ] + ], + "score": 0 + } + ] + }, + { + "index": 73, + "sentences": [ + { + "sentence": "I couldn't put the pot on the shelf because the _ was too high.", + "answer1": [ + "shelf" + ], + "answer0": [ + "pot" + ], + "correct_answer": [ + "shelf" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "shelf", + 0.09407572448253632 + ] + ], + "score": 1 + } + ] + }, + { + "index": 74, + "sentences": [ + { + "sentence": "I'm sure that my map will show this building; the _ is very good.", + "answer1": [ + "building" + ], + "answer0": [ + "map" + ], + "correct_answer": [ + "map" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "map", + 0.2211541384458542 + ] + ], + "score": 1 + } + ] + }, + { + "index": 75, + "sentences": [ + { + "sentence": "I'm sure that my map will show this building; the _ is very famous.", + "answer1": [ + "building" + ], + "answer0": [ + "map" + ], + "correct_answer": [ + "building" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "building", + 0.08651192486286163 + ] + ], + "score": 1 + } + ] + }, + { + "index": 76, + "sentences": [ + { + "sentence": "Sandra paid for Tim's college education. 
_ is very generous.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "sandra", + 0.5636617541313171 + ], + [ + "she", + 0.3318640887737274 + ], + [ + "he", + 0.016400793567299843 + ] + ], + "score": 1 + }, + { + "sentence": "George paid for Laura's college education. _ is very generous.", + "answer1": [ + "Laura", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "George", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "george", + 0.43826061487197876 + ], + [ + "he", + 0.2907955050468445 + ], + [ + "she", + 0.09768392145633698 + ], + [ + "laura", + 0.026110105216503143 + ] + ], + "score": 1 + } + ] + }, + { + "index": 77, + "sentences": [ + { + "sentence": "Laura paid for Tim's college education. _ is very grateful.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Laura", + "she" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.3613061308860779 + ], + [ + "laura", + 0.2712564170360565 + ], + [ + "tim", + 0.10081641376018524 + ], + [ + "he", + 0.09770604968070984 + ] + ], + "score": 0 + }, + { + "sentence": "George paid for Emma's college education. _ is very grateful.", + "answer1": [ + "Emma", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "Emma", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "emma", + 0.5662250518798828 + ], + [ + "she", + 0.16104619204998016 + ], + [ + "he", + 0.07634122669696808 + ], + [ + "george", + 0.037439413368701935 + ] + ], + "score": 1 + } + ] + }, + { + "index": 78, + "sentences": [ + { + "sentence": "Mandy paid for Steven's college education, but now Steven acts as though it never happened. 
_ is very hurt.", + "answer1": [ + "Steven", + "he" + ], + "answer0": [ + "Mandy", + "she" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "mandy", + 0.4452275037765503 + ], + [ + "steven", + 0.20133669674396515 + ], + [ + "he", + 0.04494181647896767 + ], + [ + "she", + 0.010965993627905846 + ] + ], + "score": 1 + }, + { + "sentence": "Edward paid for Sandra's college education, but now Sandra acts as though it never happened. _ is very hurt.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Edward", + "he" + ], + "correct_answer": [ + "Edward", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "sandra", + 0.4786634147167206 + ], + [ + "edward", + 0.3283962309360504 + ], + [ + "she", + 0.05131130293011665 + ], + [ + "he", + 0.004538827110081911 + ] + ], + "score": 0 + } + ] + }, + { + "index": 79, + "sentences": [ + { + "sentence": "David paid for Sue's college education, but now Sue acts as though it never happened. _ is very ungrateful.", + "answer1": [ + "Sue", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "Sue", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "sue", + 0.333128422498703 + ], + [ + "david", + 0.2768923044204712 + ], + [ + "she", + 0.11641097813844681 + ], + [ + "he", + 0.029392829164862633 + ] + ], + "score": 1 + }, + { + "sentence": "Emma paid for Tim's college education, but now Tim acts as though it never happened. _ is very ungrateful.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "tim", + 0.312907338142395 + ], + [ + "emma", + 0.26578396558761597 + ], + [ + "he", + 0.20920135080814362 + ], + [ + "she", + 0.02565154619514942 + ] + ], + "score": 1 + } + ] + }, + { + "index": 80, + "sentences": [ + { + "sentence": "Lisa was playing cards with Joseph and was way ahead. 
If Joseph hadn't had a sudden run of good luck, _ would have won.", + "answer1": [ + "Joseph", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4793993830680847 + ], + [ + "she", + 0.16311348974704742 + ], + [ + "lisa", + 0.1321823000907898 + ], + [ + "joseph", + 0.017298907041549683 + ] + ], + "score": 0 + }, + { + "sentence": "John was playing cards with Amy and was way ahead. If Amy hadn't had a sudden run of good luck, _ would have won.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "John", + "he" + ], + "correct_answer": [ + "John", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.48962944746017456 + ], + [ + "john", + 0.1949043720960617 + ], + [ + "he", + 0.1438501924276352 + ] + ], + "score": 0 + } + ] + }, + { + "index": 81, + "sentences": [ + { + "sentence": "Charles was playing cards with Betty and was way ahead. If Betty hadn't had a sudden run of good luck, _ would have lost.", + "answer1": [ + "Betty", + "she" + ], + "answer0": [ + "Charles", + "he" + ], + "correct_answer": [ + "Betty", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.559443473815918 + ], + [ + "he", + 0.10665259510278702 + ], + [ + "charles", + 0.08594903349876404 + ] + ], + "score": 1 + }, + { + "sentence": "Sue was playing cards with Charles and was way ahead. If Charles hadn't had a sudden run of good luck, _ would have lost.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.651694118976593 + ], + [ + "sue", + 0.13403257727622986 + ], + [ + "she", + 0.06975802779197693 + ] + ], + "score": 1 + } + ] + }, + { + "index": 82, + "sentences": [ + { + "sentence": "David can't leave work here until Margaret arrives to replace him. 
If Margaret had left home for work on time, _ would be gone by this time.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "David", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "david", + 0.4532376229763031 + ], + [ + "she", + 0.3844938278198242 + ], + [ + "he", + 0.09111058712005615 + ] + ], + "score": 1 + }, + { + "sentence": "Linda can't leave work here until Thomas arrives to replace her. If Thomas had left home for work on time, _ would be gone by this time.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Linda", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.46403148770332336 + ], + [ + "linda", + 0.46336105465888977 + ], + [ + "she", + 0.0296529158949852 + ] + ], + "score": 0 + } + ] + }, + { + "index": 83, + "sentences": [ + { + "sentence": "David can't leave work here until Sandra arrives to replace him. If Sandra had left home for work on time, _ would be here by this time.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "david", + 0.4930424094200134 + ], + [ + "she", + 0.34252968430519104 + ], + [ + "he", + 0.06929971277713776 + ], + [ + "sandra", + 0.008408436551690102 + ] + ], + "score": 0 + }, + { + "sentence": "Linda can't leave work here until Tony arrives to replace her. 
If Tony had left home for work on time, _ would be here by this time.", + "answer1": [ + "Tony", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "linda", + 0.7818251252174377 + ], + [ + "he", + 0.1578403264284134 + ], + [ + "she", + 0.016681527718901634 + ], + [ + "tony", + 0.0018600979819893837 + ] + ], + "score": 0 + } + ] + }, + { + "index": 84, + "sentences": [ + { + "sentence": "If the woman has succeeded in fooling Sam, _ would have gotten a lot of money.", + "answer1": [ + "he" + ], + "answer0": [ + "she" + ], + "correct_answer": [ + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.66971355676651 + ], + [ + "he", + 0.14182616770267487 + ] + ], + "score": 1 + } + ] + }, + { + "index": 85, + "sentences": [ + { + "sentence": "If the woman has succeeded in fooling Sam, _ would have lost a lot of money.", + "answer1": [ + "he" + ], + "answer0": [ + "she" + ], + "correct_answer": [ + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.48567670583724976 + ], + [ + "he", + 0.2748803496360779 + ] + ], + "score": 0 + } + ] + }, + { + "index": 86, + "sentences": [] + }, + { + "index": 87, + "sentences": [] + }, + { + "index": 88, + "sentences": [ + { + "sentence": "The cat was lying by the mouse hole waiting for the mouse, but the _ was too impatient.", + "answer1": [ + "mouse" + ], + "answer0": [ + "cat" + ], + "correct_answer": [ + "cat" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "cat", + 0.8255143165588379 + ], + [ + "mouse", + 0.07212252169847488 + ] + ], + "score": 1 + } + ] + }, + { + "index": 89, + "sentences": [ + { + "sentence": "The cat was lying by the mouse hole waiting for the mouse, but the _ was too cautious.", + "answer1": [ + "mouse" + ], + "answer0": [ + "cat" + ], + "correct_answer": [ + "mouse" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "cat", + 0.7318105697631836 + ], + [ + 
"mouse", + 0.070926234126091 + ] + ], + "score": 0 + } + ] + }, + { + "index": 90, + "sentences": [ + { + "sentence": "Betty gave birth to a son Daniel last month. _ is a very charming woman.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Betty", + "she" + ], + "correct_answer": [ + "Betty", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "betty", + 0.5413283109664917 + ], + [ + "she", + 0.4113282859325409 + ], + [ + "daniel", + 0.0013550656149163842 + ] + ], + "score": 1 + } + ] + }, + { + "index": 91, + "sentences": [ + { + "sentence": "Laura gave birth to a son Jason last month. _ is a very charming baby.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + "Laura", + "she" + ], + "correct_answer": [ + "Jason", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "jason", + 0.593578040599823 + ], + [ + "he", + 0.34084823727607727 + ] + ], + "score": 1 + } + ] + }, + { + "index": 92, + "sentences": [ + { + "sentence": "Paul tried frantically to stop his daughter Sandra from chatting at the party, leaving us to wonder why _ was behaving so strangely.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Paul", + "he" + ], + "correct_answer": [ + "Paul", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.780010461807251 + ], + [ + "he", + 0.06307154893875122 + ], + [ + "sandra", + 0.03193315863609314 + ], + [ + "paul", + 0.02212286926805973 + ] + ], + "score": 0 + } + ] + }, + { + "index": 93, + "sentences": [ + { + "sentence": "Thomas tried frantically to stop his daughter Helen from barking at the party, leaving us to wonder why _ was behaving so strangely.", + "answer1": [ + "Helen", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Helen", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.5904063582420349 + ], + [ + "he", + 0.12378611415624619 + ], + [ + "helen", + 0.04352760314941406 + ], + [ + "thomas", + 0.04339068755507469 + ] + 
], + "score": 1 + } + ] + }, + { + "index": 94, + "sentences": [] + }, + { + "index": 95, + "sentences": [] + }, + { + "index": 96, + "sentences": [ + { + "sentence": "The fish ate the worm. the _ was hungry.", + "answer1": [ + "worm" + ], + "answer0": [ + "fish" + ], + "correct_answer": [ + "fish" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "fish", + 0.37108519673347473 + ], + [ + "worm", + 0.21286211907863617 + ] + ], + "score": 1 + } + ] + }, + { + "index": 97, + "sentences": [ + { + "sentence": "The fish ate the worm. the _ was tasty.", + "answer1": [ + "worm" + ], + "answer0": [ + "fish" + ], + "correct_answer": [ + "worm" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "fish", + 0.44388917088508606 + ], + [ + "worm", + 0.06959007680416107 + ] + ], + "score": 0 + } + ] + }, + { + "index": 98, + "sentences": [ + { + "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with gum, and I couldn't get the _ in.", + "answer1": [ + "gum" + ], + "answer0": [ + "key" + ], + "correct_answer": [ + "key" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "key", + 0.8628509640693665 + ] + ], + "score": 1 + } + ] + }, + { + "index": 99, + "sentences": [ + { + "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with gum, and I couldn't get the _ out.", + "answer1": [ + "gum" + ], + "answer0": [ + "key" + ], + "correct_answer": [ + "gum" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "key", + 0.7175930142402649 + ], + [ + "gum", + 0.006937938742339611 + ] + ], + "score": 0 + } + ] + }, + { + "index": 100, + "sentences": [ + { + "sentence": "The dog chased the cat, which ran up a tree. 
the _ waited at the bottom.", + "answer1": [ + "cat" + ], + "answer0": [ + "dog" + ], + "correct_answer": [ + "dog" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "cat", + 0.21951650083065033 + ], + [ + "dog", + 0.17522823810577393 + ] + ], + "score": 0 + } + ] + }, + { + "index": 101, + "sentences": [ + { + "sentence": "The dog chased the cat, which ran up a tree. the _ waited at the top.", + "answer1": [ + "cat" + ], + "answer0": [ + "dog" + ], + "correct_answer": [ + "cat" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "cat", + 0.18947550654411316 + ], + [ + "dog", + 0.1664084941148758 + ] + ], + "score": 1 + } + ] + }, + { + "index": 102, + "sentences": [ + { + "sentence": "In the storm, the tree fell down and crashed through the roof of my house. Now, I have to get the _ removed.", + "answer1": [ + "roof" + ], + "answer0": [ + "tree" + ], + "correct_answer": [ + "tree" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "tree", + 0.27868902683258057 + ] + ], + "score": 1 + } + ] + }, + { + "index": 103, + "sentences": [ + { + "sentence": "In the storm, the tree fell down and crashed through the roof of my house. Now, I have to get the _ repaired.", + "answer1": [ + "roof" + ], + "answer0": [ + "tree" + ], + "correct_answer": [ + "roof" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "tree", + 0.5251078605651855 + ], + [ + "roof", + 0.1584283411502838 + ] + ], + "score": 0 + } + ] + }, + { + "index": 104, + "sentences": [ + { + "sentence": "The customer walked into the bank and stabbed the tellers. the _ was immediately taken to the police station.", + "answer1": [ + "teller" + ], + "answer0": [ + "customer" + ], + "correct_answer": [ + "customer" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "customer", + 0.5774991512298584 + ] + ], + "score": 1 + } + ] + }, + { + "index": 105, + "sentences": [ + { + "sentence": "The customer walked into the bank and stabbed the tellers. 
the _ was immediately taken to the hospital.", + "answer1": [ + "teller" + ], + "answer0": [ + "customer" + ], + "correct_answer": [ + "teller" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "customer", + 0.7133771777153015 + ] + ], + "score": 0 + } + ] + }, + { + "index": 106, + "sentences": [ + { + "sentence": "John was doing research in the library when he heard a woman humming and whistling. _ was very annoyed.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.6243001818656921 + ], + [ + "she", + 0.007973955012857914 + ] + ], + "score": 1 + } + ] + }, + { + "index": 107, + "sentences": [ + { + "sentence": "John was doing research in the library when he heard a woman humming and whistling. _ was very annoying.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.2894514203071594 + ] + ], + "score": 1 + } + ] + }, + { + "index": 108, + "sentences": [ + { + "sentence": "John was jogging through the park when he saw a woman juggling watermelons. _ was very impressed.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.6197863221168518 + ], + [ + "she", + 0.028509652242064476 + ] + ], + "score": 1 + } + ] + }, + { + "index": 109, + "sentences": [ + { + "sentence": "John was jogging through the park when he saw a woman juggling watermelons. _ was very impressive.", + "answer1": [ + "she" + ], + "answer0": [ + "he" + ], + "correct_answer": [ + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.7331407070159912 + ] + ], + "score": 1 + } + ] + }, + { + "index": 110, + "sentences": [ + { + "sentence": "George collapsed on the sidewalk. Soon he saw Wendy coming to help. 
_ was very ill.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "George", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.9086498618125916 + ], + [ + "wendy", + 0.07171561568975449 + ], + [ + "he", + 0.00464142020791769 + ] + ], + "score": 0 + }, + { + "sentence": "Nancy collapsed on the sidewalk. Soon she saw Brian coming to help. _ was very ill.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Nancy", + "she" + ], + "correct_answer": [ + "Nancy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8744707107543945 + ], + [ + "brian", + 0.09346448630094528 + ], + [ + "she", + 0.012748870067298412 + ] + ], + "score": 0 + } + ] + }, + { + "index": 111, + "sentences": [ + { + "sentence": "Steven collapsed on the sidewalk. Soon he saw Wendy coming to help. _ was very concerned.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Steven", + "he" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.669244647026062 + ], + [ + "wendy", + 0.28963056206703186 + ], + [ + "he", + 0.009827366098761559 + ] + ], + "score": 1 + }, + { + "sentence": "Betty collapsed on the sidewalk. Soon she saw Daniel coming to help. _ was very concerned.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Betty", + "she" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.7216044664382935 + ], + [ + "daniel", + 0.1329873502254486 + ], + [ + "she", + 0.07653984427452087 + ], + [ + "betty", + 0.025548135861754417 + ] + ], + "score": 1 + } + ] + }, + { + "index": 112, + "sentences": [] + }, + { + "index": 113, + "sentences": [] + }, + { + "index": 114, + "sentences": [ + { + "sentence": "Thomas told Margaret many lies about himself, which Margaret included in her book. 
_ should have been more truthful.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "thomas", + 0.30783167481422424 + ], + [ + "he", + 0.2996409833431244 + ], + [ + "she", + 0.12280318886041641 + ], + [ + "margaret", + 0.025127053260803223 + ] + ], + "score": 1 + }, + { + "sentence": "Lisa told William many lies about herself, which William included in his book. _ should have been more truthful.", + "answer1": [ + "William", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.4036584794521332 + ], + [ + "he", + 0.16349658370018005 + ], + [ + "lisa", + 0.10439618676900864 + ] + ], + "score": 1 + } + ] + }, + { + "index": 115, + "sentences": [ + { + "sentence": "Lisa told Edward many lies about herself, which Edward included in his book. _ should have been more skeptical.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "Edward", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.19039444625377655 + ], + [ + "she", + 0.08063376694917679 + ], + [ + "edward", + 0.07449514418840408 + ], + [ + "lisa", + 0.016218291595578194 + ] + ], + "score": 1 + }, + { + "sentence": "Daniel told Vivian many lies about himself, which Vivian included in her book. _ should have been more skeptical.", + "answer1": [ + "Vivian", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Vivian", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.13308386504650116 + ], + [ + "he", + 0.02119334787130356 + ], + [ + "vivian", + 0.006614150945097208 + ] + ], + "score": 1 + } + ] + }, + { + "index": 116, + "sentences": [ + { + "sentence": "Joe has sold his house and bought a new one a few miles away. 
He will be moving out of the _ house on Thursday.", + "answer1": [ + "new" + ], + "answer0": [ + "old" + ], + "correct_answer": [ + "old" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "old", + 0.5963925123214722 + ], + [ + "new", + 0.08838000893592834 + ] + ], + "score": 1 + } + ] + }, + { + "index": 117, + "sentences": [ + { + "sentence": "Joe has sold his house and bought a new one a few miles away. He will be moving into the _ house on Thursday.", + "answer1": [ + "new" + ], + "answer0": [ + "old" + ], + "correct_answer": [ + "new" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "new", + 0.8057393431663513 + ], + [ + "old", + 0.06867457181215286 + ] + ], + "score": 1 + } + ] + }, + { + "index": 118, + "sentences": [] + }, + { + "index": 119, + "sentences": [] + }, + { + "index": 120, + "sentences": [ + { + "sentence": "Mary took out her flute and played one of her favorite pieces. She has had the _ since she was a child.", + "answer1": [ + "piece" + ], + "answer0": [ + "flute" + ], + "correct_answer": [ + "flute" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "flute", + 0.8020106554031372 + ], + [ + "piece", + 0.0065211085602641106 + ] + ], + "score": 1 + } + ] + }, + { + "index": 121, + "sentences": [ + { + "sentence": "Mary took out her flute and played one of her favorite pieces. 
She has loved the _ since she was a child.", + "answer1": [ + "piece" + ], + "answer0": [ + "flute" + ], + "correct_answer": [ + "piece" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "flute", + 0.2698434591293335 + ], + [ + "piece", + 0.11442672461271286 + ] + ], + "score": 0 + } + ] + }, + { + "index": 122, + "sentences": [ + { + "sentence": "Sam pulled up a chair to the piano, but the _ was broken, so he had to stand instead.", + "answer1": [ + "piano" + ], + "answer0": [ + "chair" + ], + "correct_answer": [ + "chair" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "chair", + 0.2117273360490799 + ], + [ + "piano", + 0.0348532609641552 + ] + ], + "score": 1 + } + ] + }, + { + "index": 123, + "sentences": [ + { + "sentence": "Sam pulled up a chair to the piano, but the _ was broken, so he had to sing instead.", + "answer1": [ + "piano" + ], + "answer0": [ + "chair" + ], + "correct_answer": [ + "piano" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "piano", + 0.14480875432491302 + ], + [ + "chair", + 0.08749102801084518 + ] + ], + "score": 1 + } + ] + }, + { + "index": 124, + "sentences": [ + { + "sentence": "Since it was raining, I carried the newspaper in my backpack to keep the _ dry.", + "answer1": [ + "backpack" + ], + "answer0": [ + "newspaper" + ], + "correct_answer": [ + "newspaper" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 125, + "sentences": [ + { + "sentence": "Since it was raining, I carried the newspaper over my backpack to keep the _ dry.", + "answer1": [ + "backpack" + ], + "answer0": [ + "newspaper" + ], + "correct_answer": [ + "backpack" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 126, + "sentences": [ + { + "sentence": "Sara borrowed the book from the library because she needs it for an article she is working on. 
She reads the _ when she gets home from work.", + "answer1": [ + "article" + ], + "answer0": [ + "book" + ], + "correct_answer": [ + "book" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "article", + 0.7929431796073914 + ], + [ + "book", + 0.03670453652739525 + ] + ], + "score": 0 + } + ] + }, + { + "index": 127, + "sentences": [ + { + "sentence": "Sara borrowed the book from the library because she needs it for an article she is working on. She writes the _ when she gets home from work.", + "answer1": [ + "article" + ], + "answer0": [ + "book" + ], + "correct_answer": [ + "article" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "article", + 0.6547066569328308 + ], + [ + "book", + 0.049485430121421814 + ] + ], + "score": 1 + } + ] + }, + { + "index": 128, + "sentences": [ + { + "sentence": "This morning, Joey built a castle on the beach, and put a toy flag in the highest tower, but this afternoon the tide knocked the _ down.", + "answer1": [ + "flag" + ], + "answer0": [ + "castle" + ], + "correct_answer": [ + "castle" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "castle", + 0.3126690089702606 + ] + ], + "score": 1 + } + ] + }, + { + "index": 129, + "sentences": [ + { + "sentence": "This morning, Joey built a castle on the beach, and put a toy flag in the highest tower, but this afternoon the wind knocked the _ down.", + "answer1": [ + "flag" + ], + "answer0": [ + "castle" + ], + "correct_answer": [ + "flag" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "castle", + 0.14584790170192719 + ], + [ + "flag", + 0.0925380289554596 + ] + ], + "score": 0 + } + ] + }, + { + "index": 130, + "sentences": [ + { + "sentence": "Donna knocked on George's door, but there was no answer. 
_ was disappointed.", + "answer1": [ + "George", + "he" + ], + "answer0": [ + "Donna", + "she" + ], + "correct_answer": [ + "Donna", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.725192666053772 + ], + [ + "donna", + 0.21122732758522034 + ], + [ + "he", + 0.00751360272988677 + ], + [ + "george", + 0.002388492226600647 + ] + ], + "score": 1 + }, + { + "sentence": "Brian knocked on Sandra's door, but there was no answer. _ was disappointed.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Brian", + "he" + ], + "correct_answer": [ + "Brian", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.7136160135269165 + ], + [ + "brian", + 0.2146453559398651 + ], + [ + "she", + 0.03952759504318237 + ], + [ + "sandra", + 0.0034710802137851715 + ] + ], + "score": 1 + } + ] + }, + { + "index": 131, + "sentences": [ + { + "sentence": "Anthony knocked on Laura's door, but there was no answer. _ was out.", + "answer1": [ + "Laura", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.3706660866737366 + ], + [ + "laura", + 0.23670922219753265 + ], + [ + "anthony", + 0.03675343096256256 + ], + [ + "he", + 0.03366513177752495 + ] + ], + "score": 1 + }, + { + "sentence": "Sarah knocked on David's door, but there was no answer. _ was out.", + "answer1": [ + "David", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "David", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.40272143483161926 + ], + [ + "david", + 0.22155331075191498 + ], + [ + "sarah", + 0.08200158923864365 + ], + [ + "she", + 0.025603963062167168 + ] + ], + "score": 1 + } + ] + }, + { + "index": 132, + "sentences": [ + { + "sentence": "David knocked on the door, and Helen answered it. 
_ invited her to come out.", + "answer1": [ + "Helen", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "David", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4498973488807678 + ], + [ + "david", + 0.4179869294166565 + ], + [ + "she", + 0.009854797273874283 + ], + [ + "helen", + 0.007285397034138441 + ] + ], + "score": 1 + }, + { + "sentence": "Vivian knocked on the door, and Daniel answered it. _ invited him to come out.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Vivian", + "she" + ], + "correct_answer": [ + "Vivian", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "vivian", + 0.45176881551742554 + ], + [ + "she", + 0.10893049836158752 + ] + ], + "score": 1 + } + ] + }, + { + "index": 133, + "sentences": [ + { + "sentence": "Jennifer knocked on the door, and Jason answered it. _ invited her to come in.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + "Jennifer", + "she" + ], + "correct_answer": [ + "Jason", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8066680431365967 + ], + [ + "jason", + 0.05348130315542221 + ], + [ + "jennifer", + 0.04620116949081421 + ] + ], + "score": 1 + }, + { + "sentence": "Joseph knocked on the door, and Amy answered it. 
_ invited him to come in.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "Joseph", + "he" + ], + "correct_answer": [ + "Amy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.7397094368934631 + ], + [ + "amy", + 0.09842552989721298 + ], + [ + "joseph", + 0.02913120575249195 + ], + [ + "he", + 0.011561447754502296 + ] + ], + "score": 1 + } + ] + }, + { + "index": 134, + "sentences": [ + { + "sentence": "Daniel took French classes from Anna, because _ was eager to speak it fluently.", + "answer1": [ + "Anna", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.543056845664978 + ], + [ + "she", + 0.28391432762145996 + ], + [ + "anna", + 0.1334146112203598 + ], + [ + "daniel", + 0.004943103063851595 + ] + ], + "score": 1 + }, + { + "sentence": "Susan took French classes from Edward, because _ was eager to speak it fluently.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Susan", + "she" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.6854918003082275 + ], + [ + "he", + 0.16495540738105774 + ], + [ + "edward", + 0.040629129856824875 + ], + [ + "susan", + 0.005385664291679859 + ] + ], + "score": 1 + } + ] + }, + { + "index": 135, + "sentences": [ + { + "sentence": "Daniel took French classes from Nancy, because _ was known to speak it fluently.", + "answer1": [ + "Nancy", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Nancy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.9642737507820129 + ], + [ + "she", + 0.03015826642513275 + ], + [ + "daniel", + 0.0008225942146964371 + ], + [ + "nancy", + 0.00018977400031872094 + ] + ], + "score": 0 + }, + { + "sentence": "Amy took French classes from Anthony, because _ was known to speak it fluently.", + "answer1": [ + "Anthony", + "he" + ], + "answer0": [ 
+ "Amy", + "she" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.4824458956718445 + ], + [ + "he", + 0.4505062699317932 + ], + [ + "anthony", + 0.016775252297520638 + ], + [ + "amy", + 0.00974208302795887 + ] + ], + "score": 0 + } + ] + }, + { + "index": 136, + "sentences": [ + { + "sentence": "The path to the lake was blocked, so we couldn't use the _ .", + "answer1": [ + "lake" + ], + "answer0": [ + "path" + ], + "correct_answer": [ + "path" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 137, + "sentences": [ + { + "sentence": "The path to the lake was blocked, so we couldn't reach the _ .", + "answer1": [ + "lake" + ], + "answer0": [ + "path" + ], + "correct_answer": [ + "lake" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "lake", + 0.08149494975805283 + ] + ], + "score": 1 + } + ] + }, + { + "index": 138, + "sentences": [ + { + "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, the _ was out.", + "answer1": [ + "cloud" + ], + "answer0": [ + "sun" + ], + "correct_answer": [ + "sun" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "sun", + 0.5122198462486267 + ] + ], + "score": 1 + } + ] + }, + { + "index": 139, + "sentences": [ + { + "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, the _ was gone.", + "answer1": [ + "cloud" + ], + "answer0": [ + "sun" + ], + "correct_answer": [ + "cloud" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "cloud", + 0.3546683192253113 + ], + [ + "sun", + 0.04581490159034729 + ] + ], + "score": 1 + } + ] + }, + { + "index": 140, + "sentences": [ + { + "sentence": "We went to the lake, because a shark had been seen at the beach, so the _ was a safer place to swim.", + "answer1": [ + "beach" + ], + "answer0": [ + "lake" + ], + "correct_answer": [ + "lake" + ], + "adjacent_ref": 
false, + "predict_answer": [ + [ + "lake", + 0.5537887811660767 + ], + [ + "beach", + 0.12069465965032578 + ] + ], + "score": 1 + } + ] + }, + { + "index": 141, + "sentences": [ + { + "sentence": "We went to the lake, because a shark had been seen at the beach, so the _ was a dangerous place to swim.", + "answer1": [ + "beach" + ], + "answer0": [ + "lake" + ], + "correct_answer": [ + "beach" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "lake", + 0.7313575148582458 + ], + [ + "beach", + 0.04965415969491005 + ] + ], + "score": 0 + } + ] + }, + { + "index": 142, + "sentences": [ + { + "sentence": "Sam tried to paint a picture of tourists with sheep, but the _ ended up looking more like golfers.", + "answer1": [ + "sheep" + ], + "answer0": [ + "tourists" + ], + "correct_answer": [ + "tourists" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "sheep", + 0.48821017146110535 + ] + ], + "score": 0 + } + ] + }, + { + "index": 143, + "sentences": [ + { + "sentence": "Sam tried to paint a picture of tourists with sheep, but the _ ended up looking more like dogs.", + "answer1": [ + "sheep" + ], + "answer0": [ + "tourists" + ], + "correct_answer": [ + "sheep" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "sheep", + 0.6432616710662842 + ] + ], + "score": 1 + } + ] + }, + { + "index": 144, + "sentences": [ + { + "sentence": "Margaret tucked her son Brian into bed, so that _ could work.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.24223224818706512 + ], + [ + "he", + 0.22981160879135132 + ], + [ + "margaret", + 0.022159870713949203 + ] + ], + "score": 1 + } + ] + }, + { + "index": 145, + "sentences": [ + { + "sentence": "Wendy tucked her son Paul into bed, so that _ could sleep.", + "answer1": [ + "Paul", + "he" + ], + "answer0": [ + "Wendy", + "she" + ], + "correct_answer": [ + "Paul", + "he" + ], + 
"adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8820493817329407 + ], + [ + "paul", + 0.020039597526192665 + ], + [ + "she", + 0.0034701882395893335 + ] + ], + "score": 1 + } + ] + }, + { + "index": 146, + "sentences": [] + }, + { + "index": 147, + "sentences": [] + }, + { + "index": 148, + "sentences": [ + { + "sentence": "Sarah visited George's grave in 1765. At that date _ had been travelling for five years.", + "answer1": [ + "George", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "Sarah", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.5088940262794495 + ], + [ + "he", + 0.20025378465652466 + ], + [ + "george", + 0.07865126430988312 + ], + [ + "sarah", + 0.0399525985121727 + ] + ], + "score": 1 + }, + { + "sentence": "Steven visited Helen's grave in 1765. At that date _ had been travelling for five years.", + "answer1": [ + "Helen", + "she" + ], + "answer0": [ + "Steven", + "he" + ], + "correct_answer": [ + "Steven", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5841214060783386 + ], + [ + "she", + 0.15333987772464752 + ], + [ + "steven", + 0.12337502837181091 + ], + [ + "helen", + 0.00795214157551527 + ] + ], + "score": 1 + } + ] + }, + { + "index": 149, + "sentences": [ + { + "sentence": "Paul visited Margaret's grave in 1765. At that date _ had been dead for five years.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Paul", + "he" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8265026211738586 + ], + [ + "margaret", + 0.11342242360115051 + ], + [ + "he", + 0.023298852145671844 + ], + [ + "paul", + 0.0028004718478769064 + ] + ], + "score": 1 + }, + { + "sentence": "Sandra visited Zack's grave in 1765. 
At that date _ had been dead for five years.", + "answer1": [ + "Zack", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Zack", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5330711007118225 + ], + [ + "zack", + 0.4511242210865021 + ], + [ + "she", + 0.0025638267397880554 + ] + ], + "score": 1 + } + ] + }, + { + "index": 150, + "sentences": [ + { + "sentence": "Vivian was greatly influenced by Steven, though _ lived two centuries later.", + "answer1": [ + "Steven", + "he" + ], + "answer0": [ + "Vivian", + "she" + ], + "correct_answer": [ + "Vivian", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.6800739765167236 + ], + [ + "she", + 0.0814879834651947 + ], + [ + "steven", + 0.0334942601621151 + ], + [ + "vivian", + 0.00919678620994091 + ] + ], + "score": 0 + }, + { + "sentence": "Anthony was greatly influenced by Sandra, though _ lived two centuries later.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.5291610956192017 + ], + [ + "he", + 0.24366435408592224 + ], + [ + "anthony", + 0.07931230962276459 + ], + [ + "sandra", + 0.014960510656237602 + ] + ], + "score": 0 + } + ] + }, + { + "index": 151, + "sentences": [ + { + "sentence": "Warren was greatly influenced by Mandy, though _ lived two centuries earlier.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.6331958770751953 + ], + [ + "he", + 0.14606720209121704 + ], + [ + "warren", + 0.010313055478036404 + ] + ], + "score": 1 + }, + { + "sentence": "Lisa was greatly influenced by David, though _ lived two centuries earlier.", + "answer1": [ + "David", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "David", + "he" + ], + 
"adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6384512782096863 + ], + [ + "she", + 0.16565637290477753 + ], + [ + "david", + 0.0723688080906868 + ], + [ + "lisa", + 0.006370751652866602 + ] + ], + "score": 1 + } + ] + }, + { + "index": 152, + "sentences": [ + { + "sentence": "I can't cut that tree down with that axe; the _ is too thick.", + "answer1": [ + "axe" + ], + "answer0": [ + "tree" + ], + "correct_answer": [ + "tree" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "tree", + 0.025083722546696663 + ] + ], + "score": 1 + } + ] + }, + { + "index": 153, + "sentences": [ + { + "sentence": "I can't cut that tree down with that axe; the _ is too small.", + "answer1": [ + "axe" + ], + "answer0": [ + "tree" + ], + "correct_answer": [ + "axe" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "tree", + 0.4419287145137787 + ] + ], + "score": 0 + } + ] + }, + { + "index": 154, + "sentences": [ + { + "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to kill the _ .", + "answer1": [ + "chickens" + ], + "answer0": [ + "foxes" + ], + "correct_answer": [ + "foxes" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "chickens", + 0.22372716665267944 + ], + [ + "foxes", + 0.1705324798822403 + ] + ], + "score": 0 + } + ] + }, + { + "index": 155, + "sentences": [ + { + "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to guard the _ .", + "answer1": [ + "chickens" + ], + "answer0": [ + "foxes" + ], + "correct_answer": [ + "chickens" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "chickens", + 0.3232119679450989 + ] + ], + "score": 1 + } + ] + }, + { + "index": 156, + "sentences": [ + { + "sentence": "The foxes are getting in at night and attacking the chickens. 
the _ have gotten very bold.", + "answer1": [ + "chickens" + ], + "answer0": [ + "foxes" + ], + "correct_answer": [ + "foxes" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "foxes", + 0.8538748025894165 + ] + ], + "score": 1 + } + ] + }, + { + "index": 157, + "sentences": [ + { + "sentence": "The foxes are getting in at night and attacking the chickens. the _ have gotten very nervous.", + "answer1": [ + "chickens" + ], + "answer0": [ + "foxes" + ], + "correct_answer": [ + "chickens" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "foxes", + 0.24852731823921204 + ], + [ + "chickens", + 0.1355128437280655 + ] + ], + "score": 0 + } + ] + }, + { + "index": 158, + "sentences": [ + { + "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. He opened his _ when the wind stopped.", + "answer1": [ + "hands" + ], + "answer0": [ + "eyes" + ], + "correct_answer": [ + "eyes" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "eyes", + 0.9919928312301636 + ] + ], + "score": 1 + } + ] + }, + { + "index": 159, + "sentences": [ + { + "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. 
He lowered his _ when the wind stopped.", + "answer1": [ + "hands" + ], + "answer0": [ + "eyes" + ], + "correct_answer": [ + "hands" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "eyes", + 0.07590554654598236 + ], + [ + "hands", + 0.04639185965061188 + ] + ], + "score": 0 + } + ] + }, + { + "index": 160, + "sentences": [ + { + "sentence": "The actress used to be named Christina, but she changed it to Tina a few years ago, because she figured the _ was too hard to pronounce.", + "answer1": [ + "Tina" + ], + "answer0": [ + "Christina" + ], + "correct_answer": [ + "Christina" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 161, + "sentences": [ + { + "sentence": "The actress used to be named Christina, but she changed it to Tina a few years ago, because she figured the _ was easier to pronounce.", + "answer1": [ + "Tina" + ], + "answer0": [ + "Christina" + ], + "correct_answer": [ + "Tina" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 162, + "sentences": [ + { + "sentence": "George watched TV while Amy went out to buy groceries. After an hour _ got up.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "George", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "george", + 0.4222693145275116 + ], + [ + "she", + 0.17023402452468872 + ], + [ + "he", + 0.14838194847106934 + ], + [ + "amy", + 0.09143561124801636 + ] + ], + "score": 1 + }, + { + "sentence": "Helen watched TV while Paul went out to buy groceries. 
After an hour _ got up.", + "answer1": [ + "Paul", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Helen", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "helen", + 0.3572254478931427 + ], + [ + "she", + 0.18161025643348694 + ], + [ + "he", + 0.17783011496067047 + ], + [ + "paul", + 0.12632130086421967 + ] + ], + "score": 1 + } + ] + }, + { + "index": 163, + "sentences": [ + { + "sentence": "Sue watched TV while Richard went out to buy groceries. After an hour _ got back.", + "answer1": [ + "Richard", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Richard", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.36730772256851196 + ], + [ + "richard", + 0.24268296360969543 + ], + [ + "she", + 0.0852513387799263 + ], + [ + "sue", + 0.0603502094745636 + ] + ], + "score": 1 + }, + { + "sentence": "Steven watched TV while Lisa went out to buy groceries. After an hour _ got back.", + "answer1": [ + "Lisa", + "she" + ], + "answer0": [ + "Steven", + "he" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.2513202726840973 + ], + [ + "lisa", + 0.21226151287555695 + ], + [ + "he", + 0.15152567625045776 + ], + [ + "steven", + 0.06623411923646927 + ] + ], + "score": 1 + } + ] + }, + { + "index": 164, + "sentences": [ + { + "sentence": "Fred was supposed to run the oven, but he put it off, because he wanted to watch TV. But the show turned out to be boring, so he changed his mind and turned the _ on.", + "answer1": [ + "TV" + ], + "answer0": [ + "oven" + ], + "correct_answer": [ + "oven" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "oven", + 0.33234283328056335 + ], + [ + "tv", + 0.24399203062057495 + ] + ], + "score": 1 + } + ] + }, + { + "index": 165, + "sentences": [ + { + "sentence": "Fred was supposed to run the oven, but he put it off, because he wanted to watch TV. 
But the show turned out to be boring, so he changed his mind and turned the _ off.", + "answer1": [ + "TV" + ], + "answer0": [ + "oven" + ], + "correct_answer": [ + "TV" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "oven", + 0.5511050224304199 + ], + [ + "tv", + 0.14227940142154694 + ] + ], + "score": 0 + } + ] + }, + { + "index": 166, + "sentences": [ + { + "sentence": "Tony is the only man still alive who remembers my great-grandmother Jennifer. _ is remarkable.", + "answer1": [ + "Jennifer", + "she" + ], + "answer0": [ + "Tony", + "he" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.31883949041366577 + ], + [ + "he", + 0.2727121412754059 + ] + ], + "score": 0 + } + ] + }, + { + "index": 167, + "sentences": [ + { + "sentence": "Tony is the only man still alive who remembers my great-grandmother Mandy. _ was remarkable.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "Tony", + "he" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.7409354448318481 + ], + [ + "mandy", + 0.02154599316418171 + ], + [ + "he", + 0.008621525950729847 + ] + ], + "score": 1 + } + ] + }, + { + "index": 168, + "sentences": [ + { + "sentence": "Daniel is the only man alive who still remembers my mother Wendy as an infant. When Daniel first saw my mother, _ was twelve years old.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.721801221370697 + ], + [ + "he", + 0.14972272515296936 + ], + [ + "daniel", + 0.004093638621270657 + ], + [ + "wendy", + 0.003174440236762166 + ] + ], + "score": 0 + } + ] + }, + { + "index": 169, + "sentences": [ + { + "sentence": "Paul is the only man alive who still remembers my mother Lisa as an infant. 
When Paul first saw my mother, _ was twelve months old.", + "answer1": [ + "Lisa", + "she" + ], + "answer0": [ + "Paul", + "he" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8270816802978516 + ], + [ + "he", + 0.06970749795436859 + ], + [ + "paul", + 0.006479764357209206 + ], + [ + "lisa", + 0.003468103241175413 + ] + ], + "score": 1 + } + ] + }, + { + "index": 170, + "sentences": [ + { + "sentence": "In July, Spain declared war on Italy. Since Italy's army was much better equipped and ten times larger, the _ were defeated within weeks.", + "answer1": [ + "Italy" + ], + "answer0": [ + "Spain" + ], + "correct_answer": [ + "Spain" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 171, + "sentences": [ + { + "sentence": "In July, France declared war on Russia. Since Russia's army was much better equipped and ten times larger, the _ were victorious within weeks.", + "answer1": [ + "Russia" + ], + "answer0": [ + "France" + ], + "correct_answer": [ + "Russia" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 172, + "sentences": [ + { + "sentence": "Look! There is a fish swimming right below that duck! the _ had better get away to safety fast!", + "answer1": [ + "duck" + ], + "answer0": [ + "fish" + ], + "correct_answer": [ + "fish" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "fish", + 0.7521501183509827 + ], + [ + "duck", + 0.09160765260457993 + ] + ], + "score": 1 + } + ] + }, + { + "index": 173, + "sentences": [ + { + "sentence": "Look! There is a shark swimming right below that duck! 
the _ had better get away to safety fast!", + "answer1": [ + "duck" + ], + "answer0": [ + "shark" + ], + "correct_answer": [ + "duck" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "shark", + 0.39164668321609497 + ], + [ + "duck", + 0.049340344965457916 + ] + ], + "score": 0 + } + ] + }, + { + "index": 174, + "sentences": [] + }, + { + "index": 175, + "sentences": [] + }, + { + "index": 176, + "sentences": [ + { + "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. the _ began two years ago.", + "answer1": [ + "fish" + ], + "answer0": [ + "scientists" + ], + "correct_answer": [ + "scientists" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 177, + "sentences": [ + { + "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. the _ appeared two years ago.", + "answer1": [ + "fish" + ], + "answer0": [ + "scientists" + ], + "correct_answer": [ + "fish" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "fish", + 0.014301100745797157 + ] + ], + "score": 1 + } + ] + }, + { + "index": 178, + "sentences": [ + { + "sentence": "The journalists interviewed the stars of the new movie. the _ were very persistent, so the interview lasted for a long time.", + "answer1": [ + "stars" + ], + "answer0": [ + "journalists" + ], + "correct_answer": [ + "journalists" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "journalists", + 0.06059985235333443 + ], + [ + "stars", + 0.04318194463849068 + ] + ], + "score": 1 + } + ] + }, + { + "index": 179, + "sentences": [ + { + "sentence": "The journalists interviewed the stars of the new movie. 
the _ were very cooperative, so the interview lasted for a long time.", + "answer1": [ + "stars" + ], + "answer0": [ + "journalists" + ], + "correct_answer": [ + "stars" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "journalists", + 0.06477633118629456 + ] + ], + "score": 0 + } + ] + }, + { + "index": 180, + "sentences": [ + { + "sentence": "The police arrested all of the students. the _ were trying to stop the drug trade in the neighborhood.", + "answer1": [ + "students" + ], + "answer0": [ + "police" + ], + "correct_answer": [ + "police" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "police", + 0.8427444100379944 + ], + [ + "students", + 0.08074833452701569 + ] + ], + "score": 1 + } + ] + }, + { + "index": 181, + "sentences": [ + { + "sentence": "The police arrested all of the students. the _ were trying to run the drug trade in the neighborhood.", + "answer1": [ + "students" + ], + "answer0": [ + "police" + ], + "correct_answer": [ + "students" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "police", + 0.5480837821960449 + ], + [ + "students", + 0.25696301460266113 + ] + ], + "score": 0 + } + ] + }, + { + "index": 182, + "sentences": [ + { + "sentence": "I put the cake away in the refrigerator. the _ has a lot of butter in it.", + "answer1": [ + "refrigerator" + ], + "answer0": [ + "cake" + ], + "correct_answer": [ + "cake" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "cake", + 0.3875311017036438 + ] + ], + "score": 1 + } + ] + }, + { + "index": 183, + "sentences": [ + { + "sentence": "I put the cake away in the refrigerator. the _ has a lot of leftovers in it.", + "answer1": [ + "refrigerator" + ], + "answer0": [ + "cake" + ], + "correct_answer": [ + "refrigerator" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "refrigerator", + 0.04797044396400452 + ] + ], + "score": 1 + } + ] + }, + { + "index": 184, + "sentences": [ + { + "sentence": "Sam broke both his ankles and he's walking with cane. 
But a month or so from now the _ should be better.", + "answer1": [ + "cane" + ], + "answer0": [ + "ankles" + ], + "correct_answer": [ + "ankles" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 185, + "sentences": [ + { + "sentence": "Sam broke both his ankles and he's walking with cane. But a month or so from now the _ should be unnecessary.", + "answer1": [ + "cane" + ], + "answer0": [ + "ankles" + ], + "correct_answer": [ + "cane" + ], + "adjacent_ref": true, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 186, + "sentences": [ + { + "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. the _ were very much in the minority.", + "answer1": [ + "opponents" + ], + "answer0": [ + "sponsors" + ], + "correct_answer": [ + "sponsors" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "opponents", + 0.1925417184829712 + ] + ], + "score": 0 + } + ] + }, + { + "index": 187, + "sentences": [ + { + "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. the _ were very much in the majority.", + "answer1": [ + "opponents" + ], + "answer0": [ + "sponsors" + ], + "correct_answer": [ + "opponents" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "opponents", + 0.2790956199169159 + ] + ], + "score": 1 + } + ] + }, + { + "index": 188, + "sentences": [ + { + "sentence": "Everyone really loved the cake; only a few people liked the cookies. Next time, we should make more of the _ .", + "answer1": [ + "cookies" + ], + "answer0": [ + "cake" + ], + "correct_answer": [ + "cake" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "cookies", + 0.34039416909217834 + ], + [ + "cake", + 0.1966356337070465 + ] + ], + "score": 0 + } + ] + }, + { + "index": 189, + "sentences": [ + { + "sentence": "Everyone really loved the cake; only a few people liked the cookies. 
Next time, we should make fewer of the _ .", + "answer1": [ + "cookies" + ], + "answer0": [ + "cake" + ], + "correct_answer": [ + "cookies" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "cookies", + 0.7632879018783569 + ] + ], + "score": 1 + } + ] + }, + { + "index": 190, + "sentences": [ + { + "sentence": "We had hoped to place books on all the chairs in the auditorium, but there were simply not enough of the _ .", + "answer1": [ + "chairs" + ], + "answer0": [ + "books" + ], + "correct_answer": [ + "books" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "books", + 0.7032318115234375 + ], + [ + "chairs", + 0.02748037874698639 + ] + ], + "score": 1 + } + ] + }, + { + "index": 191, + "sentences": [ + { + "sentence": "We had hoped to place books on all the chairs in the auditorium, but there were simply too many of the _ .", + "answer1": [ + "chairs" + ], + "answer0": [ + "books" + ], + "correct_answer": [ + "chairs" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "chairs", + 0.31335213780403137 + ], + [ + "books", + 0.28027427196502686 + ] + ], + "score": 1 + } + ] + }, + { + "index": 192, + "sentences": [ + { + "sentence": "I stuck a pin through a carrot. When I pulled the pin out, the _ left a hole.", + "answer1": [ + "carrot" + ], + "answer0": [ + "pin" + ], + "correct_answer": [ + "pin" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "pin", + 0.24921917915344238 + ], + [ + "carrot", + 0.02873871475458145 + ] + ], + "score": 1 + } + ] + }, + { + "index": 193, + "sentences": [ + { + "sentence": "I stuck a pin through a carrot. 
When I pulled the pin out, the _ had a hole.", + "answer1": [ + "carrot" + ], + "answer0": [ + "pin" + ], + "correct_answer": [ + "carrot" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "carrot", + 0.7155035734176636 + ], + [ + "pin", + 0.04343157261610031 + ] + ], + "score": 1 + } + ] + }, + { + "index": 194, + "sentences": [ + { + "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because the _ got full of coffee.", + "answer1": [ + "coffee" + ], + "answer0": [ + "pen" + ], + "correct_answer": [ + "pen" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 195, + "sentences": [ + { + "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because the _ got full of ink.", + "answer1": [ + "coffee" + ], + "answer0": [ + "pen" + ], + "correct_answer": [ + "coffee" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "pen", + 0.07221017777919769 + ] + ], + "score": 0 + } + ] + }, + { + "index": 196, + "sentences": [ + { + "sentence": "Alan follows Emma's example in everything. _ admires her hugely.", + "answer1": [ + "Emma", + "she" + ], + "answer0": [ + "Alan", + "he" + ], + "correct_answer": [ + "Alan", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.44828981161117554 + ], + [ + "alan", + 0.3871767222881317 + ], + [ + "emma", + 0.0232782494276762 + ], + [ + "she", + 0.01241056714206934 + ] + ], + "score": 1 + }, + { + "sentence": "Donna follows Jason's example in everything. 
_ admires him hugely.", + "answer1": [ + "Jason", + "he" + ], + "answer0": [ + "Donna", + "she" + ], + "correct_answer": [ + "Donna", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.9227645397186279 + ], + [ + "donna", + 0.03128309175372124 + ], + [ + "he", + 0.003405218245461583 + ] + ], + "score": 1 + } + ] + }, + { + "index": 197, + "sentences": [ + { + "sentence": "Vivian follows Thomas's example in everything. _ influences her hugely.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Vivian", + "she" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6024318933486938 + ], + [ + "thomas", + 0.21580076217651367 + ], + [ + "vivian", + 0.005271642003208399 + ] + ], + "score": 1 + }, + { + "sentence": "Richard follows Nancy's example in everything. _ influences him hugely.", + "answer1": [ + "Nancy", + "she" + ], + "answer0": [ + "Richard", + "he" + ], + "correct_answer": [ + "Nancy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "nancy", + 0.7275211215019226 + ], + [ + "she", + 0.23386697471141815 + ] + ], + "score": 1 + } + ] + }, + { + "index": 198, + "sentences": [ + { + "sentence": "The table won't fit through the doorway because the _ is too wide.", + "answer1": [ + "doorway" + ], + "answer0": [ + "table" + ], + "correct_answer": [ + "table" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "table", + 0.07816523313522339 + ] + ], + "score": 1 + } + ] + }, + { + "index": 199, + "sentences": [ + { + "sentence": "The table won't fit through the doorway because the _ is too narrow.", + "answer1": [ + "doorway" + ], + "answer0": [ + "table" + ], + "correct_answer": [ + "doorway" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "table", + 0.06917010992765427 + ] + ], + "score": 0 + } + ] + }, + { + "index": 200, + "sentences": [ + { + "sentence": "Grace was happy to trade me her sweater for my jacket. 
She thinks the _ looks dowdy on her.", + "answer1": [ + "jacket" + ], + "answer0": [ + "sweater" + ], + "correct_answer": [ + "sweater" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "sweater", + 0.2597585618495941 + ], + [ + "jacket", + 0.2361963987350464 + ] + ], + "score": 1 + } + ] + }, + { + "index": 201, + "sentences": [ + { + "sentence": "Grace was happy to trade me her sweater for my jacket. She thinks the _ looks great on her.", + "answer1": [ + "jacket" + ], + "answer0": [ + "sweater" + ], + "correct_answer": [ + "jacket" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "sweater", + 0.29831069707870483 + ], + [ + "jacket", + 0.158711239695549 + ] + ], + "score": 0 + } + ] + }, + { + "index": 202, + "sentences": [ + { + "sentence": "Helen hired Jason to take care of _ .", + "answer1": [ + "Jason", + "him" + ], + "answer0": [ + "Helen", + "her" + ], + "correct_answer": [ + "Helen", + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.3049941062927246 + ], + [ + "him", + 0.028502047061920166 + ] + ], + "score": 1 + }, + { + "sentence": "Tim hired Lisa to take care of _ .", + "answer1": [ + "Lisa", + "her" + ], + "answer0": [ + "Tim", + "him" + ], + "correct_answer": [ + "Tim", + "him" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "him", + 0.21020644903182983 + ], + [ + "her", + 0.04846358299255371 + ] + ], + "score": 1 + } + ] + }, + { + "index": 203, + "sentences": [ + { + "sentence": "Emma hired herself out to Anthony to take care of _ .", + "answer1": [ + "Anthony", + "him" + ], + "answer0": [ + "Emma", + "her" + ], + "correct_answer": [ + "Anthony", + "him" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "him", + 0.17034678161144257 + ], + [ + "her", + 0.08058315515518188 + ] + ], + "score": 1 + }, + { + "sentence": "Daniel hired himself out to Emma to take care of _ .", + "answer1": [ + "Emma", + "her" + ], + "answer0": [ + "Daniel", + "him" + ], + "correct_answer": [ + "Emma", + "her" + ], 
+ "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.21644917130470276 + ], + [ + "him", + 0.043456241488456726 + ] + ], + "score": 1 + } + ] + }, + { + "index": 204, + "sentences": [ + { + "sentence": "Sandra promised Edward to leave, so an hour later _ left.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.4670681357383728 + ], + [ + "she", + 0.18557576835155487 + ], + [ + "edward", + 0.14714860916137695 + ] + ], + "score": 0 + }, + { + "sentence": "Thomas promised Betty to leave, so an hour later _ left.", + "answer1": [ + "Betty", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.47173383831977844 + ], + [ + "betty", + 0.22963103652000427 + ], + [ + "he", + 0.14014899730682373 + ], + [ + "thomas", + 0.0091405613347888 + ] + ], + "score": 0 + } + ] + }, + { + "index": 205, + "sentences": [ + { + "sentence": "William ordered Mandy to leave, so an hour later _ left.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "William", + "he" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.2989388108253479 + ], + [ + "he", + 0.23829956352710724 + ], + [ + "mandy", + 0.20623540878295898 + ] + ], + "score": 1 + }, + { + "sentence": "Amy ordered Warren to leave, so an hour later _ left.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Amy", + "she" + ], + "correct_answer": [ + "Warren", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.4589509963989258 + ], + [ + "warren", + 0.33637410402297974 + ], + [ + "she", + 0.05308223143219948 + ] + ], + "score": 1 + } + ] + }, + { + "index": 206, + "sentences": [ + { + "sentence": "Thomas's biography of Emma conveys a vivid sense of the difficulties _ faced in his 
research.", + "answer1": [ + "Emma", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.38616475462913513 + ], + [ + "she", + 0.19750751554965973 + ], + [ + "emma", + 0.14758290350437164 + ], + [ + "thomas", + 0.08441972732543945 + ] + ], + "score": 1 + }, + { + "sentence": "Jennifer's biography of George conveys a vivid sense of the difficulties _ faced in her research.", + "answer1": [ + "George", + "he" + ], + "answer0": [ + "Jennifer", + "she" + ], + "correct_answer": [ + "Jennifer", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.7952712178230286 + ], + [ + "he", + 0.025763899087905884 + ], + [ + "george", + 0.025489602237939835 + ], + [ + "jennifer", + 0.00788148120045662 + ] + ], + "score": 1 + } + ] + }, + { + "index": 207, + "sentences": [ + { + "sentence": "Emma's biography of Alan conveys a vivid sense of the difficulties _ faced in his childhood.", + "answer1": [ + "Alan", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Alan", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "alan", + 0.5420495271682739 + ], + [ + "he", + 0.4210248291492462 + ] + ], + "score": 1 + }, + { + "sentence": "Steven's biography of Margaret conveys a vivid sense of the difficulties _ faced in her childhood.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Steven", + "he" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.6349072456359863 + ], + [ + "margaret", + 0.29832860827445984 + ] + ], + "score": 1 + } + ] + }, + { + "index": 208, + "sentences": [ + { + "sentence": "Wendy's father Anthony had died long ago, and _ education had been managed by an excellent woman as governess.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + 
"her", + 0.9348182082176208 + ], + [ + "his", + 0.011819146573543549 + ] + ], + "score": 1 + } + ] + }, + { + "index": 209, + "sentences": [ + { + "sentence": "Sandra's father Tim had died long ago, and _ place had been taken by an excellent woman as governess.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.3821287453174591 + ], + [ + "her", + 0.32249218225479126 + ] + ], + "score": 1 + } + ] + }, + { + "index": 210, + "sentences": [ + { + "sentence": "Anthony knocked on Betty's door but _ did not get an answer.", + "answer1": [ + "Betty", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.9353936910629272 + ], + [ + "she", + 0.014528683386743069 + ], + [ + "anthony", + 0.0021569491364061832 + ] + ], + "score": 1 + }, + { + "sentence": "Betty knocked on Thomas's door but _ did not get an answer.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Betty", + "she" + ], + "correct_answer": [ + "Betty", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.8701193928718567 + ], + [ + "he", + 0.04268043488264084 + ], + [ + "thomas", + 0.004556684289127588 + ] + ], + "score": 1 + } + ] + }, + { + "index": 211, + "sentences": [ + { + "sentence": "John knocked on Susan's door but _ did not answer.", + "answer1": [ + "Susan", + "she" + ], + "answer0": [ + "John", + "he" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9418527483940125 + ], + [ + "susan", + 0.05125227943062782 + ], + [ + "he", + 0.0006008930504322052 + ] + ], + "score": 1 + }, + { + "sentence": "Emma knocked on Charles's door but _ did not answer.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": true, + 
"predict_answer": [ + [ + "he", + 0.9730058908462524 + ], + [ + "charles", + 0.02500162087380886 + ] + ], + "score": 1 + } + ] + }, + { + "index": 212, + "sentences": [ + { + "sentence": "Zack paid the detective Susan after _ received the final report on the case.", + "answer1": [ + "Susan", + "she" + ], + "answer0": [ + "Zack", + "he" + ], + "correct_answer": [ + "Zack", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.6660929918289185 + ], + [ + "he", + 0.2357940971851349 + ], + [ + "zack", + 0.012386161834001541 + ], + [ + "susan", + 0.004307710099965334 + ] + ], + "score": 0 + }, + { + "sentence": "Anna paid the detective Daniel after _ received the final report on the case.", + "answer1": [ + "Daniel", + "he" + ], + "answer0": [ + "Anna", + "she" + ], + "correct_answer": [ + "Anna", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.6424099206924438 + ], + [ + "she", + 0.2567521929740906 + ], + [ + "anna", + 0.010955804027616978 + ] + ], + "score": 0 + } + ] + }, + { + "index": 213, + "sentences": [ + { + "sentence": "Brian paid the detective Margaret after _ delivered the final report on the case.", + "answer1": [ + "Margaret", + "she" + ], + "answer0": [ + "Brian", + "he" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8936680555343628 + ], + [ + "he", + 0.036272548139095306 + ], + [ + "margaret", + 0.009478574618697166 + ], + [ + "brian", + 0.005012798588722944 + ] + ], + "score": 1 + }, + { + "sentence": "Helen paid the detective Warren after _ delivered the final report on the case.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Warren", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.759939432144165 + ], + [ + "she", + 0.13307538628578186 + ], + [ + "warren", + 0.012190349400043488 + ], + [ + "helen", + 0.008577443659305573 + ] + ], + "score": 1 + } + ] 
+ }, + { + "index": 214, + "sentences": [ + { + "sentence": "Lisa didn't get angry with Thomas, who had cut her off, because _ stopped and counted to ten.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Lisa", + "she" + ], + "correct_answer": [ + "Lisa", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.7895767092704773 + ], + [ + "he", + 0.11457893252372742 + ], + [ + "lisa", + 0.03027874045073986 + ] + ], + "score": 1 + }, + { + "sentence": "Edward didn't get angry with Mandy, who had cut his off, because _ stopped and counted to ten.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "Edward", + "he" + ], + "correct_answer": [ + "Edward", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8440133929252625 + ], + [ + "she", + 0.03659467399120331 + ], + [ + "edward", + 0.018313562497496605 + ] + ], + "score": 1 + } + ] + }, + { + "index": 215, + "sentences": [ + { + "sentence": "Jason didn't get angry with Linda, who had cut his off, because _ stopped and apologized.", + "answer1": [ + "Linda", + "she" + ], + "answer0": [ + "Jason", + "he" + ], + "correct_answer": [ + "Linda", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.4764364957809448 + ], + [ + "she", + 0.37486517429351807 + ], + [ + "linda", + 0.0429224893450737 + ], + [ + "jason", + 0.030886851251125336 + ] + ], + "score": 0 + }, + { + "sentence": "Sue didn't get angry with Brian, who had cut her off, because _ stopped and apologized.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Brian", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5377354621887207 + ], + [ + "she", + 0.31687992811203003 + ], + [ + "brian", + 0.037937115877866745 + ], + [ + "sue", + 0.014928928576409817 + ] + ], + "score": 1 + } + ] + }, + { + "index": 216, + "sentences": [ + { + "sentence": "William signaled Susan and gestured toward _ empty glass", + 
"answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.274255633354187 + ], + [ + "her", + 0.10120938718318939 + ] + ], + "score": 1 + }, + { + "sentence": "Sue signaled Thomas and gestured toward _ empty glass", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.28291237354278564 + ], + [ + "her", + 0.14910736680030823 + ] + ], + "score": 0 + } + ] + }, + { + "index": 217, + "sentences": [ + { + "sentence": "David signaled Emma and gestured toward _ bathroom key.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.08032070845365524 + ], + [ + "her", + 0.03398525342345238 + ] + ], + "score": 0 + }, + { + "sentence": "Amy signaled Anthony and gestured toward _ bathroom key.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.095477394759655 + ], + [ + "his", + 0.02509940415620804 + ] + ], + "score": 0 + } + ] + }, + { + "index": 218, + "sentences": [ + { + "sentence": "Mandy took the rear seat while Joseph claimed the front because _ \"Dibs!\" was slow.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.4607595205307007 + ], + [ + "her", + 0.11643649637699127 + ] + ], + "score": 0 + }, + { + "sentence": "Charles took the rear seat while Helen claimed the front because _ \"Dibs!\" was slow.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.4100673198699951 + ], + [ + "his", + 0.139830082654953 + ] + ], + "score": 0 + } + ] + }, + { + "index": 219, + 
"sentences": [ + { + "sentence": "Jason took the rear seat while Vivian claimed the front because _ \"Dibs!\" was quicker.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.30036041140556335 + ], + [ + "his", + 0.1332821547985077 + ] + ], + "score": 1 + }, + { + "sentence": "Margaret took the rear seat while Tim claimed the front because _ \"Dibs!\" was quicker.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.3551889657974243 + ], + [ + "her", + 0.10693569481372833 + ] + ], + "score": 1 + } + ] + }, + { + "index": 220, + "sentences": [ + { + "sentence": "Mandy said \"Check\" to David as she moved _ bishop.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + }, + { + "sentence": "David said \"Check\" to Donna as he moved _ bishop.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 221, + "sentences": [ + { + "sentence": "Margaret said \"Check\" to Steven as she took _ bishop.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.045481689274311066 + ], + [ + "her", + 0.04472532123327255 + ] + ], + "score": 1 + }, + { + "sentence": "Anthony said \"Check\" to Sarah as he took _ bishop.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.04094506800174713 + ] + ], + "score": 0 + } + ] + }, + { + "index": 222, + "sentences": [ + { + "sentence": "As Sandra in the crop duster passed over Edward, _ could see the landing strip.", 
+ "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Sandra", + "she" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.23987726867198944 + ], + [ + "she", + 0.19285492599010468 + ], + [ + "edward", + 0.07019305974245071 + ] + ], + "score": 0 + }, + { + "sentence": "As George in the crop duster passed over Lisa, _ could see the landing strip.", + "answer1": [ + "Lisa", + "she" + ], + "answer0": [ + "George", + "he" + ], + "correct_answer": [ + "George", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "lisa", + 0.34604838490486145 + ], + [ + "she", + 0.21167317032814026 + ], + [ + "he", + 0.20678868889808655 + ] + ], + "score": 0 + } + ] + }, + { + "index": 223, + "sentences": [ + { + "sentence": "As Thomas in the crop duster passed over Susan, _ could see the landing gear.", + "answer1": [ + "Susan", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Susan", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.4135798215866089 + ], + [ + "she", + 0.15865713357925415 + ], + [ + "susan", + 0.1331108957529068 + ] + ], + "score": 0 + }, + { + "sentence": "As Jennifer in the crop duster passed over Thomas, _ could see the landing gear.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Jennifer", + "she" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.4583258628845215 + ], + [ + "she", + 0.15161868929862976 + ], + [ + "thomas", + 0.1475401371717453 + ] + ], + "score": 1 + } + ] + }, + { + "index": 224, + "sentences": [ + { + "sentence": "Richard gave Mandy a lift to school so _ wouldn't have to drive alone.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "Richard", + "he" + ], + "correct_answer": [ + "Richard", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.7928639054298401 + ], + [ + "he", + 0.06139402464032173 + ] + ], + 
"score": 0 + }, + { + "sentence": "Barbara gave Zack a lift to school so _ wouldn't have to drive alone.", + "answer1": [ + "Zack", + "he" + ], + "answer0": [ + "Barbara", + "she" + ], + "correct_answer": [ + "Barbara", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8156458735466003 + ], + [ + "zack", + 0.10156332701444626 + ], + [ + "she", + 0.025199728086590767 + ] + ], + "score": 0 + } + ] + }, + { + "index": 225, + "sentences": [ + { + "sentence": "Nancy gave Tony a lift to school so _ wouldn't have to walk.", + "answer1": [ + "Tony", + "he" + ], + "answer0": [ + "Nancy", + "she" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6183673143386841 + ], + [ + "tony", + 0.19011403620243073 + ] + ], + "score": 1 + }, + { + "sentence": "Edward gave Wendy a lift to school so _ wouldn't have to walk.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Edward", + "he" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8526315093040466 + ], + [ + "he", + 0.020969539880752563 + ] + ], + "score": 1 + } + ] + }, + { + "index": 226, + "sentences": [ + { + "sentence": "Margaret passed the half-empty plate to Anthony because _ was full.", + "answer1": [ + "Anthony", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.07039020210504532 + ], + [ + "she", + 0.0310561191290617 + ] + ], + "score": 0 + }, + { + "sentence": "Thomas passed the half-empty plate to Sarah because _ was full.", + "answer1": [ + "Sarah", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.05379188060760498 + ], + [ + "he", + 0.045447368174791336 + ] + ], + "score": 0 + } + ] + }, + { + "index": 227, + "sentences": [ + { + 
"sentence": "Helen passed the half-empty plate to Brian because _ was hungry.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Brian", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9241517782211304 + ], + [ + "he", + 0.06978235393762589 + ] + ], + "score": 0 + }, + { + "sentence": "Warren passed the half-empty plate to Jennifer because _ was hungry.", + "answer1": [ + "Jennifer", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Jennifer", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.6595138311386108 + ], + [ + "she", + 0.324116587638855 + ], + [ + "jennifer", + 0.0032959093805402517 + ] + ], + "score": 0 + } + ] + }, + { + "index": 228, + "sentences": [ + { + "sentence": "Nancy passed the gameboy to Brian because _ turn was over.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.26922306418418884 + ], + [ + "her", + 0.12208054959774017 + ] + ], + "score": 0 + }, + { + "sentence": "Paul passed the gameboy to Sandra because _ turn was over.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.26440608501434326 + ], + [ + "his", + 0.13414087891578674 + ] + ], + "score": 0 + } + ] + }, + { + "index": 229, + "sentences": [ + { + "sentence": "Zack passed the gameboy to Mandy because _ turn was next.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.4155150055885315 + ], + [ + "his", + 0.2723301649093628 + ] + ], + "score": 1 + }, + { + "sentence": "Vivian passed the gameboy to Paul because _ turn was next.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, 
+ "predict_answer": [ + [ + "his", + 0.5038822889328003 + ], + [ + "her", + 0.15782448649406433 + ] + ], + "score": 1 + } + ] + }, + { + "index": 230, + "sentences": [ + { + "sentence": "The man lifted the girl onto _ shoulders.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.9872719645500183 + ], + [ + "her", + 0.007788238115608692 + ] + ], + "score": 1 + } + ] + }, + { + "index": 231, + "sentences": [ + { + "sentence": "The man lifted the girl onto _ bunk bed.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.052843790501356125 + ], + [ + "his", + 0.028196819126605988 + ] + ], + "score": 1 + } + ] + }, + { + "index": 232, + "sentences": [ + { + "sentence": "Stretching _ back, the woman smiled at the boy.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.9602349996566772 + ], + [ + "his", + 0.0329311229288578 + ] + ], + "score": 1 + } + ] + }, + { + "index": 233, + "sentences": [ + { + "sentence": "Patting _ back, the woman smiled at the boy.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.931027889251709 + ], + [ + "her", + 0.051477789878845215 + ] + ], + "score": 1 + } + ] + }, + { + "index": 234, + "sentences": [ + { + "sentence": "William cried because Wendy wouldn't accept _ toy.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.2313782274723053 + ], + [ + "her", + 0.04676021263003349 + ] + ], + "score": 1 + }, + { + "sentence": "Sandra cried because Alan wouldn't accept _ toy.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + 
], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.2570785582065582 + ], + [ + "his", + 0.09622883796691895 + ] + ], + "score": 1 + } + ] + }, + { + "index": 235, + "sentences": [ + { + "sentence": "Thomas cried because Susan wouldn't share _ toy.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.28240013122558594 + ], + [ + "her", + 0.10179656744003296 + ] + ], + "score": 0 + }, + { + "sentence": "Vivian cried because George wouldn't share _ toy.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.25874245166778564 + ], + [ + "her", + 0.18479324877262115 + ] + ], + "score": 1 + } + ] + }, + { + "index": 236, + "sentences": [ + { + "sentence": "Edward spoke to Margaret, breaking _ silence.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.004641450475901365 + ], + [ + "his", + 0.00293560978025198 + ] + ], + "score": 0 + }, + { + "sentence": "Wendy spoke to Jason, breaking _ silence.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.0020169508643448353 + ], + [ + "her", + 0.0009562345803715289 + ] + ], + "score": 0 + } + ] + }, + { + "index": 237, + "sentences": [ + { + "sentence": "Tim spoke to Donna, breaking _ concentration.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.5251883268356323 + ], + [ + "his", + 0.34837156534194946 + ] + ], + "score": 1 + }, + { + "sentence": "Sue spoke to Richard, breaking _ concentration.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + 
"correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.7294524908065796 + ], + [ + "her", + 0.18369422852993011 + ] + ], + "score": 1 + } + ] + }, + { + "index": 238, + "sentences": [ + { + "sentence": "When Helen dropped his ice cream, David giggled, so father gave _ a sympathetic look.", + "answer1": [ + "David", + "him" + ], + "answer0": [ + "Helen", + "her" + ], + "correct_answer": [ + "Helen", + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.5764134526252747 + ], + [ + "him", + 0.25476816296577454 + ], + [ + "helen", + 0.02986416220664978 + ], + [ + "david", + 0.019353149458765984 + ] + ], + "score": 1 + }, + { + "sentence": "When Richard dropped his ice cream, Linda giggled, so father gave _ a sympathetic look.", + "answer1": [ + "Linda", + "her" + ], + "answer0": [ + "Richard", + "him" + ], + "correct_answer": [ + "Richard", + "him" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.8484610319137573 + ], + [ + "him", + 0.06817058473825455 + ], + [ + "linda", + 0.015336349606513977 + ] + ], + "score": 0 + } + ] + }, + { + "index": 239, + "sentences": [ + { + "sentence": "When Anna dropped his ice cream, Alan giggled, so father gave _ a stern look.", + "answer1": [ + "Alan", + "him" + ], + "answer0": [ + "Anna", + "her" + ], + "correct_answer": [ + "Alan", + "him" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.7019764184951782 + ], + [ + "him", + 0.139887273311615 + ], + [ + "anna", + 0.06648413836956024 + ], + [ + "alan", + 0.008537232875823975 + ] + ], + "score": 0 + }, + { + "sentence": "When Zack dropped his ice cream, Sue giggled, so father gave _ a stern look.", + "answer1": [ + "Sue", + "her" + ], + "answer0": [ + "Zack", + "him" + ], + "correct_answer": [ + "Sue", + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.7724469900131226 + ], + [ + "him", + 0.11578436195850372 + ], + [ + "zack", + 0.02440289407968521 + ] + ], + 
"score": 1 + } + ] + }, + { + "index": 240, + "sentences": [ + { + "sentence": "As Donna carried Jason up the long winding steps, _ legs ached.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.7337371110916138 + ], + [ + "his", + 0.2596256732940674 + ] + ], + "score": 1 + }, + { + "sentence": "As Warren carried Sarah up the long winding steps, _ legs ached.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.5732764601707458 + ], + [ + "her", + 0.4222981929779053 + ] + ], + "score": 1 + } + ] + }, + { + "index": 241, + "sentences": [ + { + "sentence": "As Helen carried George up the long winding steps, _ legs dangled.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.6082881689071655 + ], + [ + "her", + 0.37898463010787964 + ] + ], + "score": 1 + }, + { + "sentence": "As Tim carried Amy up the long winding steps, _ legs dangled.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.9631314277648926 + ], + [ + "his", + 0.011471277102828026 + ] + ], + "score": 1 + } + ] + }, + { + "index": 242, + "sentences": [ + { + "sentence": "The father carried the sleeping girl in _ arms", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "his", + 0.9978468418121338 + ], + [ + "her", + 0.000320874823955819 + ] + ], + "score": 1 + } + ] + }, + { + "index": 243, + "sentences": [ + { + "sentence": "The father carried the sleeping girl in _ bassinet.", + "answer1": [ + "her" + ], + "answer0": [ + "his" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": true, + 
"predict_answer": [ + [ + "his", + 0.19996562600135803 + ], + [ + "her", + 0.04508880525827408 + ] + ], + "score": 0 + } + ] + }, + { + "index": 244, + "sentences": [ + { + "sentence": "The woman held the boy against _ chest", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.9982455968856812 + ], + [ + "his", + 0.00023115644580684602 + ] + ], + "score": 1 + } + ] + }, + { + "index": 245, + "sentences": [ + { + "sentence": "The woman held the boy against _ will.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "his", + 0.945933997631073 + ], + [ + "her", + 0.05019375681877136 + ] + ], + "score": 1 + } + ] + }, + { + "index": 246, + "sentences": [] + }, + { + "index": 247, + "sentences": [] + }, + { + "index": 248, + "sentences": [ + { + "sentence": "Amy informed Brian that _ had retired and presented several options for future treatment.", + "answer1": [ + "Brian", + "he" + ], + "answer0": [ + "Amy", + "she" + ], + "correct_answer": [ + "Amy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.37087374925613403 + ], + [ + "she", + 0.3327399492263794 + ], + [ + "amy", + 0.015426949597895145 + ] + ], + "score": 0 + }, + { + "sentence": "Tim informed Linda that _ had retired and presented several options for future treatment.", + "answer1": [ + "Linda", + "she" + ], + "answer0": [ + "Tim", + "he" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5563133358955383 + ], + [ + "she", + 0.13216964900493622 + ], + [ + "tim", + 0.018639085814356804 + ] + ], + "score": 1 + } + ] + }, + { + "index": 249, + "sentences": [ + { + "sentence": "Brian informed Sandra that _ had cancer and presented several options for future treatment.", + "answer1": [ + "Sandra", + "she" + ], + "answer0": [ + 
"Brian", + "he" + ], + "correct_answer": [ + "Sandra", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5297118425369263 + ], + [ + "she", + 0.2422800213098526 + ], + [ + "brian", + 0.005271288100630045 + ] + ], + "score": 0 + }, + { + "sentence": "Helen informed Paul that _ had cancer and presented several options for future treatment.", + "answer1": [ + "Paul", + "he" + ], + "answer0": [ + "Helen", + "she" + ], + "correct_answer": [ + "Paul", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.4575677514076233 + ], + [ + "he", + 0.31825608015060425 + ], + [ + "paul", + 0.009974068962037563 + ], + [ + "helen", + 0.007482711225748062 + ] + ], + "score": 0 + } + ] + }, + { + "index": 250, + "sentences": [ + { + "sentence": "Daniel had to stop Jennifer from toying with the injured bird. _ is very compassionate.", + "answer1": [ + "Jennifer", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "jennifer", + 0.2903973460197449 + ], + [ + "she", + 0.22931936383247375 + ], + [ + "he", + 0.16045786440372467 + ], + [ + "daniel", + 0.08556315302848816 + ] + ], + "score": 0 + }, + { + "sentence": "Sarah had to stop Tim from toying with the injured bird. _ is very compassionate.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "Sarah", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "tim", + 0.29738715291023254 + ], + [ + "sarah", + 0.274274617433548 + ], + [ + "he", + 0.18724749982357025 + ], + [ + "she", + 0.10372157394886017 + ] + ], + "score": 0 + } + ] + }, + { + "index": 251, + "sentences": [ + { + "sentence": "Linda had to stop Charles from toying with the injured bird. 
_ is very cruel.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "charles", + 0.752829909324646 + ], + [ + "he", + 0.14354054629802704 + ], + [ + "she", + 0.02448778599500656 + ] + ], + "score": 1 + }, + { + "sentence": "John had to stop Amy from toying with the injured bird. _ is very cruel.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "John", + "he" + ], + "correct_answer": [ + "Amy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.28404945135116577 + ], + [ + "amy", + 0.20439286530017853 + ], + [ + "john", + 0.15464074909687042 + ], + [ + "she", + 0.14667406678199768 + ] + ], + "score": 0 + } + ] + }, + { + "index": 252, + "sentences": [ + { + "sentence": "Sarah got free tickets to the play, but she gave them to Charles, even though _ was particularly eager to see it.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Sarah", + "she" + ], + "correct_answer": [ + "Sarah", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8693797588348389 + ], + [ + "she", + 0.06461245566606522 + ], + [ + "charles", + 0.048112157732248306 + ], + [ + "sarah", + 0.0034832460805773735 + ] + ], + "score": 0 + }, + { + "sentence": "Daniel got free tickets to the play, but he gave them to Amy, even though _ was particularly eager to see it.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "Daniel", + "he" + ], + "correct_answer": [ + "Daniel", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.8007415533065796 + ], + [ + "he", + 0.08351122587919235 + ], + [ + "amy", + 0.07764580100774765 + ], + [ + "daniel", + 0.0045530772767961025 + ] + ], + "score": 0 + } + ] + }, + { + "index": 253, + "sentences": [ + { + "sentence": "Paul got free tickets to the play, but he gave them to Laura, because _ was particularly eager to see it.", + "answer1": [ + "Laura", + 
"she" + ], + "answer0": [ + "Paul", + "he" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.9574117064476013 + ], + [ + "laura", + 0.026944546028971672 + ], + [ + "he", + 0.008284908719360828 + ], + [ + "paul", + 0.0004410938418004662 + ] + ], + "score": 1 + }, + { + "sentence": "Margaret got free tickets to the play, but she gave them to Thomas, because _ was particularly eager to see it.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.9332942366600037 + ], + [ + "thomas", + 0.03162537142634392 + ], + [ + "she", + 0.018393704667687416 + ], + [ + "margaret", + 0.0010122362291440368 + ] + ], + "score": 1 + } + ] + }, + { + "index": 254, + "sentences": [ + { + "sentence": "Steven got free tickets to the play, but he gave them to Nancy, because _ was not particularly eager to see it.", + "answer1": [ + "Nancy", + "she" + ], + "answer0": [ + "Steven", + "he" + ], + "correct_answer": [ + "Steven", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.9163604378700256 + ], + [ + "nancy", + 0.0480802021920681 + ], + [ + "he", + 0.021462248638272285 + ], + [ + "steven", + 0.0004176660440862179 + ] + ], + "score": 0 + }, + { + "sentence": "Margaret got free tickets to the play, but she gave them to Warren, because _ was not particularly eager to see it.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.8333559036254883 + ], + [ + "warren", + 0.0700656995177269 + ], + [ + "she", + 0.06793110817670822 + ], + [ + "margaret", + 0.0037946717347949743 + ] + ], + "score": 0 + } + ] + }, + { + "index": 255, + "sentences": [ + { + "sentence": "Emma gave Edward candy because _ wasn't hungry.", + "answer1": [ + 
"Edward", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Emma", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.6763647794723511 + ], + [ + "she", + 0.26191577315330505 + ], + [ + "edward", + 0.023742079734802246 + ] + ], + "score": 0 + }, + { + "sentence": "Jason gave Betty candy because _ wasn't hungry.", + "answer1": [ + "Betty", + "she" + ], + "answer0": [ + "Jason", + "he" + ], + "correct_answer": [ + "Jason", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.75534987449646 + ], + [ + "he", + 0.1928008496761322 + ], + [ + "betty", + 0.03130156919360161 + ] + ], + "score": 0 + } + ] + }, + { + "index": 256, + "sentences": [ + { + "sentence": "Sue gave Tim candy because _ was hungry.", + "answer1": [ + "Tim", + "he" + ], + "answer0": [ + "Sue", + "she" + ], + "correct_answer": [ + "Tim", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.7316045165061951 + ], + [ + "she", + 0.22841434180736542 + ], + [ + "tim", + 0.025842225179076195 + ] + ], + "score": 1 + }, + { + "sentence": "Jason gave Jennifer candy because _ was hungry.", + "answer1": [ + "Jennifer", + "she" + ], + "answer0": [ + "Jason", + "he" + ], + "correct_answer": [ + "Jennifer", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5606254935264587 + ], + [ + "she", + 0.4133855998516083 + ], + [ + "jennifer", + 0.010734654031693935 + ], + [ + "jason", + 0.0010448332177475095 + ] + ], + "score": 0 + } + ] + }, + { + "index": 257, + "sentences": [ + { + "sentence": "I tried to paint a picture of an orchard, with apples in the trees, but the _ came out looking more like light bulbs.", + "answer1": [ + "trees" + ], + "answer0": [ + "apples" + ], + "correct_answer": [ + "apples" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "apples", + 0.5211644172668457 + ] + ], + "score": 1 + } + ] + }, + { + "index": 258, + "sentences": [ + { + "sentence": "I tried to paint a 
picture of an orchard, with apples in the trees, but the _ came out looking more like telephone poles.", + "answer1": [ + "trees" + ], + "answer0": [ + "apples" + ], + "correct_answer": [ + "trees" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "apples", + 0.3111271262168884 + ], + [ + "trees", + 0.0494505800306797 + ] + ], + "score": 0 + } + ] + }, + { + "index": 259, + "sentences": [ + { + "sentence": "Margaret asked Tony for a favor but _ was refused.", + "answer1": [ + "Tony", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.3978308141231537 + ], + [ + "tony", + 0.05781383812427521 + ], + [ + "she", + 0.05164272338151932 + ] + ], + "score": 0 + }, + { + "sentence": "Anthony asked Amy for a favor but _ was refused.", + "answer1": [ + "Amy", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Anthony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.4784584939479828 + ], + [ + "amy", + 0.06777182221412659 + ], + [ + "he", + 0.021794931963086128 + ] + ], + "score": 0 + } + ] + }, + { + "index": 260, + "sentences": [ + { + "sentence": "Emma asked Joseph for a favor but _ refused.", + "answer1": [ + "Joseph", + "he" + ], + "answer0": [ + "Emma", + "she" + ], + "correct_answer": [ + "Joseph", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "joseph", + 0.5529801249504089 + ], + [ + "he", + 0.43354299664497375 + ], + [ + "she", + 0.0008864352712407708 + ] + ], + "score": 1 + }, + { + "sentence": "Warren asked Laura for a favor but _ refused.", + "answer1": [ + "Laura", + "she" + ], + "answer0": [ + "Warren", + "he" + ], + "correct_answer": [ + "Laura", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.8105427026748657 + ], + [ + "laura", + 0.17909826338291168 + ] + ], + "score": 1 + } + ] + }, + { + "index": 261, + "sentences": [ + { + "sentence": 
"Mandy ceded the presidency to Warren because _ was less popular.", + "answer1": [ + "Warren", + "he" + ], + "answer0": [ + "Mandy", + "she" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.5756611824035645 + ], + [ + "warren", + 0.18201638758182526 + ], + [ + "she", + 0.006012094207108021 + ] + ], + "score": 0 + }, + { + "sentence": "Brian ceded the presidency to Wendy because _ was less popular.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Brian", + "he" + ], + "correct_answer": [ + "Brian", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.708384096622467 + ], + [ + "he", + 0.1238713338971138 + ], + [ + "wendy", + 0.04093684256076813 + ], + [ + "brian", + 0.01032228209078312 + ] + ], + "score": 0 + } + ] + }, + { + "index": 262, + "sentences": [ + { + "sentence": "Betty ceded the presidency to Edward because _ was more popular.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Betty", + "she" + ], + "correct_answer": [ + "Edward", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.5475196242332458 + ], + [ + "she", + 0.19534984230995178 + ], + [ + "edward", + 0.11332083493471146 + ], + [ + "betty", + 0.011675244197249413 + ] + ], + "score": 1 + }, + { + "sentence": "Anthony ceded the presidency to Mandy because _ was more popular.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "Anthony", + "he" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.7501657009124756 + ], + [ + "mandy", + 0.17399102449417114 + ], + [ + "he", + 0.03566601499915123 + ] + ], + "score": 1 + } + ] + }, + { + "index": 263, + "sentences": [ + { + "sentence": "Wendy did not pass the ball to Steven although _ saw that he was open.", + "answer1": [ + "Steven", + "he" + ], + "answer0": [ + "Wendy", + "she" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": false, + 
"predict_answer": [ + [ + "she", + 0.8957375288009644 + ], + [ + "wendy", + 0.028812197968363762 + ], + [ + "he", + 0.014717331156134605 + ] + ], + "score": 1 + }, + { + "sentence": "Tony did not pass the ball to Anna although _ saw that she was open.", + "answer1": [ + "Anna", + "she" + ], + "answer0": [ + "Tony", + "he" + ], + "correct_answer": [ + "Tony", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.931627631187439 + ], + [ + "tony", + 0.03243102505803108 + ], + [ + "she", + 0.006165023893117905 + ] + ], + "score": 1 + } + ] + }, + { + "index": 264, + "sentences": [ + { + "sentence": "Linda did not pass the ball to Zack although _ was open.", + "answer1": [ + "Zack", + "he" + ], + "answer0": [ + "Linda", + "she" + ], + "correct_answer": [ + "Zack", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.1047290489077568 + ], + [ + "she", + 0.035551343113183975 + ] + ], + "score": 1 + }, + { + "sentence": "David did not pass the ball to Mandy although _ was open.", + "answer1": [ + "Mandy", + "she" + ], + "answer0": [ + "David", + "he" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.10433197766542435 + ], + [ + "he", + 0.09989483654499054 + ] + ], + "score": 1 + } + ] + }, + { + "index": 265, + "sentences": [ + { + "sentence": "I put the specimen on the table and the _ broke.", + "answer1": [ + "table" + ], + "answer0": [ + "specimen" + ], + "correct_answer": [ + "specimen" + ], + "adjacent_ref": false, + "predict_answer": [], + "score": 0 + } + ] + }, + { + "index": 266, + "sentences": [ + { + "sentence": "I put the stone on the table and the _ broke.", + "answer1": [ + "table" + ], + "answer0": [ + "stone" + ], + "correct_answer": [ + "table" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "stone", + 0.10115251690149307 + ] + ], + "score": 0 + } + ] + }, + { + "index": 267, + "sentences": [ + { + "sentence": "Margaret fired her trainer 
Edward because _ couldn't stand his friend.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Margaret", + "she" + ], + "correct_answer": [ + "Margaret", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.731697142124176 + ], + [ + "she", + 0.17442825436592102 + ], + [ + "edward", + 0.03614525496959686 + ], + [ + "margaret", + 0.0032051822636276484 + ] + ], + "score": 0 + } + ] + }, + { + "index": 268, + "sentences": [ + { + "sentence": "Jennifer fired her trainer Thomas because _ slept with her friend.", + "answer1": [ + "Thomas", + "he" + ], + "answer0": [ + "Jennifer", + "she" + ], + "correct_answer": [ + "Thomas", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.8891029357910156 + ], + [ + "she", + 0.055782753974199295 + ], + [ + "thomas", + 0.013204317539930344 + ], + [ + "jennifer", + 0.0024945021141320467 + ] + ], + "score": 1 + } + ] + }, + { + "index": 269, + "sentences": [ + { + "sentence": "Sarah fired her trainer John because he slept with _ friend.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "her" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "her", + 0.7042596936225891 + ], + [ + "his", + 0.0390113890171051 + ] + ], + "score": 1 + } + ] + }, + { + "index": 270, + "sentences": [ + { + "sentence": "Amy fired her trainer George because she couldn't stand _ friend.", + "answer1": [ + "his" + ], + "answer0": [ + "her" + ], + "correct_answer": [ + "his" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "her", + 0.8472513556480408 + ], + [ + "his", + 0.0945512130856514 + ] + ], + "score": 0 + } + ] + }, + { + "index": 271, + "sentences": [ + { + "sentence": "Paul believed that Sue suspected that _ had stolen the watch.", + "answer1": [ + "Sue", + "she" + ], + "answer0": [ + "Paul", + "he" + ], + "correct_answer": [ + "Paul", + "he" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "he", + 0.23854734003543854 + ], + [ + "she", + 
0.1008545383810997 + ], + [ + "sue", + 0.07258161902427673 + ], + [ + "paul", + 0.06131039932370186 + ] + ], + "score": 1 + }, + { + "sentence": "Mandy believed that Edward suspected that _ had stolen the watch.", + "answer1": [ + "Edward", + "he" + ], + "answer0": [ + "Mandy", + "she" + ], + "correct_answer": [ + "Mandy", + "she" + ], + "adjacent_ref": false, + "predict_answer": [ + [ + "she", + 0.14960864186286926 + ], + [ + "he", + 0.12885059416294098 + ], + [ + "mandy", + 0.05671042948961258 + ], + [ + "edward", + 0.04502331465482712 + ] + ], + "score": 1 + } + ] + }, + { + "index": 272, + "sentences": [ + { + "sentence": "Thomas believed that Wendy regretted that _ had stolen the watch.", + "answer1": [ + "Wendy", + "she" + ], + "answer0": [ + "Thomas", + "he" + ], + "correct_answer": [ + "Wendy", + "she" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "she", + 0.707285463809967 + ], + [ + "he", + 0.11299549043178558 + ], + [ + "thomas", + 0.02341422811150551 + ], + [ + "wendy", + 0.011723235249519348 + ] + ], + "score": 1 + }, + { + "sentence": "Barbara believed that Charles regretted that _ had stolen the watch.", + "answer1": [ + "Charles", + "he" + ], + "answer0": [ + "Barbara", + "she" + ], + "correct_answer": [ + "Charles", + "he" + ], + "adjacent_ref": true, + "predict_answer": [ + [ + "he", + 0.9224457144737244 + ], + [ + "she", + 0.0191626138985157 + ], + [ + "charles", + 0.006952487863600254 + ], + [ + "barbara", + 0.0011777096660807729 + ] + ], + "score": 1 + } + ] + } +] \ No newline at end of file diff --git a/WSC_selected.txt b/WSC_selected.txt new file mode 100644 index 00000000000000..47c30cd309f331 --- /dev/null +++ b/WSC_selected.txt @@ -0,0 +1,8 @@ +The trophy doesn't fit into the brown suitcase because the [trophy] is too large. A because B +The trophy doesn't fit into the brown suitcase because the [suitcase] is too small. A because B +The brown suitcase doesn't hold the trophy because the [trophy] is too large. 
A because B +The brown suitcase doesn't hold the trophy because the [suitcase] is too small. A because B +The trophy can fit into the brown suitcase because the [trophy] is so small. ~A because ~B +The trophy can fit into the brown suitcase because the [suitcase] is so large. ~A because ~B +The brown suitcase can hold the trophy because the [trophy] is so small. ~A because ~B +The brown suitcase can fit into the trophy because the [suitcase] is so large. ~A because ~B diff --git a/WSC_switched_label.json b/WSC_switched_label.json new file mode 100644 index 00000000000000..ccd4a286c1ab9d --- /dev/null +++ b/WSC_switched_label.json @@ -0,0 +1,3005 @@ +[ + { + "index": 0, + "is_switchable": 0, + "sentence": "The city councilmen refused the demonstrators a permit because [they] feared violence.", + "answer1": "The demonstrators", + "answer0": "The city councilmen", + "sentence_switched": "The demonstrators refused the city councilmen a permit because [they] feared violence.", + "correct_answer": "The city councilmen", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 1, + "is_switchable": 0, + "sentence": "The city councilmen refused the demonstrators a permit because [they] advocated violence.", + "answer1": "The demonstrators", + "answer0": "The city councilmen", + "sentence_switched": "The demonstrators refused the city councilmen a permit because [they] advocated violence.", + "correct_answer": "The demonstrators", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 2, + "is_switchable": 0, + "sentence": "The trophy doesn't fit into the brown suitcase because [it] is too large.", + "answer1": "the suitcase", + "answer0": "the trophy", + "sentence_switched": "The suitcase doesn't fit into the brown trophy because [it] is too large.", + "correct_answer": "the trophy", + "relational_word": "fit into:large/small", + "is_associative": 0 + }, + { + "index": 3, + "is_switchable": 0, + "sentence": "The trophy doesn't fit into the 
brown suitcase because [it] is too small.", + "answer1": "the suitcase", + "answer0": "the trophy", + "sentence_switched": "The suitcase doesn't fit into the brown trophy because [it] is too small.", + "correct_answer": "the suitcase", + "relational_word": "fit into:large/small", + "is_associative": 0 + }, + { + "index": 4, + "is_switchable": 1, + "sentence": "Joan made sure to thank Susan for all the help [she] had recieved.", + "answer1": "Susan", + "answer0": "Joan", + "sentence_switched": "Susan made sure to thank joan for all the help [she] had recieved.", + "correct_answer": "Joan", + "relational_word": "thank:receive/give", + "is_associative": 0 + }, + { + "index": 5, + "is_switchable": 1, + "sentence": "Joan made sure to thank Susan for all the help [she] had given.", + "answer1": "Susan", + "answer0": "Joan", + "sentence_switched": "Susan made sure to thank joan for all the help [she] had given.", + "correct_answer": "Susan", + "relational_word": "thank:receive/give", + "is_associative": 0 + }, + { + "index": 6, + "is_switchable": 1, + "sentence": "Paul tried to call George on the phone, but [he] wasn't successful.", + "answer1": "George", + "answer0": "Paul", + "sentence_switched": "George tried to call paul on the phone, but [he] wasn't successful.", + "correct_answer": "Paul", + "relational_word": "call:successful available", + "is_associative": 0 + }, + { + "index": 7, + "is_switchable": 1, + "sentence": "Paul tried to call George on the phone, but [he] wasn't available.", + "answer1": "George", + "answer0": "Paul", + "sentence_switched": "George tried to call paul on the phone, but [he] wasn't available.", + "correct_answer": "George", + "relational_word": "call:successful available", + "is_associative": 0 + }, + { + "index": 8, + "is_switchable": 0, + "sentence": "The lawyer asked the witness a question, but [he] was reluctant to repeat it.", + "answer1": "the witness", + "answer0": "the lawyer", + "sentence_switched": "The witness asked the lawyer a 
question, but [he] was reluctant to repeat it.", + "correct_answer": "the lawyer", + "relational_word": "ask:repeat answer", + "is_associative": 0 + }, + { + "index": 9, + "is_switchable": 0, + "sentence": "The lawyer asked the witness a question, but [he] was reluctant to answer it.", + "answer1": "the witness", + "answer0": "the lawyer", + "sentence_switched": "The witness asked the lawyer a question, but [he] was reluctant to answer it.", + "correct_answer": "the witness", + "relational_word": "ask:repeat answer", + "is_associative": 0 + }, + { + "index": 10, + "is_switchable": 1, + "sentence": "The delivery truck zoomed by the school bus because [it] was going so fast.", + "answer1": "the school bus", + "answer0": "the delivery truck", + "sentence_switched": "The school bus zoomed by the delivery truck because [it] was going so fast.", + "correct_answer": "the delivery truck", + "relational_word": "zoom by:fast/slow", + "is_associative": 0 + }, + { + "index": 11, + "is_switchable": 1, + "sentence": "The delivery truck zoomed by the school bus because [it] was going so slow.", + "answer1": "the school bus", + "answer0": "the delivery truck", + "sentence_switched": "The school bus zoomed by the delivery truck because [it] was going so slow.", + "correct_answer": "the school bus", + "relational_word": "zoom by:fast/slow", + "is_associative": 0 + }, + { + "index": 12, + "is_switchable": 1, + "sentence": "Frank felt vindicated when his longtime rival Bill revealed that [he] was the winner of the competition.", + "answer1": "Bill", + "answer0": "Frank", + "sentence_switched": "Bill felt vindicated when his longtime rival frank revealed that [he] was the winner of the competition.", + "correct_answer": "Frank", + "relational_word": "vindicated/crushed:be the winner", + "is_associative": 0 + }, + { + "index": 13, + "is_switchable": 1, + "sentence": "Frank felt crushed when his longtime rival Bill revealed that [he] was the winner of the competition.", + "answer1": 
"Bill", + "answer0": "Frank", + "sentence_switched": "Bill felt crushed when his longtime rival frank revealed that [he] was the winner of the competition.", + "correct_answer": "Bill", + "relational_word": "vindicated/crushed:be the winner", + "is_associative": 0 + }, + { + "index": 14, + "is_switchable": 1, + "sentence": "The man couldn't lift his son because [he] was so weak.", + "answer1": "The son", + "answer0": "The man", + "sentence_switched": "The son couldn't lift the man because [he] was so weak.", + "correct_answer": "The man", + "relational_word": "lift:weak heavy", + "is_associative": 0 + }, + { + "index": 15, + "is_switchable": 1, + "sentence": "The man couldn't lift his son because [he] was so heavy.", + "answer1": "The son", + "answer0": "The man", + "sentence_switched": "The son couldn't lift his man because [he] was so heavy.", + "correct_answer": "The son", + "relational_word": "lift:weak heavy", + "is_associative": 0 + }, + { + "index": 16, + "is_switchable": 0, + "sentence": "The large ball crashed right through the table because [it] was made of steel.", + "answer1": "The table", + "answer0": "The large ball", + "sentence_switched": "The table crashed right through the large ball because [it] was made of steel.", + "correct_answer": "The large ball", + "relational_word": "crash through:[hard]/[soft]", + "is_associative": 0 + }, + { + "index": 17, + "is_switchable": 0, + "sentence": "The large ball crashed right through the table because [it] was made of styrofoam.", + "answer1": "The table", + "answer0": "The large ball", + "sentence_switched": "The table crashed right through the large ball because [it] was made of styrofoam.", + "correct_answer": "The table", + "relational_word": "crash through:[hard]/[soft]", + "is_associative": 0 + }, + { + "index": 18, + "is_switchable": 1, + "sentence": "John couldn't see the stage with Billy in front of him because [he] is so short.", + "answer1": "Billy", + "answer0": "John", + "sentence_switched": 
"Billy couldn't see the stage with john in front of him because [he] is so short.", + "correct_answer": "John", + "relational_word": "[block]:short/tall", + "is_associative": 0 + }, + { + "index": 19, + "is_switchable": 1, + "sentence": "John couldn't see the stage with Billy in front of him because [he] is so tall.", + "answer1": "Billy", + "answer0": "John", + "sentence_switched": "Billy couldn't see the stage with john in front of him because [he] is so tall.", + "correct_answer": "Billy", + "relational_word": "[block]:short/tall", + "is_associative": 0 + }, + { + "index": 20, + "is_switchable": 1, + "sentence": "Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.", + "answer1": "Ray", + "answer0": "Tom", + "sentence_switched": "Ray threw his schoolbag down to tom after [he] reached the top of the stairs.", + "correct_answer": "Tom", + "relational_word": "down to:top/bottom", + "is_associative": 0 + }, + { + "index": 21, + "is_switchable": 1, + "sentence": "Tom threw his schoolbag down to Ray after [he] reached the bottom of the stairs.", + "answer1": "Ray", + "answer0": "Tom", + "sentence_switched": "Ray threw his schoolbag down to tom after [he] reached the botray of the stairs.", + "correct_answer": "Ray", + "relational_word": "down to:top/bottom", + "is_associative": 0 + }, + { + "index": 22, + "is_switchable": 1, + "sentence": "Although they ran at about the same speed, Sue beat Sally because [she] had such a good start.", + "answer1": "Sally", + "answer0": "Sue", + "sentence_switched": "Although they ran at about the same speed, sally beat sue because [she] had such a good start.", + "correct_answer": "Sue", + "relational_word": "beat:good/bad", + "is_associative": 0 + }, + { + "index": 23, + "is_switchable": 1, + "sentence": "Although they ran at about the same speed, Sue beat Sally because [she] had such a bad start.", + "answer1": "Sally", + "answer0": "Sue", + "sentence_switched": "Although they ran at about the same speed, 
sally beat sue because [she] had such a bad start.", + "correct_answer": "Sally", + "relational_word": "beat:good/bad", + "is_associative": 0 + }, + { + "index": 24, + "is_switchable": 0, + "sentence": "The sculpture rolled off the shelf because [it] wasn't anchored.", + "answer1": "The shelf", + "answer0": "The sculpture", + "sentence_switched": "The shelf rolled off the sculpture because [it] wasn't anchored.", + "correct_answer": "The sculpture", + "relational_word": "roll off:anchored level", + "is_associative": 0 + }, + { + "index": 25, + "is_switchable": 0, + "sentence": "The sculpture rolled off the shelf because [it] wasn't level.", + "answer1": "The shelf", + "answer0": "The sculpture", + "sentence_switched": "The shelf rolled off the sculpture because [it] wasn't level.", + "correct_answer": "The shelf", + "relational_word": "roll off:anchored level", + "is_associative": 0 + }, + { + "index": 26, + "is_switchable": 1, + "sentence": "Sam's drawing was hung just above Tina's and [it] did look much better with another one below it.", + "answer1": "Tina's drawing", + "answer0": "Sam's drawing", + "sentence_switched": "Tina's drawing was hung just above sam's and [it] did look much better with another one below it.", + "correct_answer": "Sam's drawing", + "relational_word": "above/below", + "is_associative": 0 + }, + { + "index": 27, + "is_switchable": 1, + "sentence": "Sam's drawing was hung just above Tina's and [it] did look much better with another one above it.", + "answer1": "Tina's drawing", + "answer0": "Sam's drawing", + "sentence_switched": "Tina's drawing was hung just above sam's and [it] did look much better with another one above it.", + "correct_answer": "Tina's drawing", + "relational_word": "above/below", + "is_associative": 0 + }, + { + "index": 28, + "is_switchable": 1, + "sentence": "Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.", + "answer1": "Lucy", + "answer0": "Anna", + 
"sentence_switched": "Lucy did a lot better than her good friend anna on the test because [she] had studied so hard.", + "correct_answer": "Anna", + "relational_word": "better/worse:study hard", + "is_associative": 0 + }, + { + "index": 29, + "is_switchable": 1, + "sentence": "Anna did a lot worse than her good friend Lucy on the test because [she] had studied so hard.", + "answer1": "Lucy", + "answer0": "Anna", + "sentence_switched": "Lucy did a lot worse than her good friend anna on the test because [she] had studied so hard.", + "correct_answer": "Lucy", + "relational_word": "better/worse:study hard", + "is_associative": 0 + }, + { + "index": 30, + "is_switchable": 1, + "sentence": "The firemen arrived after the police because [they] were coming from so far away.", + "answer1": "The police", + "answer0": "The firemen", + "sentence_switched": "The police arrived after the firemen because [they] were coming from so far away.", + "correct_answer": "The firemen", + "relational_word": "after/before:far away", + "is_associative": 0 + }, + { + "index": 31, + "is_switchable": 1, + "sentence": "The firemen arrived before the police because [they] were coming from so far away.", + "answer1": "The police", + "answer0": "The firemen", + "sentence_switched": "The police arrived before the firemen because [they] were coming from so far away.", + "correct_answer": "The police", + "relational_word": "after/before:far away", + "is_associative": 0 + }, + { + "index": 32, + "is_switchable": 1, + "sentence": "Frank was upset with Tom because the toaster [he] had bought from him didn't work.", + "answer1": "Tom", + "answer0": "Frank", + "sentence_switched": "Tom was upset with frank because the toaster [he] had bought from him didn't work.", + "correct_answer": "Frank", + "relational_word": "be upset with:buy from not work/sell not work", + "is_associative": 2 + }, + { + "index": 33, + "is_switchable": 1, + "sentence": "Frank was upset with Tom because the toaster [he] had sold him 
didn't work.", + "answer1": "Tom", + "answer0": "Frank", + "sentence_switched": "Tom was upset with frank because the toaster [he] had sold him didn't work.", + "correct_answer": "Tom", + "relational_word": "be upset with:buy from not work/sell not work", + "is_associative": 2 + }, + { + "index": 34, + "is_switchable": 1, + "sentence": "Jim yelled at Kevin because [he] was so upset.", + "answer1": "Kevin", + "answer0": "Jim", + "sentence_switched": "Kevin yelled at jim because [he] was so upset.", + "correct_answer": "Jim", + "relational_word": "?yell at comfort:upset", + "is_associative": 0 + }, + { + "index": 35, + "is_switchable": 1, + "sentence": "Jim comforted Kevin because [he] was so upset.", + "answer1": "Kevin", + "answer0": "Jim", + "sentence_switched": "Kevin comforted jim because [he] was so upset.", + "correct_answer": "Kevin", + "relational_word": "?yell at comfort:upset", + "is_associative": 0 + }, + { + "index": 36, + "is_switchable": 1, + "sentence": "The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first.", + "answer1": "The bag of flour", + "answer0": "The sack of potatoes", + "sentence_switched": "The bag of flour had been placed above the sack of potatoes, so [it] had to be moved first.", + "correct_answer": "The sack of potatoes", + "relational_word": "above/below:moved first", + "is_associative": 0 + }, + { + "index": 37, + "is_switchable": 1, + "sentence": "The sack of potatoes had been placed below the bag of flour, so [it] had to be moved first.", + "answer1": "The bag of flour", + "answer0": "The sack of potatoes", + "sentence_switched": "The bag of flour had been placed below the sack of potatoes, so [it] had to be moved first.", + "correct_answer": "The bag of flour", + "relational_word": "above/below:moved first", + "is_associative": 0 + }, + { + "index": 38, + "is_switchable": 1, + "sentence": "Pete envies Martin although [he] is very successful.", + "answer1": "Martin", + "answer0": "Pete", + 
"sentence_switched": "Martin envies pete although [he] is very successful.", + "correct_answer": "Pete", + "relational_word": "although/because", + "is_associative": 0 + }, + { + "index": 39, + "is_switchable": 1, + "sentence": "Pete envies Martin because [he] is very successful.", + "answer1": "Martin", + "answer0": "Pete", + "sentence_switched": "Martin envies pete because [he] is very successful.", + "correct_answer": "Martin", + "relational_word": "although/because", + "is_associative": 0 + }, + { + "index": 40, + "is_switchable": 1, + "sentence": "The older students were bullying the younger ones, so we punished [them] .", + "answer1": "The younger students", + "answer0": "The older students", + "sentence_switched": "The younger students were bullying the older ones, so we punished [them] .", + "correct_answer": "The older students", + "relational_word": "bully:punish rescue", + "is_associative": 0 + }, + { + "index": 41, + "is_switchable": 1, + "sentence": "The older students were bullying the younger ones, so we rescued [them] .", + "answer1": "The younger students", + "answer0": "The older students", + "sentence_switched": "The younger students were bullying the older ones, so we rescued [them] .", + "correct_answer": "The younger students", + "relational_word": "bully:punish rescue", + "is_associative": 0 + }, + { + "index": 42, + "is_switchable": 1, + "sentence": "I poured water from the bottle into the cup until [it] was empty.", + "answer1": "the cup", + "answer0": "the bottle", + "sentence_switched": "I poured water from the cup into the bottle until [it] was empty.", + "correct_answer": "the bottle", + "relational_word": "pour:empty/full", + "is_associative": 0 + }, + { + "index": 43, + "is_switchable": 1, + "sentence": "I poured water from the bottle into the cup until [it] was full.", + "answer1": "the cup", + "answer0": "the bottle", + "sentence_switched": "I poured water from the cup into the bottle until [it] was full.", + "correct_answer": "the 
cup", + "relational_word": "pour:empty/full", + "is_associative": 0 + }, + { + "index": 44, + "is_switchable": 1, + "sentence": "Susan knows all about Ann's personal problems because [she] is nosy.", + "answer1": "Ann", + "answer0": "Susan", + "sentence_switched": "Ann knows all about susan's personal problems because [she] is nosy.", + "correct_answer": "Susan", + "relational_word": "know:nosy indiscreet", + "is_associative": 0 + }, + { + "index": 45, + "is_switchable": 1, + "sentence": "Susan knows all about Ann's personal problems because [she] is indiscreet.", + "answer1": "Ann", + "answer0": "Susan", + "sentence_switched": "Ann knows all about susan's personal problems because [she] is indiscreet.", + "correct_answer": "Ann", + "relational_word": "know:nosy indiscreet", + "is_associative": 0 + }, + { + "index": 46, + "is_switchable": 1, + "sentence": "Sid explained his theory to Mark but [he] couldn't convince him.", + "answer1": "Mark", + "answer0": "Sid", + "sentence_switched": "Mark explained his theory to sid but [he] couldn't convince him.", + "correct_answer": "Sid", + "relational_word": "explain:convince/understand", + "is_associative": 2 + }, + { + "index": 47, + "is_switchable": 1, + "sentence": "Sid explained his theory to Mark but [he] couldn't understand him.", + "answer1": "Mark", + "answer0": "Sid", + "sentence_switched": "Mark explained his theory to sid but [he] couldn't understand him.", + "correct_answer": "Mark", + "relational_word": "explain:convince/understand", + "is_associative": 2 + }, + { + "index": 48, + "is_switchable": 1, + "sentence": "Susan knew that Ann's son had been in a car accident, so [she] told her about it.", + "answer1": "Ann", + "answer0": "Susan", + "sentence_switched": "Ann knew that susan's son had been in a car accident, so [she] told her about it.", + "correct_answer": "Susan", + "relational_word": "?know tell:so/because", + "is_associative": 2 + }, + { + "index": 49, + "is_switchable": 1, + "sentence": "Susan knew 
that Ann's son had been in a car accident, because [she] told her about it.", + "answer1": "Ann", + "answer0": "Susan", + "sentence_switched": "Ann knew that susan's son had been in a car accident, because [she] told her about it.", + "correct_answer": "Ann", + "relational_word": "?know tell:so/because", + "is_associative": 2 + }, + { + "index": 50, + "is_switchable": 0, + "sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.", + "answer1": "Joe's uncle", + "answer0": "Joe", + "sentence_switched": "Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.", + "correct_answer": "Joe", + "relational_word": "beat:younger/older", + "is_associative": 0 + }, + { + "index": 51, + "is_switchable": 0, + "sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years older.", + "answer1": "Joe's uncle", + "answer0": "Joe", + "sentence_switched": "Joe can still beat him at tennis, even though [he] is 30 years older.", + "correct_answer": "Joe's uncle", + "relational_word": "beat:younger/older", + "is_associative": 0 + }, + { + "index": 52, + "is_switchable": 0, + "sentence": "The painting in Mark's living room shows an oak tree. [It] is to the right of the bookcase.", + "answer1": "The oak tree", + "answer0": "The painting", + "sentence_switched": "The oak tree in mark's living room shows a painting. [it] is to the right of the bookcase.", + "correct_answer": "The painting", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 53, + "is_switchable": 0, + "sentence": "The painting in Mark's living room shows an oak tree. [It] is to the right of a house.", + "answer1": "The oak tree", + "answer0": "The painting", + "sentence_switched": "The oak tree in mark's living room shows a painting. 
[it] is to the right of a house.", + "correct_answer": "The oak tree", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 54, + "is_switchable": 0, + "sentence": "There is a gap in the wall. You can see the garden through [it] .", + "answer1": "The wall", + "answer0": "The gap", + "sentence_switched": "There is a wall in the gap. you can see the garden through [it] .", + "correct_answer": "The gap", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 55, + "is_switchable": 0, + "sentence": "There is a gap in the wall. You can see the garden behind [it] .", + "answer1": "The wall", + "answer0": "The gap", + "sentence_switched": "There is a wall in the gap. you can see the garden behind [it] .", + "correct_answer": "The wall", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 56, + "is_switchable": 0, + "sentence": "The drain is clogged with hair. [It] has to be cleaned.", + "answer1": "The hair", + "answer0": "The drain", + "sentence_switched": "The hair is clogged with drain. [it] has to be cleaned.", + "correct_answer": "The drain", + "relational_word": "clog:cleaned removed", + "is_associative": 0 + }, + { + "index": 57, + "is_switchable": 0, + "sentence": "The drain is clogged with hair. [It] has to be removed.", + "answer1": "The hair", + "answer0": "The drain", + "sentence_switched": "The hair is clogged with drain. [it] has to be removed.", + "correct_answer": "The hair", + "relational_word": "clog:cleaned removed", + "is_associative": 0 + }, + { + "index": 58, + "is_switchable": 0, + "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. Luckily, [it] was short, so it worked out.", + "answer1": "The train", + "answer0": "The meeting", + "sentence_switched": "My train started at 4:00 and i needed to catch the meeting at 4:30, so there wasn't much time. 
luckily, [it] was short, so it worked out.", + "correct_answer": "The meeting", + "relational_word": "?immediately follow:short delayed", + "is_associative": 1 + }, + { + "index": 59, + "is_switchable": 0, + "sentence": "My meeting started at 4:00 and I needed to catch the train at 4:30, so there wasn't much time. Luckily, [it] was delayed, so it worked out.", + "answer1": "The train", + "answer0": "The meeting", + "sentence_switched": "My train started at 4:00 and i needed to catch the meeting at 4:30, so there wasn't much time. luckily, [it] was delayed, so it worked out.", + "correct_answer": "The train", + "relational_word": "?immediately follow:short delayed", + "is_associative": 0 + }, + { + "index": 60, + "is_switchable": 0, + "sentence": "There is a pillar between me and the stage, and I can't see around [it] .", + "answer1": "The stage", + "answer0": "The pillar", + "sentence_switched": "There is a stage between me and the pillar, and i can't see around [it] .", + "correct_answer": "The pillar", + "relational_word": "?between:see see around", + "is_associative": 2 + }, + { + "index": 61, + "is_switchable": 0, + "sentence": "There is a pillar between me and the stage, and I can't see [it] .", + "answer1": "The stage", + "answer0": "The pillar", + "sentence_switched": "There is a stage between me and the pillar, and i can't see [it] .", + "correct_answer": "The stage", + "relational_word": "?between:see see around", + "is_associative": 2 + }, + { + "index": 62, + "is_switchable": 0, + "sentence": "They broadcast an announcement, but a subway came into the station and I couldn't hear [it] .", + "answer1": "The subway", + "answer0": "The announcement", + "sentence_switched": "They broadcast a subway, but an announcement came into the station and i couldn't hear [it] .", + "correct_answer": "The announcement", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 63, + "is_switchable": 0, + "sentence": "They broadcast an announcement, but a 
subway came into the station and I couldn't hear over [it] .", + "answer1": "The subway", + "answer0": "The announcement", + "sentence_switched": "They broadcast a subway, but an announcement came into the station and i couldn't hear over [it] .", + "correct_answer": "The subway", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 64, + "is_switchable": 0, + "sentence": "In the middle of the outdoor concert, the rain started falling, but [it] continued until 10.", + "answer1": "The rain", + "answer0": "The concert", + "sentence_switched": "In the middle of the outdoor rain, the concert started falling, but [it] continued until 10.", + "correct_answer": "The concert", + "relational_word": "but/and", + "is_associative": 0 + }, + { + "index": 65, + "is_switchable": 0, + "sentence": "In the middle of the outdoor concert, the rain started falling, and [it] continued until 10.", + "answer1": "The rain", + "answer0": "The concert", + "sentence_switched": "In the middle of the outdoor rain, the concert started falling, and [it] continued until 10.", + "correct_answer": "The rain", + "relational_word": "but/and", + "is_associative": 0 + }, + { + "index": 66, + "is_switchable": 0, + "sentence": "I used an old rag to clean the knife, and then I put [it] in the trash.", + "answer1": "The knife", + "answer0": "The rag", + "sentence_switched": "I used an old knife to clean the rag, and then i put [it] in the trash.", + "correct_answer": "The rag", + "relational_word": "clean:put in the trash put in the drawer", + "is_associative": 0 + }, + { + "index": 67, + "is_switchable": 0, + "sentence": "I used an old rag to clean the knife, and then I put [it] in the drawer.", + "answer1": "The knife", + "answer0": "The rag", + "sentence_switched": "I used an old knife to clean the rag, and then i put [it] in the drawer.", + "correct_answer": "The knife", + "relational_word": "clean:put in the trash put in the drawer", + "is_associative": 0 + }, + { + "index": 68, + 
"is_switchable": 1, + "sentence": "Ann asked Mary what time the library closes, because [she] had forgotten.", + "answer1": "Mary", + "answer0": "Ann", + "sentence_switched": "Mary asked ann what time the library closes, because [she] had forgotten.", + "correct_answer": "Ann", + "relational_word": "because/but", + "is_associative": 0 + }, + { + "index": 69, + "is_switchable": 1, + "sentence": "Ann asked Mary what time the library closes, but [she] had forgotten.", + "answer1": "Mary", + "answer0": "Ann", + "sentence_switched": "Mary asked ann what time the library closes, but [she] had forgotten.", + "correct_answer": "Mary", + "relational_word": "because/but", + "is_associative": 0 + }, + { + "index": 70, + "is_switchable": 0, + "sentence": "I took the water bottle out of the backpack so that [it] would be handy.", + "answer1": "The backpack", + "answer0": "The water bottle", + "sentence_switched": "I took the backpack out of the water bottle so that [it] would be handy.", + "correct_answer": "The water bottle", + "relational_word": "out of:handy lighter", + "is_associative": 0 + }, + { + "index": 71, + "is_switchable": 0, + "sentence": "I took the water bottle out of the backpack so that [it] would be lighter.", + "answer1": "The backpack", + "answer0": "The water bottle", + "sentence_switched": "I took the backpack out of the water bottle so that [it] would be lighter.", + "correct_answer": "The backpack", + "relational_word": "out of:handy lighter", + "is_associative": 0 + }, + { + "index": 72, + "is_switchable": 0, + "sentence": "I couldn't put the pot on the shelf because [it] was too tall.", + "answer1": "The shelf", + "answer0": "The pot", + "sentence_switched": "I couldn't put the shelf on the pot because [it] was too tall.", + "correct_answer": "The pot", + "relational_word": "put:tall high", + "is_associative": 1 + }, + { + "index": 73, + "is_switchable": 0, + "sentence": "I couldn't put the pot on the shelf because [it] was too high.", + "answer1": 
"The shelf", + "answer0": "The pot", + "sentence_switched": "I couldn't put the shelf on the pot because [it] was too high.", + "correct_answer": "The shelf", + "relational_word": "put:tall high", + "is_associative": 0 + }, + { + "index": 74, + "is_switchable": 0, + "sentence": "I'm sure that my map will show this building; [it] is very good.", + "answer1": "The building", + "answer0": "The map", + "sentence_switched": "I'm sure that my building will show this map; [it] is very good.", + "correct_answer": "The map", + "relational_word": "show:good famous", + "is_associative": 1 + }, + { + "index": 75, + "is_switchable": 0, + "sentence": "I'm sure that my map will show this building; [it] is very famous.", + "answer1": "The building", + "answer0": "The map", + "sentence_switched": "I'm sure that my building will show this map; [it] is very famous.", + "correct_answer": "The building", + "relational_word": "show:good famous", + "is_associative": 1 + }, + { + "index": 76, + "is_switchable": 1, + "sentence": "Bob paid for Charlie's college education. [He] is very generous.", + "answer1": "Charlie", + "answer0": "Bob", + "sentence_switched": "Charlie paid for bob's college education. [he] is very generous.", + "correct_answer": "Bob", + "relational_word": "pay for:generous grateful", + "is_associative": 0 + }, + { + "index": 77, + "is_switchable": 1, + "sentence": "Bob paid for Charlie's college education. [He] is very grateful.", + "answer1": "Charlie", + "answer0": "Bob", + "sentence_switched": "Charlie paid for bob's college education. [he] is very grateful.", + "correct_answer": "Charlie", + "relational_word": "pay for:generous grateful", + "is_associative": 0 + }, + { + "index": 78, + "is_switchable": 1, + "sentence": "Bob paid for Charlie's college education, but now Charlie acts as though it never happened. 
[He] is very hurt.", + "answer1": "Charlie", + "answer0": "Bob", + "sentence_switched": "Charlie paid for bob's college education, but now bob acts as though it never happened. [he] is very hurt.", + "correct_answer": "Bob", + "relational_word": "but", + "is_associative": 0 + }, + { + "index": 79, + "is_switchable": 1, + "sentence": "Bob paid for Charlie's college education, but now Charlie acts as though it never happened. [He] is very ungrateful.", + "answer1": "Charlie", + "answer0": "Bob", + "sentence_switched": "Charlie paid for bob's college education, but now bob acts as though it never happened. [he] is very ungrateful.", + "correct_answer": "Charlie", + "relational_word": "but", + "is_associative": 0 + }, + { + "index": 80, + "is_switchable": 1, + "sentence": "Bob was playing cards with Adam and was way ahead. If Adam hadn't had a sudden run of good luck, [he] would have won.", + "answer1": "Adam", + "answer0": "Bob", + "sentence_switched": "Adam was playing cards with bob and was way ahead. if bob hadn't had a sudden run of good luck, [he] would have won.", + "correct_answer": "Bob", + "relational_word": "if", + "is_associative": 0 + }, + { + "index": 81, + "is_switchable": 1, + "sentence": "Bob was playing cards with Adam and was way ahead. If Adam hadn't had a sudden run of good luck, [he] would have lost.", + "answer1": "Adam", + "answer0": "Bob", + "sentence_switched": "Adam was playing cards with bob and was way ahead. if bob hadn't had a sudden run of good luck, [he] would have lost.", + "correct_answer": "Adam", + "relational_word": "if", + "is_associative": 0 + }, + { + "index": 82, + "is_switchable": 1, + "sentence": "Adam can't leave work here until Bob arrives to replace him. If Bob had left home for work on time, [he] would be gone by this time.", + "answer1": "Bob", + "answer0": "Adam", + "sentence_switched": "Bob can't leave work here until adam arrives to replace him. 
if adam had left home for work on time, [he] would be gone by this time.", + "correct_answer": "Adam", + "relational_word": "if", + "is_associative": 0 + }, + { + "index": 83, + "is_switchable": 1, + "sentence": "Adam can't leave work here until Bob arrives to replace him. If Bob had left home for work on time, [he] would be here by this time.", + "answer1": "Bob", + "answer0": "Adam", + "sentence_switched": "Bob can't leave work here until adam arrives to replace him. if adam had left home for work on time, [he] would be here by this time.", + "correct_answer": "Bob", + "relational_word": "if", + "is_associative": 0 + }, + { + "index": 84, + "is_switchable": 0, + "sentence": "If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.", + "answer1": "Sam", + "answer0": "The con artist", + "sentence_switched": "If sam has succeeded in fooling the con artist, [he] would have gotten a lot of money.", + "correct_answer": "The con artist", + "relational_word": "fool:get/lose", + "is_associative": 0 + }, + { + "index": 85, + "is_switchable": 0, + "sentence": "If the con artist has succeeded in fooling Sam, [he] would have lost a lot of money.", + "answer1": "Sam", + "answer0": "The con artist", + "sentence_switched": "If sam has succeeded in fooling the con artist, [he] would have lost a lot of money.", + "correct_answer": "Sam", + "relational_word": "fool:get/lose", + "is_associative": 0 + }, + { + "index": 86, + "is_switchable": 0, + "sentence": "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because [it] was hot.", + "answer1": "The spot under the tree", + "answer0": "The dog", + "sentence_switched": "It was a summer afternoon, and the spot under tree was sitting in the middle of the lawn. 
after a while, it got up and moved to a dog, because [it] was hot.", + "correct_answer": "The dog", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 87, + "is_switchable": 0, + "sentence": "It was a summer afternoon, and the dog was sitting in the middle of the lawn. After a while, it got up and moved to a spot under the tree, because [it] was cooler.", + "answer1": "The spot under the tree", + "answer0": "The dog", + "sentence_switched": "It was a summer afternoon, and the spot under tree was sitting in the middle of the lawn. after a while, it got up and moved to a dog, because [it] was cooler.", + "correct_answer": "The spot under the tree", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 88, + "is_switchable": 0, + "sentence": "The cat was lying by the mouse hole waiting for the mouse, but [it] was too impatient.", + "answer1": "The mouse", + "answer0": "The cat", + "sentence_switched": "The mouse was lying by the cat hole waiting for the cat, but [it] was too impatient.", + "correct_answer": "The cat", + "relational_word": "wait:impatient cautious", + "is_associative": 0 + }, + { + "index": 89, + "is_switchable": 0, + "sentence": "The cat was lying by the mouse hole waiting for the mouse, but [it] was too cautious.", + "answer1": "The mouse", + "answer0": "The cat", + "sentence_switched": "The mouse was lying by the cat hole waiting for the cat, but [it] was too cautious.", + "correct_answer": "The mouse", + "relational_word": "wait:impatient cautious", + "is_associative": 0 + }, + { + "index": 90, + "is_switchable": 0, + "sentence": "Anne gave birth to a daughter last month. [She] is a very charming woman.", + "answer1": "Anne's daughter", + "answer0": "Anne", + "sentence_switched": "Anne's daughter gave birth to Anne last month. 
[she] is a very charming woman.", + "correct_answer": "Anne", + "relational_word": "give birth:woman baby", + "is_associative": 0 + }, + { + "index": 91, + "is_switchable": 0, + "sentence": "Anne gave birth to a daughter last month. [She] is a very charming baby.", + "answer1": "Anne's daughter", + "answer0": "Anne", + "sentence_switched": "Anne's daughter gave birth to Anne last month. [she] is a very charming baby.", + "correct_answer": "Anne's daughter", + "relational_word": "give birth:woman baby", + "is_associative": 0 + }, + { + "index": 92, + "is_switchable": 0, + "sentence": "Alice tried frantically to stop her daughter from chatting at the party, leaving us to wonder why [she] was behaving so strangely.", + "answer1": "Alice's daughter", + "answer0": "Alice", + "sentence_switched": "Alice's daughter tried frantically to stop Alice from chatting at the party, leaving us to wonder why [she] was behaving so strangely.", + "correct_answer": "Alice", + "relational_word": "?stop normal/stop abnormal:strange", + "is_associative": 0 + }, + { + "index": 93, + "is_switchable": 0, + "sentence": "Alice tried frantically to stop her daughter from barking at the party, leaving us to wonder why [she] was behaving so strangely.", + "answer1": "Alice's daughter", + "answer0": "Alice", + "sentence_switched": "Alice's daughter tried frantically to stop Alice from barking at the party, leaving us to wonder why [she] was behaving so strangely.", + "correct_answer": "Alice's daughter", + "relational_word": "?stop normal/stop abnormal:strange", + "is_associative": 0 + }, + { + "index": 94, + "is_switchable": 1, + "sentence": "I saw Jim yelling at some guy in a military uniform with a huge red beard. I don't know why [he] was, but he looked very unhappy.", + "answer1": "the guy in uniform", + "answer0": "Jim", + "sentence_switched": "I saw the guy in military uniform with a huge red beard yelling at jim. 
i don't know why [he] was, but he looked very unhappy.", + "correct_answer": "Jim", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 95, + "is_switchable": 1, + "sentence": "I saw Jim yelling at some guy in a military uniform with a huge red beard. I don't know who [he] was, but he looked very unhappy.", + "answer1": "the guy in uniform", + "answer0": "Jim", + "sentence_switched": "I saw the guy in military uniform with a huge red beard yelling at jim. i don't know who [he] was, but he looked very unhappy.", + "correct_answer": "the guy in uniform", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 96, + "is_switchable": 0, + "sentence": "The fish ate the worm. [It] was hungry.", + "answer1": "The worm", + "answer0": "The fish", + "sentence_switched": "The worm ate the fish. [it] was hungry.", + "correct_answer": "The fish", + "relational_word": "eat:hungry tasty", + "is_associative": 0 + }, + { + "index": 97, + "is_switchable": 0, + "sentence": "The fish ate the worm. [It] was tasty.", + "answer1": "The worm", + "answer0": "The fish", + "sentence_switched": "The worm ate the fish. [it] was tasty.", + "correct_answer": "The worm", + "relational_word": "eat:hungry tasty", + "is_associative": 0 + }, + { + "index": 98, + "is_switchable": 0, + "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] in.", + "answer1": "The chewing gum", + "answer0": "The key", + "sentence_switched": "I was trying to open the lock with the chewing gum, but someone had filled the keyhole with the key, and i couldn't get [it] in.", + "correct_answer": "The key", + "relational_word": "put ... into filled with ... 
:get in/get out", + "is_associative": 1 + }, + { + "index": 99, + "is_switchable": 0, + "sentence": "I was trying to open the lock with the key, but someone had filled the keyhole with chewing gum, and I couldn't get [it] out.", + "answer1": "The chewing gum", + "answer0": "The key", + "sentence_switched": "I was trying to open the lock with the chewing gum, but someone had filled the keyhole with the key, and i couldn't get [it] out.", + "correct_answer": "The chewing gum", + "relational_word": "put ... into filled with ... :get in/get out", + "is_associative": 0 + }, + { + "index": 100, + "is_switchable": 0, + "sentence": "The dog chased the cat, which ran up a tree. [It] waited at the bottom.", + "answer1": "The cat", + "answer0": "The dog", + "sentence_switched": "The cat chased the dog, which ran up a tree. [it] waited at the bottom.", + "correct_answer": "The dog", + "relational_word": "up:at the bottom/at the top", + "is_associative": 0 + }, + { + "index": 101, + "is_switchable": 0, + "sentence": "The dog chased the cat, which ran up a tree. [It] waited at the top.", + "answer1": "The cat", + "answer0": "The dog", + "sentence_switched": "The cat chased the dog, which ran up a tree. [it] waited at the top.", + "correct_answer": "The cat", + "relational_word": "up:at the bottom/at the top", + "is_associative": 0 + }, + { + "index": 102, + "is_switchable": 0, + "sentence": "In the storm, the tree fell down and crashed through the roof of my house. Now, I have to get [it] removed.", + "answer1": "The roof", + "answer0": "The tree", + "sentence_switched": "In the storm, the roof fell down and crashed through the tree of my house. now, i have to get [it] removed.", + "correct_answer": "The tree", + "relational_word": "crash through:removed repaired", + "is_associative": 0 + }, + { + "index": 103, + "is_switchable": 0, + "sentence": "In the storm, the tree fell down and crashed through the roof of my house. 
Now, I have to get [it] repaired.", + "answer1": "The roof", + "answer0": "The tree", + "sentence_switched": "In the storm, the roof fell down and crashed through the tree of my house. now, i have to get [it] repaired.", + "correct_answer": "The roof", + "relational_word": "crash through:removed repaired", + "is_associative": 1 + }, + { + "index": 104, + "is_switchable": 0, + "sentence": "The customer walked into the bank and stabbed one of the tellers. [He] was immediately taken to the police station.", + "answer1": "The teller", + "answer0": "The customer", + "sentence_switched": "The teller walked into the bank and stabbed one of the customers. [he] was immediately taken to the police station.", + "correct_answer": "The customer", + "relational_word": "stab:taken to the police station taken to the hospital", + "is_associative": 0 + }, + { + "index": 105, + "is_switchable": 0, + "sentence": "The customer walked into the bank and stabbed one of the tellers. [He] was immediately taken to the hospital.", + "answer1": "The teller", + "answer0": "The customer", + "sentence_switched": "The teller walked into the bank and stabbed one of the customers. [he] was immediately taken to the hospital.", + "correct_answer": "The teller", + "relational_word": "stab:taken to the police station taken to the hospital", + "is_associative": 0 + }, + { + "index": 106, + "is_switchable": 1, + "sentence": "John was doing research in the library when he heard a man humming and whistling. [He] was very annoyed.", + "answer1": "The man", + "answer0": "John", + "sentence_switched": "A man was doing research in the library when he heard john humming and whistling. [he] was very annoyed.", + "correct_answer": "John", + "relational_word": "hear ... humming and whistling:annoyed/annoying", + "is_associative": 0 + }, + { + "index": 107, + "is_switchable": 1, + "sentence": "John was doing research in the library when he heard a man humming and whistling. 
[He] was very annoying.", + "answer1": "The man", + "answer0": "John", + "sentence_switched": "A man was doing research in the library when he heard john humming and whistling. [he] was very annoying.", + "correct_answer": "The man", + "relational_word": "hear ... humming and whistling:annoyed/annoying", + "is_associative": 0 + }, + { + "index": 108, + "is_switchable": 0, + "sentence": "John was jogging through the park when he saw a man juggling watermelons. [He] was very impressed.", + "answer1": "The juggler", + "answer0": "John", + "sentence_switched": "The juggler was jogging through the park when he saw a man juggling watermelons. [he] was very impressed.", + "correct_answer": "John", + "relational_word": "see ... juggling watermelons:impressed/impressive", + "is_associative": 0 + }, + { + "index": 109, + "is_switchable": 0, + "sentence": "John was jogging through the park when he saw a man juggling watermelons. [He] was very impressive.", + "answer1": "The juggler", + "answer0": "John", + "sentence_switched": "The juggler was jogging through the park when he saw a man juggling watermelons. [he] was very impressive.", + "correct_answer": "The juggler", + "relational_word": "see ... juggling watermelons:impressed/impressive", + "is_associative": 1 + }, + { + "index": 110, + "is_switchable": 1, + "sentence": "Bob collapsed on the sidewalk. Soon he saw Carl coming to help. [He] was very ill.", + "answer1": "Carl", + "answer0": "Bob", + "sentence_switched": "Carl collapsed on the sidewalk. soon he saw bob coming to help. [he] was very ill.", + "correct_answer": "Bob", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 111, + "is_switchable": 1, + "sentence": "Bob collapsed on the sidewalk. Soon he saw Carl coming to help. [He] was very concerned.", + "answer1": "Carl", + "answer0": "Bob", + "sentence_switched": "Carl collapsed on the sidewalk. soon he saw bob coming to help. 
[he] was very concerned.", + "correct_answer": "Carl", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 112, + "is_switchable": 0, + "sentence": "Sam and Amy are passionately in love, but Amy's parents are unhappy about it, because [they] are fifteen.", + "answer1": "Amy's parents", + "answer0": "Sam and Amy", + "sentence_switched": "Amy's parents are passionately in love, but sam and amy are unhappy about it, because [they] are fifteen.", + "correct_answer": "Sam and Amy", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 113, + "is_switchable": 0, + "sentence": "Sam and Amy are passionately in love, but Amy's parents are unhappy about it, because [they] are snobs.", + "answer1": "Amy's parents", + "answer0": "Sam and Amy", + "sentence_switched": "Amy's parents are passionately in love, but sam and amy are unhappy about it, because [they] are snobs.", + "correct_answer": "Amy's parents", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 114, + "is_switchable": 1, + "sentence": "Mark told Pete many lies about himself, which Pete included in his book. [He] should have been more truthful.", + "answer1": "Pete", + "answer0": "Mark", + "sentence_switched": "Pete told mark many lies about himself, which mark included in his book. [he] should have been more truthful.", + "correct_answer": "Mark", + "relational_word": "tell lies: truthful skeptical", + "is_associative": 0 + }, + { + "index": 115, + "is_switchable": 1, + "sentence": "Mark told Pete many lies about himself, which Pete included in his book. [He] should have been more skeptical.", + "answer1": "Pete", + "answer0": "Mark", + "sentence_switched": "Pete told mark many lies about himself, which mark included in his book. 
[he] should have been more skeptical.", + "correct_answer": "Pete", + "relational_word": "tell lies: truthful skeptical", + "is_associative": 0 + }, + { + "index": 116, + "is_switchable": 0, + "sentence": "Joe has sold his house and bought a new one a few miles away. He will be moving out of [it] on Thursday.", + "answer1": "The new house", + "answer0": "The old house", + "sentence_switched": "Joe has sold his new house and bought a old one a few miles away. he will be moving out of [it] on thursday.", + "correct_answer": "The old house", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 117, + "is_switchable": 0, + "sentence": "Joe has sold his house and bought a new one a few miles away. He will be moving into [it] on Thursday.", + "answer1": "The new house", + "answer0": "The old house", + "sentence_switched": "Joe has sold his new house and bought a old one a few miles away. he will be moving into [it] on thursday.", + "correct_answer": "The new house", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 118, + "is_switchable": 0, + "sentence": "Many people start to read Paul's books and can't put them down. [They] are gripped because Paul writes so well.", + "answer1": "Paul's books", + "answer0": "People", + "sentence_switched": "Many paul's books start to read people and can't put them down. [they] are gripped because paul writes so well.", + "correct_answer": "People", + "relational_word": "read:gripped popular", + "is_associative": 1 + }, + { + "index": 119, + "is_switchable": 0, + "sentence": "Many people start to read Paul's books and can't put them down. [They] are popular because Paul writes so well.", + "answer1": "Paul's books", + "answer0": "People", + "sentence_switched": "Many paul's books start to read people and can't put them down. 
[they] are popular because paul writes so well.", + "correct_answer": "Paul's books", + "relational_word": "read:gripped popular", + "is_associative": 1 + }, + { + "index": 120, + "is_switchable": 0, + "sentence": "Mary took out her flute and played one of her favorite pieces. She has had [it] since she was a child.", + "answer1": "The piece", + "answer0": "The flute", + "sentence_switched": "Mary took out her piece and played one of her favorite flute. she has had [it] since she was a child.", + "correct_answer": "The flute", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 121, + "is_switchable": 0, + "sentence": "Mary took out her flute and played one of her favorite pieces. She has loved [it] since she was a child.", + "answer1": "The piece", + "answer0": "The flute", + "sentence_switched": "Mary took out her piece and played one of her favorite flute. she has loved [it] since she was a child.", + "correct_answer": "The piece", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 122, + "is_switchable": 0, + "sentence": "Sam pulled up a chair to the piano, but [it] was broken, so he had to stand instead.", + "answer1": "The piano", + "answer0": "The chair", + "sentence_switched": "Sam pulled up a piano to the chair, but [it] was broken, so he had to stand instead.", + "correct_answer": "The chair", + "relational_word": "none", + "is_associative": 2 + }, + { + "index": 123, + "is_switchable": 0, + "sentence": "Sam pulled up a chair to the piano, but [it] was broken, so he had to sing instead.", + "answer1": "The piano", + "answer0": "The chair", + "sentence_switched": "Sam pulled up a piano to the chair, but [it] was broken, so he had to sing instead.", + "correct_answer": "The piano", + "relational_word": "none", + "is_associative": 2 + }, + { + "index": 124, + "is_switchable": 0, + "sentence": "Since it was raining, I carried the newspaper in my backpack to keep [it] dry.", + "answer1": "The backpack", + "answer0": "The 
newspaper", + "sentence_switched": "Since it was raining, i carried the backpack in my newspaper to keep [it] dry.", + "correct_answer": "The newspaper", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 125, + "is_switchable": 0, + "sentence": "Since it was raining, I carried the newspaper over my backpack to keep [it] dry.", + "answer1": "The backpack", + "answer0": "The newspaper", + "sentence_switched": "Since it was raining, i carried the backpack over my newspaper to keep [it] dry.", + "correct_answer": "The backpack", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 126, + "is_switchable": 0, + "sentence": "Sara borrowed the book from the library because she needs it for an article she is working on. She reads [it] when she gets home from work.", + "answer1": "The article", + "answer0": "The book", + "sentence_switched": "Sara borrowed the article from the library because she needs it for an book she is working on. she reads [it] when she gets home from work.", + "correct_answer": "The book", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 127, + "is_switchable": 0, + "sentence": "Sara borrowed the book from the library because she needs it for an article she is working on. She writes [it] when she gets home from work.", + "answer1": "The article", + "answer0": "The book", + "sentence_switched": "Sara borrowed the article from the library because she needs it for an book she is working on. 
she writes [it] when she gets home from work.", + "correct_answer": "The article", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 128, + "is_switchable": 0, + "sentence": "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the tide knocked [it] down.", + "answer1": "The flag", + "answer0": "The sand castle", + "sentence_switched": "This morning, joey built a flag on the beach, and put a toy sand castle in the highest tower, but this afternoon the tide knocked [it] down.", + "correct_answer": "The sand castle", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 129, + "is_switchable": 0, + "sentence": "This morning, Joey built a sand castle on the beach, and put a toy flag in the highest tower, but this afternoon the wind knocked [it] down.", + "answer1": "The flag", + "answer0": "The sand castle", + "sentence_switched": "This morning, joey built a flag on the beach, and put a toy sand castle in the highest tower, but this afternoon the wind knocked [it] down.", + "correct_answer": "The flag", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 130, + "is_switchable": 1, + "sentence": "Jane knocked on Susan's door, but there was no answer. [She] was disappointed.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on jane's door, but there was no answer. [she] was disappointed.", + "correct_answer": "Jane", + "relational_word": "but:disappointed", + "is_associative": 0 + }, + { + "index": 131, + "is_switchable": 1, + "sentence": "Jane knocked on Susan's door, but there was no answer. [She] was out.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on jane's door, but there was no answer. 
[she] was out.", + "correct_answer": "Susan", + "relational_word": "but:disappointed", + "is_associative": 0 + }, + { + "index": 132, + "is_switchable": 1, + "sentence": "Jane knocked on the door, and Susan answered it. [She] invited her to come out.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on the door, and jane answered it. [she] invited her to come out.", + "correct_answer": "Jane", + "relational_word": "visit:invite come out/invite come in", + "is_associative": 2 + }, + { + "index": 133, + "is_switchable": 1, + "sentence": "Jane knocked on the door, and Susan answered it. [She] invited her to come in.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on the door, and jane answered it. [she] invited her to come in.", + "correct_answer": "Susan", + "relational_word": "visit:invite come out/invite come in", + "is_associative": 2 + }, + { + "index": 134, + "is_switchable": 1, + "sentence": "Sam took French classes from Adam, because [he] was eager to speak it fluently.", + "answer1": "Adam", + "answer0": "Sam", + "sentence_switched": "Adam took french classes from sam, because [he] was eager to speak it fluently.", + "correct_answer": "Sam", + "relational_word": "take classes from:eager known to speak it fluently", + "is_associative": 0 + }, + { + "index": 135, + "is_switchable": 1, + "sentence": "Sam took French classes from Adam, because [he] was known to speak it fluently.", + "answer1": "Adam", + "answer0": "Sam", + "sentence_switched": "Adam took french classes from sam, because [he] was known to speak it fluently.", + "correct_answer": "Adam", + "relational_word": "take classes from:eager known to speak it fluently", + "is_associative": 0 + }, + { + "index": 136, + "is_switchable": 0, + "sentence": "The path to the lake was blocked, so we couldn't use [it] .", + "answer1": "The lake", + "answer0": "The path", + "sentence_switched": "The lake to the path was blocked, so we couldn't use 
[it] .", + "correct_answer": "The path", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 137, + "is_switchable": 0, + "sentence": "The path to the lake was blocked, so we couldn't reach [it] .", + "answer1": "The lake", + "answer0": "The path", + "sentence_switched": "The lake to the path was blocked, so we couldn't reach [it] .", + "correct_answer": "The lake", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 138, + "is_switchable": 0, + "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, [it] was out.", + "answer1": "The cloud", + "answer0": "The sun", + "sentence_switched": "The cloud was covered by a thick sun all morning, but luckily, by the time the picnic started, [it] was out.", + "correct_answer": "The sun", + "relational_word": "cover:out gone", + "is_associative": 1 + }, + { + "index": 139, + "is_switchable": 0, + "sentence": "The sun was covered by a thick cloud all morning, but luckily, by the time the picnic started, [it] was gone.", + "answer1": "The cloud", + "answer0": "The sun", + "sentence_switched": "The cloud was covered by a thick sun all morning, but luckily, by the time the picnic started, [it] was gone.", + "correct_answer": "The cloud", + "relational_word": "cover:out gone", + "is_associative": 2 + }, + { + "index": 140, + "is_switchable": 0, + "sentence": "We went to the lake, because a shark had been seen at the ocean beach, so [it] was a safer place to swim.", + "answer1": "The ocean beach", + "answer0": "The lake", + "sentence_switched": "We went to the ocean beach, because a shark had been seen at the lake, so [it] was a safer place to swim.", + "correct_answer": "The lake", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 141, + "is_switchable": 0, + "sentence": "We went to the lake, because a shark had been seen at the ocean beach, so [it] was a dangerous place to swim.", + "answer1": "The ocean beach", + 
"answer0": "The lake", + "sentence_switched": "We went to the ocean beach, because a shark had been seen at the lake, so [it] was a dangerous place to swim.", + "correct_answer": "The ocean beach", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 142, + "is_switchable": 0, + "sentence": "Sam tried to paint a picture of shepherds with sheep, but [they] ended up looking more like golfers.", + "answer1": "The sheep", + "answer0": "The shepherds", + "sentence_switched": "Sam tried to paint a picture of sheep with shepherds, but [they] ended up looking more like golfers.", + "correct_answer": "The shepherds", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 143, + "is_switchable": 0, + "sentence": "Sam tried to paint a picture of shepherds with sheep, but [they] ended up looking more like dogs.", + "answer1": "The sheep", + "answer0": "The shepherds", + "sentence_switched": "Sam tried to paint a picture of sheep with shepherds, but [they] ended up looking more like dogs.", + "correct_answer": "The sheep", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 144, + "is_switchable": 0, + "sentence": "Mary tucked her daughter Anne into bed, so that [she] could work.", + "answer1": "Mary's daughter", + "answer0": "Mary", + "sentence_switched": "Mary's daughter tucked Mary into bed, so that [she] could work.", + "correct_answer": "Mary", + "relational_word": "tuck:work sleep", + "is_associative": 0 + }, + { + "index": 145, + "is_switchable": 0, + "sentence": "Mary tucked her daughter Anne into bed, so that [she] could sleep.", + "answer1": "Mary's daughter", + "answer0": "Mary", + "sentence_switched": "Mary's daughter tucked Mary into bed, so that [she] could sleep.", + "correct_answer": "Mary's daughter", + "relational_word": "tuck:work sleep", + "is_associative": 0 + }, + { + "index": 146, + "is_switchable": 0, + "sentence": "Fred and Alice had very warm down coats, but [they] were not prepared for the cold in 
Alaska.", + "answer1": "coats", + "answer0": "Fred and Alice", + "sentence_switched": "Coats had very warm down fred and alice, but [they] were not prepared for the cold in alaska.", + "correct_answer": "Fred and Alice", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 147, + "is_switchable": 0, + "sentence": "Fred and Alice had very warm down coats, but [they] were not enough for the cold in Alaska.", + "answer1": "coats", + "answer0": "Fred and Alice", + "sentence_switched": "Coats had very warm down fred and alice, but [they] were not enough for the cold in alaska.", + "correct_answer": "coats", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 148, + "is_switchable": 1, + "sentence": "Thomson visited Cooper's grave in 1765. At that date [he] had been travelling for five years.", + "answer1": "Cooper", + "answer0": "Thomson", + "sentence_switched": "Cooper visited thomson's grave in 1765. at that date [he] had been travelling for five years.", + "correct_answer": "Thomson", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 149, + "is_switchable": 1, + "sentence": "Thomson visited Cooper's grave in 1765. At that date [he] had been dead for five years.", + "answer1": "Cooper", + "answer0": "Thomson", + "sentence_switched": "Cooper visited thomson's grave in 1765. 
at that date [he] had been dead for five years.", + "correct_answer": "Cooper", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 150, + "is_switchable": 1, + "sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries later.", + "answer1": "Arnold", + "answer0": "Jackson", + "sentence_switched": "Arnold was greatly influenced by jackson, though [he] lived two centuries later.", + "correct_answer": "Jackson", + "relational_word": "influence:later/earlier", + "is_associative": 0 + }, + { + "index": 151, + "is_switchable": 1, + "sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries earlier.", + "answer1": "Arnold", + "answer0": "Jackson", + "sentence_switched": "Arnold was greatly influenced by jackson, though [he] lived two centuries earlier.", + "correct_answer": "Arnold", + "relational_word": "influence:later/earlier", + "is_associative": 0 + }, + { + "index": 152, + "is_switchable": 0, + "sentence": "I can't cut that tree down with that axe; [it] is too thick.", + "answer1": "The axe", + "answer0": "The tree", + "sentence_switched": "I can't cut that axe down with that tree; [it] is too thick.", + "correct_answer": "The tree", + "relational_word": "can not cut:thick small", + "is_associative": 0 + }, + { + "index": 153, + "is_switchable": 0, + "sentence": "I can't cut that tree down with that axe; [it] is too small.", + "answer1": "The axe", + "answer0": "The tree", + "sentence_switched": "I can't cut that axe down with that tree; [it] is too small.", + "correct_answer": "The axe", + "relational_word": "can not cut:thick small", + "is_associative": 0 + }, + { + "index": 154, + "is_switchable": 0, + "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to kill [them] .", + "answer1": "The chickens", + "answer0": "The foxes", + "sentence_switched": "The chickens are getting in at night and attacking the foxes. 
i shall have to kill [them] .", + "correct_answer": "The foxes", + "relational_word": "attack:kill guard", + "is_associative": 0 + }, + { + "index": 155, + "is_switchable": 0, + "sentence": "The foxes are getting in at night and attacking the chickens. I shall have to guard [them] .", + "answer1": "The chickens", + "answer0": "The foxes", + "sentence_switched": "The chickens are getting in at night and attacking the foxes. i shall have to guard [them] .", + "correct_answer": "The chickens", + "relational_word": "attack:kill guard", + "is_associative": 1 + }, + { + "index": 156, + "is_switchable": 0, + "sentence": "The foxes are getting in at night and attacking the chickens. [They] have gotten very bold.", + "answer1": "The chickens", + "answer0": "The foxes", + "sentence_switched": "The chickens are getting in at night and attacking the foxes. [they] have gotten very bold.", + "correct_answer": "The foxes", + "relational_word": "attack:bold nervous", + "is_associative": 0 + }, + { + "index": 157, + "is_switchable": 0, + "sentence": "The foxes are getting in at night and attacking the chickens. [They] have gotten very nervous.", + "answer1": "The chickens", + "answer0": "The foxes", + "sentence_switched": "The chickens are getting in at night and attacking the foxes. [they] have gotten very nervous.", + "correct_answer": "The chickens", + "relational_word": "attack:bold nervous", + "is_associative": 0 + }, + { + "index": 158, + "is_switchable": 0, + "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. He opened [them] when the wind stopped.", + "answer1": "His hands", + "answer0": "His eyes", + "sentence_switched": "Fred covered his hands with his eyes, because the wind was blowing sand around. 
he opened [them] when the wind stopped.", + "correct_answer": "His eyes", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 159, + "is_switchable": 0, + "sentence": "Fred covered his eyes with his hands, because the wind was blowing sand around. He lowered [them] when the wind stopped.", + "answer1": "His hands", + "answer0": "His eyes", + "sentence_switched": "Fred covered his hands with his eyes, because the wind was blowing sand around. he lowered [them] when the wind stopped.", + "correct_answer": "His hands", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 160, + "is_switchable": 1, + "sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.", + "answer1": "Tina", + "answer0": "Terpsichore", + "sentence_switched": "The actress used to be named tina, but she changed it to terpsichore a few years ago, because she figured [it] was too hard to pronounce.", + "correct_answer": "Terpsichore", + "relational_word": "change:hard/easy", + "is_associative": 0 + }, + { + "index": 161, + "is_switchable": 1, + "sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was easier to pronounce.", + "answer1": "Tina", + "answer0": "Terpsichore", + "sentence_switched": "The actress used to be named tina, but she changed it to terpsichore a few years ago, because she figured [it] was easier to pronounce.", + "correct_answer": "Tina", + "relational_word": "change:hard/easy", + "is_associative": 0 + }, + { + "index": 162, + "is_switchable": 1, + "sentence": "Fred watched TV while George went out to buy groceries. After an hour [he] got up.", + "answer1": "George", + "answer0": "Fred", + "sentence_switched": "George watched tv while fred went out to buy groceries. 
after an hour [he] got up.", + "correct_answer": "Fred", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 163, + "is_switchable": 1, + "sentence": "Fred watched TV while George went out to buy groceries. After an hour [he] got back.", + "answer1": "George", + "answer0": "Fred", + "sentence_switched": "George watched tv while fred went out to buy groceries. after an hour [he] got back.", + "correct_answer": "George", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 164, + "is_switchable": 0, + "sentence": "Fred was supposed to run the dishwasher, but he put it off, because he wanted to watch TV. But the show turned out to be boring, so he changed his mind and turned [it] on.", + "answer1": "The TV", + "answer0": "The dishwasher", + "sentence_switched": "Fred was supposed to run the tv, but he put it off, because he wanted to watch dishwasher. but the show turned out to be boring, so he changed his mind and turned [it] on.", + "correct_answer": "The dishwasher", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 165, + "is_switchable": 0, + "sentence": "Fred was supposed to run the dishwasher, but he put it off, because he wanted to watch TV. But the show turned out to be boring, so he changed his mind and turned [it] off.", + "answer1": "The TV", + "answer0": "The dishwasher", + "sentence_switched": "Fred was supposed to run the tv, but he put it off, because he wanted to watch dishwasher. but the show turned out to be boring, so he changed his mind and turned [it] off.", + "correct_answer": "The TV", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 166, + "is_switchable": 0, + "sentence": "Fred is the only man still alive who remembers my great-grandfather. [He] is a remarkable man.", + "answer1": "My great-grandfather", + "answer0": "Fred", + "sentence_switched": "My great-grandfather is the only man still alive who remembers fred. 
[he] is a remarkable man.", + "correct_answer": "Fred", + "relational_word": "alive:is/was", + "is_associative": 0 + }, + { + "index": 167, + "is_switchable": 0, + "sentence": "Fred is the only man still alive who remembers my great-grandfather. [He] was a remarkable man.", + "answer1": "My great-grandfather", + "answer0": "Fred", + "sentence_switched": "My great-grandfather is the only man still alive who remembers fred. [he] was a remarkable man.", + "correct_answer": "My great-grandfather", + "relational_word": "alive:is/was", + "is_associative": 0 + }, + { + "index": 168, + "is_switchable": 0, + "sentence": "Fred is the only man alive who still remembers my father as an infant. When Fred first saw my father, [he] was twelve years old.", + "answer1": "My father", + "answer0": "Fred", + "sentence_switched": "My father is the only man alive who still remembers fred as an infant. when my father first saw fred, [he] was twelve years old.", + "correct_answer": "Fred", + "relational_word": "infant:twelve years old twelve months old", + "is_associative": 0 + }, + { + "index": 169, + "is_switchable": 0, + "sentence": "Fred is the only man alive who still remembers my father as an infant. When Fred first saw my father, [he] was twelve months old.", + "answer1": "My father", + "answer0": "Fred", + "sentence_switched": "My father is the only man alive who still remembers fred as an infant. when my father first saw fred, [he] was twelve months old.", + "correct_answer": "My father", + "relational_word": "infant:twelve years old twelve months old", + "is_associative": 0 + }, + { + "index": 170, + "is_switchable": 1, + "sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.", + "answer1": "Yakutsk", + "answer0": "Kamchatka", + "sentence_switched": "In july, Yakutsk declared war on kamchatka. 
since kamchatka's army was much better equipped and ten times larger, [they] were defeated within weeks.", + "correct_answer": "Kamchatka", + "relational_word": "better equipped and large:defeated/victorious", + "is_associative": 0 + }, + { + "index": 171, + "is_switchable": 1, + "sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were victorious within weeks.", + "answer1": "Yakutsk", + "answer0": "Kamchatka", + "sentence_switched": "In july, Yakutsk declared war on kamchatka. since kamchatka's army was much better equipped and ten times larger, [they] were victorious within weeks.", + "correct_answer": "Yakutsk", + "relational_word": "better equipped and large:defeated/victorious", + "is_associative": 0 + }, + { + "index": 172, + "is_switchable": 0, + "sentence": "Look! There is a minnow swimming right below that duck! [It] had better get away to safety fast!", + "answer1": "The duck", + "answer0": "The minnow", + "sentence_switched": "Look! there is a duck swimming right below that minnow! [it] had better get away to safety fast!", + "correct_answer": "The minnow", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 173, + "is_switchable": 0, + "sentence": "Look! There is a shark swimming right below that duck! [It] had better get away to safety fast!", + "answer1": "The duck", + "answer0": "The shark", + "sentence_switched": "Look! there is a duck swimming right below that shark! [it] had better get away to safety fast!", + "correct_answer": "The duck", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 174, + "is_switchable": 0, + "sentence": "Archaeologists have concluded that humans lived in Laputa 20,000 years ago. [They] hunted for evidence on the river banks.", + "answer1": "Prehistoric humans", + "answer0": "Archaeologists", + "sentence_switched": "Prehistoric humans have concluded that humans lived in laputa 20,000 years ago. 
[they] hunted for evidence on the river banks.", + "correct_answer": "Archaeologists", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 175, + "is_switchable": 0, + "sentence": "Archaeologists have concluded that humans lived in Laputa 20,000 years ago. [They] hunted for deer on the river banks.", + "answer1": "Prehistoric humans", + "answer0": "Archaeologists", + "sentence_switched": "Prehistoric humans have concluded that humans lived in laputa 20,000 years ago. [they] hunted for deer on the river banks.", + "correct_answer": "Prehistoric humans", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 176, + "is_switchable": 0, + "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. [They] began two years ago.", + "answer1": "The fish", + "answer0": "The scientists", + "sentence_switched": "The fish are studying three species of scientists that have recently been found living in the indian ocean. [they] began two years ago.", + "correct_answer": "The scientists", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 177, + "is_switchable": 0, + "sentence": "The scientists are studying three species of fish that have recently been found living in the Indian Ocean. [They] appeared two years ago.", + "answer1": "The fish", + "answer0": "The scientists", + "sentence_switched": "The fish are studying three species of scientists that have recently been found living in the indian ocean. [they] appeared two years ago.", + "correct_answer": "The fish", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 178, + "is_switchable": 0, + "sentence": "The journalists interviewed the stars of the new movie. [They] were very persistent, so the interview lasted for a long time.", + "answer1": "The stars", + "answer0": "The journalists", + "sentence_switched": "The stars interviewed the journalists of the new movie. 
[they] were very persistent, so the interview lasted for a long time.", + "correct_answer": "The journalists", + "relational_word": "interview:persistent cooperative", + "is_associative": 0 + }, + { + "index": 179, + "is_switchable": 0, + "sentence": "The journalists interviewed the stars of the new movie. [They] were very cooperative, so the interview lasted for a long time.", + "answer1": "The stars", + "answer0": "The journalists", + "sentence_switched": "The stars interviewed the journalists of the new movie. [they] were very cooperative, so the interview lasted for a long time.", + "correct_answer": "The stars", + "relational_word": "interview:persistent cooperative", + "is_associative": 0 + }, + { + "index": 180, + "is_switchable": 0, + "sentence": "The police arrested all of the gang members. [They] were trying to stop the drug trade in the neighborhood.", + "answer1": "The gang members", + "answer0": "The police", + "sentence_switched": "The gang members arrested all of the police. [they] were trying to stop the drug trade in the neighborhood.", + "correct_answer": "The police", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 181, + "is_switchable": 0, + "sentence": "The police arrested all of the gang members. [They] were trying to run the drug trade in the neighborhood.", + "answer1": "The gang members", + "answer0": "The police", + "sentence_switched": "The gang members arrested all of the police. [they] were trying to run the drug trade in the neighborhood.", + "correct_answer": "The gang members", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 182, + "is_switchable": 0, + "sentence": "I put the cake away in the refrigerator. [It] has a lot of butter in it.", + "answer1": "The refrigerator", + "answer0": "The cake", + "sentence_switched": "I put the refrigerator away in the cake. 
[it] has a lot of butter in it.", + "correct_answer": "The cake", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 183, + "is_switchable": 0, + "sentence": "I put the cake away in the refrigerator. [It] has a lot of leftovers in it.", + "answer1": "The refrigerator", + "answer0": "The cake", + "sentence_switched": "I put the refrigerator away in the cake. [it] has a lot of leftovers in it.", + "correct_answer": "The refrigerator", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 184, + "is_switchable": 0, + "sentence": "Sam broke both his ankles and he's walking with crutches. But a month or so from now [they] should be better.", + "answer1": "The crutches", + "answer0": "The ankles", + "sentence_switched": "Sam broke both his crutches and he's walking with ankles. but a month or so from now [they] should be better.", + "correct_answer": "The ankles", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 185, + "is_switchable": 0, + "sentence": "Sam broke both his ankles and he's walking with crutches. But a month or so from now [they] should be unnecessary.", + "answer1": "The crutches", + "answer0": "The ankles", + "sentence_switched": "Sam broke both his crutches and he's walking with ankles. but a month or so from now [they] should be unnecessary.", + "correct_answer": "The crutches", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 186, + "is_switchable": 0, + "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the minority.", + "answer1": "The opponents", + "answer0": "The sponsors", + "sentence_switched": "When the opponents of the bill got to the town hall, they were surprised to find that the room was full of sponsors. 
[they] were very much in the minority.", + "correct_answer": "The sponsors", + "relational_word": "be full of:minority/majority", + "is_associative": 0 + }, + { + "index": 187, + "is_switchable": 0, + "sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. [They] were very much in the majority.", + "answer1": "The opponents", + "answer0": "The sponsors", + "sentence_switched": "When the opponents of the bill got to the town hall, they were surprised to find that the room was full of sponsors. [they] were very much in the majority.", + "correct_answer": "The opponents", + "relational_word": "be full of:minority/majority", + "is_associative": 0 + }, + { + "index": 188, + "is_switchable": 1, + "sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .", + "answer1": "The chocolate chip cookies", + "answer0": "The oatmeal cookies", + "sentence_switched": "Everyone really loved the chocolate chip cookies; only a few people liked the oatmeal cookies. next time, we should make more of [them] .", + "correct_answer": "The oatmeal cookies", + "relational_word": "like over:more/fewer", + "is_associative": 0 + }, + { + "index": 189, + "is_switchable": 1, + "sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make fewer of [them] .", + "answer1": "The chocolate chip cookies", + "answer0": "The oatmeal cookies", + "sentence_switched": "Everyone really loved the chocolate chip cookies; only a few people liked the oatmeal cookies. 
next time, we should make fewer of [them] .", + "correct_answer": "The chocolate chip cookies", + "relational_word": "like over:more/fewer", + "is_associative": 0 + }, + { + "index": 190, + "is_switchable": 0, + "sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .", + "answer1": "chairs", + "answer0": "copies of the newsletter", + "sentence_switched": "We had hoped to place chairs on all the copies of the newsletter in the auditorium, but there were simply not enough of [them] .", + "correct_answer": "copies of the newsletter", + "relational_word": "place on all:not enough/too many", + "is_associative": 0 + }, + { + "index": 191, + "is_switchable": 0, + "sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply too many of [them] .", + "answer1": "chairs", + "answer0": "copies of the newsletter", + "sentence_switched": "We had hoped to place chairs on all the copies of the newsletter in the auditorium, but there were simply too many of [them] .", + "correct_answer": "chairs", + "relational_word": "place on all:not enough/too many", + "is_associative": 0 + }, + { + "index": 192, + "is_switchable": 0, + "sentence": "I stuck a pin through a carrot. When I pulled the pin out, [it] left a hole.", + "answer1": "The carrot", + "answer0": "The pin", + "sentence_switched": "I stuck a carrot through a pin. when i pulled the carrot out, [it] left a hole.", + "correct_answer": "The pin", + "relational_word": "stick:leave have", + "is_associative": 1 + }, + { + "index": 193, + "is_switchable": 0, + "sentence": "I stuck a pin through a carrot. When I pulled the pin out, [it] had a hole.", + "answer1": "The carrot", + "answer0": "The pin", + "sentence_switched": "I stuck a carrot through a pin. 
when i pulled the carrot out, [it] had a hole.", + "correct_answer": "The carrot", + "relational_word": "stick:leave have", + "is_associative": 2 + }, + { + "index": 194, + "is_switchable": 0, + "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because [it] got full of coffee.", + "answer1": "The coffee", + "answer0": "The pen", + "sentence_switched": "I couldn't find a spoon, so i tried using a coffee to stir my pen. but that turned out to be a bad idea, because [it] got full of pen.", + "correct_answer": "The pen", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 195, + "is_switchable": 0, + "sentence": "I couldn't find a spoon, so I tried using a pen to stir my coffee. But that turned out to be a bad idea, because [it] got full of ink.", + "answer1": "The coffee", + "answer0": "The pen", + "sentence_switched": "I couldn't find a spoon, so i tried using a coffee to stir my pen. but that turned out to be a bad idea, because [it] got full of ink.", + "correct_answer": "The coffee", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 196, + "is_switchable": 1, + "sentence": "Steve follows Fred's example in everything. [He] admires him hugely.", + "answer1": "Fred", + "answer0": "Steve", + "sentence_switched": "Fred follows steve's example in everything. [he] admires him hugely.", + "correct_answer": "Steve", + "relational_word": "follow:admire/influence", + "is_associative": 2 + }, + { + "index": 197, + "is_switchable": 1, + "sentence": "Steve follows Fred's example in everything. [He] influences him hugely.", + "answer1": "Fred", + "answer0": "Steve", + "sentence_switched": "Fred follows steve's example in everything. 
[he] influences him hugely.", + "correct_answer": "Fred", + "relational_word": "follow:admire/influence", + "is_associative": 2 + }, + { + "index": 198, + "is_switchable": 0, + "sentence": "The table won't fit through the doorway because [it] is too wide.", + "answer1": "The doorway", + "answer0": "The table", + "sentence_switched": "The doorway won't fit through the table because [it] is too wide.", + "correct_answer": "The table", + "relational_word": "fit through:wide/narrow", + "is_associative": 0 + }, + { + "index": 199, + "is_switchable": 0, + "sentence": "The table won't fit through the doorway because [it] is too narrow.", + "answer1": "The doorway", + "answer0": "The table", + "sentence_switched": "The doorway won't fit through the table because [it] is too narrow.", + "correct_answer": "The doorway", + "relational_word": "fit through:wide/narrow", + "is_associative": 0 + }, + { + "index": 200, + "is_switchable": 1, + "sentence": "Grace was happy to trade me her sweater for my jacket. She thinks [it] looks dowdy on her.", + "answer1": "The jacket", + "answer0": "The sweater", + "sentence_switched": "Grace was happy to trade me her jacket for my sweater. she thinks [it] looks dowdy on her.", + "correct_answer": "The sweater", + "relational_word": "trade:dowdy/great", + "is_associative": 0 + }, + { + "index": 201, + "is_switchable": 1, + "sentence": "Grace was happy to trade me her sweater for my jacket. She thinks [it] looks great on her.", + "answer1": "The jacket", + "answer0": "The sweater", + "sentence_switched": "Grace was happy to trade me her jacket for my sweater. 
she thinks [it] looks great on her.", + "correct_answer": "The jacket", + "relational_word": "trade:dowdy/great", + "is_associative": 0 + }, + { + "index": 202, + "is_switchable": 1, + "sentence": "John hired Bill to take care of [him] .", + "answer1": "Bill", + "answer0": "John", + "sentence_switched": "Bill hired john to take care of [him] .", + "correct_answer": "John", + "relational_word": "hire/hire oneself to:take care of", + "is_associative": 0 + }, + { + "index": 203, + "is_switchable": 1, + "sentence": "John hired himself out to Bill to take care of [him] .", + "answer1": "Bill", + "answer0": "John", + "sentence_switched": "Bill hired himself out to john to take care of [him] .", + "correct_answer": "Bill", + "relational_word": "hire/hire oneself to:take care of", + "is_associative": 0 + }, + { + "index": 204, + "is_switchable": 1, + "sentence": "John promised Bill to leave, so an hour later [he] left.", + "answer1": "Bill", + "answer0": "John", + "sentence_switched": "Bill promised john to leave, so an hour later [he] left.", + "correct_answer": "John", + "relational_word": "promise/order", + "is_associative": 0 + }, + { + "index": 205, + "is_switchable": 1, + "sentence": "John ordered Bill to leave, so an hour later [he] left.", + "answer1": "Bill", + "answer0": "John", + "sentence_switched": "Bill ordered john to leave, so an hour later [he] left.", + "correct_answer": "Bill", + "relational_word": "promise/order", + "is_associative": 0 + }, + { + "index": 206, + "is_switchable": 1, + "sentence": "Sam Goodman's biography of the Spartan general Xenophanes conveys a vivid sense of the difficulties [he] faced in his research.", + "answer1": "Xenophanes", + "answer0": "Goodman", + "sentence_switched": "Sam xenophanes's biography of the spartan general goodman conveys a vivid sense of the difficulties [he] faced in his research.", + "correct_answer": "Goodman", + "relational_word": "none", + "is_associative": 2 + }, + { + "index": 207, + "is_switchable": 1, + 
"sentence": "Sam Goodman's biography of the Spartan general Xenophanes conveys a vivid sense of the difficulties [he] faced in his childhood.", + "answer1": "Xenophanes", + "answer0": "Goodman", + "sentence_switched": "Sam xenophanes's biography of the spartan general goodman conveys a vivid sense of the difficulties [he] faced in his childhood.", + "correct_answer": "Xenophanes", + "relational_word": "none", + "is_associative": 2 + }, + { + "index": 208, + "is_switchable": 0, + "sentence": "Emma's mother had died long ago, and [her] education had been managed by an excellent woman as governess.", + "answer1": "Emma's mother", + "answer0": "Emma", + "sentence_switched": "Emma had died long ago, and [her] education had been managed by an excellent woman as governess.", + "correct_answer": "Emma", + "relational_word": "mother:education place", + "is_associative": 0 + }, + { + "index": 209, + "is_switchable": 0, + "sentence": "Emma's mother had died long ago, and [her] place had been taken by an excellent woman as governess.", + "answer1": "Emma's mother", + "answer0": "Emma", + "sentence_switched": "Emma had died long ago, and [her] place had been taken by an excellent woman as governess.", + "correct_answer": "Emma's mother", + "relational_word": "mother:education place", + "is_associative": 0 + }, + { + "index": 210, + "is_switchable": 1, + "sentence": "Jane knocked on Susan's door but [she] did not get an answer.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on jane's door but [she] did not get an answer.", + "correct_answer": "Jane", + "relational_word": "knock:get an answer/answer", + "is_associative": 0 + }, + { + "index": 211, + "is_switchable": 1, + "sentence": "Jane knocked on Susan's door but [she] did not answer.", + "answer1": "Susan", + "answer0": "Jane", + "sentence_switched": "Susan knocked on jane's door but [she] did not answer.", + "correct_answer": "Susan", + "relational_word": "knock:get an answer/answer", + 
"is_associative": 0 + }, + { + "index": 212, + "is_switchable": 0, + "sentence": "Joe paid the detective after [he] received the final report on the case.", + "answer1": "the detective", + "answer0": "Joe", + "sentence_switched": "The detective paid joe after [he] received the final report on the case.", + "correct_answer": "Joe", + "relational_word": "pay:receive/deliver", + "is_associative": 0 + }, + { + "index": 213, + "is_switchable": 0, + "sentence": "Joe paid the detective after [he] delivered the final report on the case.", + "answer1": "the detective", + "answer0": "Joe", + "sentence_switched": "The detective paid joe after [he] delivered the final report on the case.", + "correct_answer": "the detective", + "relational_word": "pay:receive/deliver", + "is_associative": 0 + }, + { + "index": 214, + "is_switchable": 1, + "sentence": "Beth didn't get angry with Sally, who had cut her off, because [she] stopped and counted to ten.", + "answer1": "Sally", + "answer0": "Beth", + "sentence_switched": "Sally didn't get angry with beth, who had cut her off, because [she] stopped and counted to ten.", + "correct_answer": "Beth", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 215, + "is_switchable": 1, + "sentence": "Beth didn't get angry with Sally, who had cut her off, because [she] stopped and apologized.", + "answer1": "Sally", + "answer0": "Beth", + "sentence_switched": "Sally didn't get angry with beth, who had cut her off, because [she] stopped and apologized.", + "correct_answer": "Sally", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 216, + "is_switchable": 0, + "sentence": "Jim signaled the barman and gestured toward [his] empty glass", + "answer1": "The barman", + "answer0": "Jim", + "sentence_switched": "The barman signaled jim and gestured toward [his] empty glass", + "correct_answer": "Jim", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 217, + "is_switchable": 0, + "sentence": "Jim 
signaled the barman and gestured toward [his] bathroom key.", + "answer1": "The barman", + "answer0": "Jim", + "sentence_switched": "The barman signaled jim and gestured toward [his] bathroom key.", + "correct_answer": "The barman", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 218, + "is_switchable": 1, + "sentence": "Dan took the rear seat while Bill claimed the front because [his] \"Dibs!\" was slow.", + "answer1": "Bill", + "answer0": "Dan", + "sentence_switched": "Bill took the rear seat while dan claimed the front because [his] \"dibs!\" was slow.", + "correct_answer": "Dan", + "relational_word": "?", + "is_associative": 0 + }, + { + "index": 219, + "is_switchable": 1, + "sentence": "Dan took the rear seat while Bill claimed the front because [his] \"Dibs!\" was quicker.", + "answer1": "Bill", + "answer0": "Dan", + "sentence_switched": "Bill took the rear seat while dan claimed the front because [his] \"dibs!\" was quicker.", + "correct_answer": "Bill", + "relational_word": "?", + "is_associative": 0 + }, + { + "index": 220, + "is_switchable": 1, + "sentence": "Tom said \"Check\" to Ralph as he moved [his] bishop.", + "answer1": "Ralph", + "answer0": "Tom", + "sentence_switched": "Ralph said \"check\" to tom as he moved [his] bishop.", + "correct_answer": "Tom", + "relational_word": "say check:move take", + "is_associative": 0 + }, + { + "index": 221, + "is_switchable": 1, + "sentence": "Tom said \"Check\" to Ralph as he took [his] bishop.", + "answer1": "Ralph", + "answer0": "Tom", + "sentence_switched": "Ralph said \"check\" to tom as he took [his] bishop.", + "correct_answer": "Ralph", + "relational_word": "say check:move take", + "is_associative": 0 + }, + { + "index": 222, + "is_switchable": 1, + "sentence": "As Andrea in the crop duster passed over Susan, [she] could see the landing strip.", + "answer1": "Susan", + "answer0": "Andrea", + "sentence_switched": "As susan in the crop duster passed over andrea, [she] could see the 
landing strip.", + "correct_answer": "Andrea", + "relational_word": "?", + "is_associative": 0 + }, + { + "index": 223, + "is_switchable": 1, + "sentence": "As Andrea in the crop duster passed over Susan, [she] could see the landing gear.", + "answer1": "Susan", + "answer0": "Andrea", + "sentence_switched": "As susan in the crop duster passed over andrea, [she] could see the landing gear.", + "correct_answer": "Susan", + "relational_word": "?", + "is_associative": 0 + }, + { + "index": 224, + "is_switchable": 1, + "sentence": "Tom gave Ralph a lift to school so [he] wouldn't have to drive alone.", + "answer1": "Ralph", + "answer0": "Tom", + "sentence_switched": "Ralph gave tom a lift to school so [he] wouldn't have to drive alone.", + "correct_answer": "Tom", + "relational_word": "give a lift:drive alone walk", + "is_associative": 0 + }, + { + "index": 225, + "is_switchable": 1, + "sentence": "Tom gave Ralph a lift to school so [he] wouldn't have to walk.", + "answer1": "Ralph", + "answer0": "Tom", + "sentence_switched": "Ralph gave tom a lift to school so [he] wouldn't have to walk.", + "correct_answer": "Ralph", + "relational_word": "give a lift:drive alone walk", + "is_associative": 0 + }, + { + "index": 226, + "is_switchable": 1, + "sentence": "Bill passed the half-empty plate to John because [he] was full.", + "answer1": "John", + "answer0": "Bill", + "sentence_switched": "John passed the half-empty plate to bill because [he] was full.", + "correct_answer": "Bill", + "relational_word": "pass the plate:full/hungry", + "is_associative": 0 + }, + { + "index": 227, + "is_switchable": 1, + "sentence": "Bill passed the half-empty plate to John because [he] was hungry.", + "answer1": "John", + "answer0": "Bill", + "sentence_switched": "John passed the half-empty plate to bill because [he] was hungry.", + "correct_answer": "John", + "relational_word": "pass the plate:full/hungry", + "is_associative": 0 + }, + { + "index": 228, + "is_switchable": 1, + "sentence": "Bill 
passed the gameboy to John because [his] turn was over.", + "answer1": "John", + "answer0": "Bill", + "sentence_switched": "John passed the gameboy to bill because [his] turn was over.", + "correct_answer": "Bill", + "relational_word": "pass:turn over turn next", + "is_associative": 0 + }, + { + "index": 229, + "is_switchable": 1, + "sentence": "Bill passed the gameboy to John because [his] turn was next.", + "answer1": "John", + "answer0": "Bill", + "sentence_switched": "John passed the gameboy to bill because [his] turn was next.", + "correct_answer": "John", + "relational_word": "pass:turn over turn next", + "is_associative": 0 + }, + { + "index": 230, + "is_switchable": 1, + "sentence": "The man lifted the boy onto [his] shoulders.", + "answer1": "The boy", + "answer0": "The man", + "sentence_switched": "The boy lifted the man onto [his] shoulders.", + "correct_answer": "The man", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 231, + "is_switchable": 1, + "sentence": "The man lifted the boy onto [his] bunk bed.", + "answer1": "The boy", + "answer0": "The man", + "sentence_switched": "The boy lifted the man onto [his] bunk bed.", + "correct_answer": "The boy", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 232, + "is_switchable": 1, + "sentence": "Stretching [her] back, the woman smiled at the girl.", + "answer1": "The girl", + "answer0": "The woman", + "sentence_switched": "Stretching [her] back, the girl smiled at the woman.", + "correct_answer": "The woman", + "relational_word": "stretch pat", + "is_associative": 0 + }, + { + "index": 233, + "is_switchable": 1, + "sentence": "Patting [her] back, the woman smiled at the girl.", + "answer1": "The girl", + "answer0": "The woman", + "sentence_switched": "Patting [her] back, the girl smiled at the woman.", + "correct_answer": "The girl", + "relational_word": "stretch pat", + "is_associative": 0 + }, + { + "index": 234, + "is_switchable": 1, + "sentence": "Billy cried 
because Toby wouldn't accept [his] toy.", + "answer1": "Toby", + "answer0": "Billy", + "sentence_switched": "Toby cried because billy wouldn't accept [his] toy.", + "correct_answer": "Billy", + "relational_word": "accept share", + "is_associative": 0 + }, + { + "index": 235, + "is_switchable": 1, + "sentence": "Billy cried because Toby wouldn't share [his] toy.", + "answer1": "Toby", + "answer0": "Billy", + "sentence_switched": "Toby cried because billy wouldn't share [his] toy.", + "correct_answer": "Toby", + "relational_word": "accept share", + "is_associative": 0 + }, + { + "index": 236, + "is_switchable": 1, + "sentence": "Lily spoke to Donna, breaking [her] silence.", + "answer1": "Donna", + "answer0": "Lily", + "sentence_switched": "Donna spoke to lily, breaking [her] silence.", + "correct_answer": "Lily", + "relational_word": "speak:break silence break concentration", + "is_associative": 0 + }, + { + "index": 237, + "is_switchable": 1, + "sentence": "Lily spoke to Donna, breaking [her] concentration.", + "answer1": "Donna", + "answer0": "Lily", + "sentence_switched": "Donna spoke to lily, breaking [her] concentration.", + "correct_answer": "Donna", + "relational_word": "speak:break silence break concentration", + "is_associative": 0 + }, + { + "index": 238, + "is_switchable": 1, + "sentence": "When Tommy dropped his ice cream, Timmy giggled, so father gave [him] a sympathetic look.", + "answer1": "Timmy", + "answer0": "Tommy", + "sentence_switched": "When timmy dropped his ice cream, tommy giggled, so father gave [him] a sympathetic look.", + "correct_answer": "Tommy", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 239, + "is_switchable": 1, + "sentence": "When Tommy dropped his ice cream, Timmy giggled, so father gave [him] a stern look.", + "answer1": "Timmy", + "answer0": "Tommy", + "sentence_switched": "When timmy dropped his ice cream, tommy giggled, so father gave [him] a stern look.", + "correct_answer": "Timmy", + 
"relational_word": "none", + "is_associative": 0 + }, + { + "index": 240, + "is_switchable": 1, + "sentence": "As Ollie carried Tommy up the long winding steps, [his] legs ached.", + "answer1": "Tommy", + "answer0": "Ollie", + "sentence_switched": "As tommy carried ollie up the long winding steps, [his] legs ached.", + "correct_answer": "Ollie", + "relational_word": "carry:leg ache leg dangle", + "is_associative": 0 + }, + { + "index": 241, + "is_switchable": 1, + "sentence": "As Ollie carried Tommy up the long winding steps, [his] legs dangled.", + "answer1": "Tommy", + "answer0": "Ollie", + "sentence_switched": "As tommy carried ollie up the long winding steps, [his] legs dangled.", + "correct_answer": "Tommy", + "relational_word": "carry:leg ache leg dangle", + "is_associative": 0 + }, + { + "index": 242, + "is_switchable": 0, + "sentence": "The father carried the sleeping boy in [his] arms", + "answer1": "The boy", + "answer0": "The father", + "sentence_switched": "The boy carried the sleeping father in [his] arms", + "correct_answer": "The father", + "relational_word": "carry:in arms in bassinet", + "is_associative": 0 + }, + { + "index": 243, + "is_switchable": 0, + "sentence": "The father carried the sleeping boy in [his] bassinet.", + "answer1": "The boy", + "answer0": "The father", + "sentence_switched": "The boy carried the sleeping father in [his] bassinet.", + "correct_answer": "The boy", + "relational_word": "carry:in arms in bassinet", + "is_associative": 0 + }, + { + "index": 244, + "is_switchable": 1, + "sentence": "The woman held the girl against [her] chest", + "answer1": "The girl", + "answer0": "The woman", + "sentence_switched": "The girl held the woman against [her] chest", + "correct_answer": "The woman", + "relational_word": "hold:against chest against will", + "is_associative": 0 + }, + { + "index": 245, + "is_switchable": 1, + "sentence": "The woman held the girl against [her] will.", + "answer1": "The girl", + "answer0": "The woman", + 
"sentence_switched": "The girl held the woman against [her] will.", + "correct_answer": "The girl", + "relational_word": "hold:against chest against will", + "is_associative": 0 + }, + { + "index": 246, + "is_switchable": 0, + "sentence": "Pam's parents came home and found her having sex with her boyfriend, Paul. [They] were furious about it.", + "answer1": "Pam and Paul", + "answer0": "Pam's parents", + "sentence_switched": "Pam and paul came home and found Pam's parents having sex. [they] were furious about it.", + "correct_answer": "Pam's parents", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 247, + "is_switchable": 0, + "sentence": "Pam's parents came home and found her having sex with her boyfriend, Paul. [They] were embarrassed about it.", + "answer1": "Pam and Paul", + "answer0": "Pam's parents", + "sentence_switched": "Pam and paul came home and found Pam's parents having sex. [they] were embarrassed about it.", + "correct_answer": "Pam and Paul", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 248, + "is_switchable": 0, + "sentence": "Dr. Adams informed Kate that [she] had retired and presented several options for future treatment.", + "answer1": "Kate", + "answer0": "Dr. Adams", + "sentence_switched": "Kate informed dr. adams that [she] had retired and presented several options for future treatment.", + "correct_answer": "Dr. Adams", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 249, + "is_switchable": 0, + "sentence": "Dr. Adams informed Kate that [she] had cancer and presented several options for future treatment.", + "answer1": "Kate", + "answer0": "Dr. Adams", + "sentence_switched": "Kate informed dr. adams that [she] had cancer and presented several options for future treatment.", + "correct_answer": "Kate", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 250, + "is_switchable": 1, + "sentence": "Dan had to stop Bill from toying with the injured bird. 
[He] is very compassionate.", + "answer1": "Bill", + "answer0": "Dan", + "sentence_switched": "Bill had to stop dan from toying with the injured bird. [he] is very compassionate.", + "correct_answer": "Dan", + "relational_word": "stop", + "is_associative": 0 + }, + { + "index": 251, + "is_switchable": 1, + "sentence": "Dan had to stop Bill from toying with the injured bird. [He] is very cruel.", + "answer1": "Bill", + "answer0": "Dan", + "sentence_switched": "Bill had to stop dan from toying with the injured bird. [he] is very cruel.", + "correct_answer": "Bill", + "relational_word": "stop", + "is_associative": 0 + }, + { + "index": 252, + "is_switchable": 1, + "sentence": "George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.", + "answer1": "Eric", + "answer0": "George", + "sentence_switched": "Eric got free tickets to the play, but he gave them to george, even though [he] was particularly eager to see it.", + "correct_answer": "George", + "relational_word": "even though/because/not", + "is_associative": 0 + }, + { + "index": 253, + "is_switchable": 1, + "sentence": "George got free tickets to the play, but he gave them to Eric, because [he] was particularly eager to see it.", + "answer1": "Eric", + "answer0": "George", + "sentence_switched": "Eric got free tickets to the play, but he gave them to george, because [he] was particularly eager to see it.", + "correct_answer": "Eric", + "relational_word": "even though/because/not", + "is_associative": 0 + }, + { + "index": 254, + "is_switchable": 1, + "sentence": "George got free tickets to the play, but he gave them to Eric, because [he] was not particularly eager to see it.", + "answer1": "Eric", + "answer0": "George", + "sentence_switched": "Eric got free tickets to the play, but he gave them to george, because [he] was not particularly eager to see it.", + "correct_answer": "George", + "relational_word": "even though/because/not", + "is_associative": 0 + 
}, + { + "index": 255, + "is_switchable": 1, + "sentence": "Jane gave Joan candy because [she] wasn't hungry.", + "answer1": "Joan", + "answer0": "Jane", + "sentence_switched": "Joan gave jane candy because [she] wasn't hungry.", + "correct_answer": "Jane", + "relational_word": "give:not hungry/hungry", + "is_associative": 0 + }, + { + "index": 256, + "is_switchable": 1, + "sentence": "Jane gave Joan candy because [she] was hungry.", + "answer1": "Joan", + "answer0": "Jane", + "sentence_switched": "Joan gave jane candy because [she] was hungry.", + "correct_answer": "Joan", + "relational_word": "give:not hungry/hungry", + "is_associative": 0 + }, + { + "index": 257, + "is_switchable": 0, + "sentence": "I tried to paint a picture of an orchard, with lemons in the lemon trees, but [they] came out looking more like light bulbs.", + "answer1": "lemon trees", + "answer0": "lemons", + "sentence_switched": "I tried to paint a picture of an orchard, with lemon trees in the lemons, but [they] came out looking more like light bulbs.", + "correct_answer": "lemons", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 258, + "is_switchable": 0, + "sentence": "I tried to paint a picture of an orchard, with lemons in the lemon trees, but [they] came out looking more like telephone poles.", + "answer1": "lemon trees", + "answer0": "lemons", + "sentence_switched": "I tried to paint a picture of an orchard, with lemon trees in the lemons, but [they] came out looking more like telephone poles.", + "correct_answer": "lemon trees", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 259, + "is_switchable": 1, + "sentence": "James asked Robert for a favor but [he] was refused.", + "answer1": "Robert", + "answer0": "James", + "sentence_switched": "Robert asked james for a favor but [he] was refused.", + "correct_answer": "James", + "relational_word": "ask for a favor:refuse/be refused", + "is_associative": 0 + }, + { + "index": 260, + 
"is_switchable": 1, + "sentence": "James asked Robert for a favor but [he] refused.", + "answer1": "Robert", + "answer0": "James", + "sentence_switched": "Robert asked james for a favor but [he] refused.", + "correct_answer": "Robert", + "relational_word": "ask for a favor:refuse/be refused", + "is_associative": 0 + }, + { + "index": 261, + "is_switchable": 1, + "sentence": "Kirilov ceded the presidency to Shatov because [he] was less popular.", + "answer1": "Shatov", + "answer0": "Kirilov", + "sentence_switched": "Shatov ceded the presidency to kirilov because [he] was less popular.", + "correct_answer": "Kirilov", + "relational_word": "cede:less popular/more popular", + "is_associative": 0 + }, + { + "index": 262, + "is_switchable": 1, + "sentence": "Kirilov ceded the presidency to Shatov because [he] was more popular.", + "answer1": "Shatov", + "answer0": "Kirilov", + "sentence_switched": "Shatov ceded the presidency to kirilov because [he] was more popular.", + "correct_answer": "Shatov", + "relational_word": "cede:less popular/more popular", + "is_associative": 0 + }, + { + "index": 263, + "is_switchable": 1, + "sentence": "Emma did not pass the ball to Janie although [she] saw that she was open.", + "answer1": "Janie", + "answer0": "Emma", + "sentence_switched": "Janie did not pass the ball to emma although [she] saw that she was open.", + "correct_answer": "Emma", + "relational_word": "not pass although:see open/open", + "is_associative": 0 + }, + { + "index": 264, + "is_switchable": 1, + "sentence": "Emma did not pass the ball to Janie although [she] was open.", + "answer1": "Janie", + "answer0": "Emma", + "sentence_switched": "Janie did not pass the ball to emma although [she] was open.", + "correct_answer": "Janie", + "relational_word": "not pass although:see open/open", + "is_associative": 0 + }, + { + "index": 265, + "is_switchable": 0, + "sentence": "I put the butterfly wing on the table and [it] broke.", + "answer1": "The table", + "answer0": "The 
butterfly wing", + "sentence_switched": "I put the table on the butterfly wing and [it] broke.", + "correct_answer": "The butterfly wing", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 266, + "is_switchable": 0, + "sentence": "I put the heavy book on the table and [it] broke.", + "answer1": "The table", + "answer0": "The heavy book", + "sentence_switched": "I put the table on the heavy book and [it] broke.", + "correct_answer": "The table", + "relational_word": "none", + "is_associative": 1 + }, + { + "index": 267, + "is_switchable": 0, + "sentence": "Madonna fired her trainer because [she] couldn't stand her boyfriend.", + "answer1": "The trainer", + "answer0": "Madonna", + "sentence_switched": "The trainer fired madonna because [she] couldn't stand her boyfriend.", + "correct_answer": "Madonna", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 268, + "is_switchable": 0, + "sentence": "Madonna fired her trainer because [she] slept with her boyfriend.", + "answer1": "The trainer", + "answer0": "Madonna", + "sentence_switched": "The trainer fired madonna because [she] slept with her boyfriend.", + "correct_answer": "The trainer", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 269, + "is_switchable": 0, + "sentence": "Madonna fired her trainer because she slept with [her] boyfriend.", + "answer1": "The trainer", + "answer0": "Madonna", + "sentence_switched": "The trainer fired madonna because she slept with [her] boyfriend.", + "correct_answer": "Madonna", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 270, + "is_switchable": 0, + "sentence": "Madonna fired her trainer because she couldn't stand [her] boyfriend.", + "answer1": "The trainer", + "answer0": "Madonna", + "sentence_switched": "The trainer fired madonna because she couldn't stand [her] boyfriend.", + "correct_answer": "The trainer", + "relational_word": "none", + "is_associative": 0 + }, + { + "index": 271, + 
"is_switchable": 1, + "sentence": "Carol believed that Rebecca suspected that [she] had stolen the watch.", + "answer1": "Rebecca", + "answer0": "Carol", + "sentence_switched": "Rebecca believed that carol suspected that [she] had stolen the watch.", + "correct_answer": "Carol", + "relational_word": "suspect regret", + "is_associative": 2 + }, + { + "index": 272, + "is_switchable": 1, + "sentence": "Carol believed that Rebecca regretted that [she] had stolen the watch.", + "answer1": "Rebecca", + "answer0": "Carol", + "sentence_switched": "Rebecca believed that carol regretted that [she] had stolen the watch.", + "correct_answer": "Rebecca", + "relational_word": "suspect regret", + "is_associative": 2 + } +] diff --git a/bin/pytorch_pretrained_bert b/bin/pytorch_pretrained_bert deleted file mode 100644 index eee2b4c250c962..00000000000000 --- a/bin/pytorch_pretrained_bert +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -python -m pytorch_pretrained_bert "$@" \ No newline at end of file diff --git a/child_frames.py b/child_frames.py new file mode 100644 index 00000000000000..0650898fa0d7bc --- /dev/null +++ b/child_frames.py @@ -0,0 +1,307 @@ +frames = \ +[ + { + "index": 2, + "orig_sentence": "The trophy doesn't fit into the brown suitcase because [it] is too large/small.", + "entities": ["trophy", "suitcase"], + "entity_substitutes": [["ball", "toy"], ["bag", "box"]], + "determiner": "the", + "packed_relations": ["doesn't fit into/can fit into", "doesn't hold/can hold"], + "packed_relation_substitutes": [["can't be put into/can be put into"], ["doesn't have enough room for/has enough room for"]], + "packed_predicates": ["is large/isn't large", "is small/isn't small"], + }, + { + "index": 4, + "orig_sentence": "Joan made sure to thank Susan for all the help [she] had recieved/given.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Tiffany"]], + "packed_relations": ["thanked/didn't thank", "took good care of/didn't good care of"], 
+ "packed_relation_substitutes": [["felt grateful to/didn't feel grateful to"], ["was appreciated by/wasn't appreciated by"]], + "packed_predicates": ["had received a lot of help/hadn't received a lot of help", "had given a lot of help/hadn't given a lot of help"], + "predicate_dichotomy": False, + }, + { + "index": 4000, + "orig_sentence": "John gave a lot of money to Susan because [he] was very rich/poor.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Linda"]], + "packed_relations": ["gave a lot of money to/didn't give a lot of money to", "received a lot of money from/didn't receive a lot of money from"], + "packed_relation_substitutes": [["subsidized/didn't subsidize"], ["borrowed a lot of money from/didn't borrow any money from"]], + "packed_predicates": ["was rich/wasn't rich", "was poor/wasn't poor"], + }, + { + "index": 10, + "orig_sentence": "The delivery truck zoomed by the school bus because [it] was going so fast/slow.", + "entities": ["truck", "bus"], + "entity_substitutes": [["car", "ambulance"], ["bicycle", "tram"]], + "determiner": "the", + "packed_relations": ["overtook/couldn't overtake", "fell far behind/didn't fall far behind"], + "packed_relation_substitutes": [["zoomed by/didn't pass"], ["was left behind/wasn't left far behind"]], + "packed_predicates": ["was going fast/wasn't going fast", "was going slow/wasn't going slow"], + }, + ## didn't defeated, replace error: didn't defeat -> defeated + { + "index": 12, + "orig_sentence": "Frank felt vindicated/crushed when his longtime rival Bill revealed that [he] was the winner of the competition.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Linda"]], + "packed_relations": ["beat/didn't beat", "lost to/didn't lose to"], + "packed_relation_substitutes": [["defeated/didn't defeat"], ["was defeated by/wasn't defeated by"]], + "relation_suffix": "in the game", + "packed_predicates": ["was happy/wasn't happy", "was 
sad/wasn't sad"], + "reverse_causal": True + }, + { + "index": 16, + "orig_sentence": "The large ball crashed right through the table because [it] was made of steel/styrofoam.", + "entities": ["ball", "board"], + "entity_substitutes": [["bullet", "arrow"], ["shield", "disk"]], + "determiner": "the", + "packed_relations": ["crashed right through/didn't crash through", "failed to block/blocked"], + "packed_relation_substitutes": [["penetrated through/didn't penetrate through"], ["failed to stop/stopped"]], + "packed_predicates": ["was hard/wasn't hard", "was soft/wasn't soft"], + }, + { + "index": 18, + "orig_sentence": "John couldn't see the stage with Billy in front of him because [he] is so short.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Edward"], ["Betty", "Donna"]], + "packed_relations": ["couldn't see the stage behind/could see the stage behind", "blocked the view of/didn't block the view of"], + "packed_relation_substitutes": [["couldn't catch sight of the stage behind/could catch sight of the stage behind"], ["obstructed the sight of/didn't obstruct the sight of"]], + "packed_predicates": ["is short/isn't short", "is tall/isn't tall"], + }, + { + "index": 20, + "orig_sentence": "Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.", + "entities": ["Brian", "Amy"], + "entity_substitutes": [["Charles", "Paul"], ["Emma", "Linda"]], + "packed_relations": ["threw the schoolbag down to/threw the schoolbag up to", "caught the schoolbag thrown down by/caught the schoolbag thrown up by"], + "packed_relation_substitutes": [["cast the schoolbag down to/cast the schoolbag up to"], ["took the schoolbag thrown down by/took the schoolbag thrown up by"]], + "packed_predicates": ["reached the top of the stairs", "reached the bottom of the stairs"], + "conjunctions": [["after", ], ["before", ]] + }, + ## didn't defeated, replace error: didn't defeat -> defeated + { + "index": 22, + "orig_sentence": "Although they ran at 
about the same speed, Sue beat Sally because [she] had such a good start.", + "entities": ["Tom", "Sue"], + "entity_substitutes": [["John", "David"], ["Sally", "Susan"]], + "packed_relations": ["beat/didn't beat", "lost to/didn't lose to"], + "packed_relation_substitutes": [["defeated/didn't defeat"], ["was defeated by/wasn't defeated by"]], + "relation_prefix": "Running at about the same speed,", + "relation_suffix": "in the running race", + "packed_predicates": ["had a good start/didn't have a good start", "had a bad start/didn't have a bad start"], + }, + { + "index": 28, + "orig_sentence": "Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.", + "entities": ["Anna", "Andy"], + "entity_substitutes": [["Lucy", "Nancy"], ["George", "Frank"]], + "packed_relations": ["did better than/didn't do better than", "did worse than/didn't do worse than"], + "packed_relation_substitutes": [["performed better than/didn't perform better than"], ["performed worse than/didn't perform worse than"]], + "relation_suffix": "on the test", + "packed_predicates": ["had studied hard/hadn't studied hard", "was lazy in doing homework/wasn't lazy in doing homework"], + }, + { + "index": 30, + "orig_sentence": "The firemen arrived after the police because [they] were coming from so far away.", + "entities": ["doctor", "police"], + "entity_substitutes": [["worker", "employee"], ["boss", "administrator"]], + "determiner": "the", + "packed_relations": ["arrived after/didn't arrive after", "arrived before/didn't arrive before"], + "packed_relation_substitutes": [["reached here after/didn't reach here after"], ["reached here before/didn't reach here before"]], + "packed_predicates": ["came from far away/didn't come from far away", "came from a close place/didn't come from a close place"], + }, + { + "index": 32000, + "orig_sentence": "Frank was upset with Tom because the toaster [he] had bought from him didn't work.", + "entities": ["Betty", "Henry"], + 
"entity_substitutes": [["Amy", "Linda"], ["Bush", "Frank"]], + "packed_relations": ["was upset with/was pleased with", "was hated by/was loved by"], + "packed_relation_substitutes": [["hated/liked"], ["was disliked by/was liked by"]], + "packed_predicates": ["had bought didn't work/had bought worked well", "had sold didn't work/had sold worked well"], + "predicate_prefix": "the toaster", + "predicate_dichotomy": False, + }, + { + "index": 36, + "orig_sentence": "The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first", + "entities": ["potatoes", "flour"], + "entity_substitutes": [["candy", "rice"], ["beans", "noodles"]], + "determiner": "the bag of", + "packed_relations": ["had been placed above/hadn't been placed above", "had been placed below/hadn't been placed below"], + "packed_relation_substitutes": [["had been put above/hadn't been put above"], ["had been put below/hadn't been put below"]], + "packed_predicates": ["had to be moved first/couldn't be moved first", "had to be moved later/couldn't be moved later"], + "reverse_causal": True + }, + { + "index": 38, + "orig_sentence": "Pete envies Martin although [he] is very successful.", + "entities": ["Peter", "Mandy"], + "entity_substitutes": [["Martin", "Paul"], ["Cindy", "Emma"]], + "packed_relations": ["envied/didn't envy", "was envied by/wasn't envied by"], + "packed_relation_substitutes": [["was jealous of/wasn't jealous of"], ["was admired by/wasn't admired by"]], + "packed_predicates": ["failed/didn't fail", "was successful/wasn't successful"], + }, + { + "index": 42, + "orig_sentence": "I poured water from the bottle into the cup until [it] was empty.", + "entities": ["bottle", "cup"], + "entity_substitutes": [["bowl", "bucket"], ["tube", "container"]], + "determiner": "the", + "packed_relations": ["was filled with water from/leaked into", "leaked into/was filled with water from"], + "packed_relation_substitutes": [["was suffused with water from/dripped water into"], 
["dripped water into/was suffused with water from"]], + "packed_predicates": ["was empty", "was full"], + "conjunctions": [["after", ], ["before", ]] + }, + { + "index": 50, + "orig_sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.", + "entities": ["Joe", "Amy"], + "entity_substitutes": [["David", "Charles"], ["Betty", "Cindy"]], + "packed_relations": ["can beat/can't beat", "often loses to/seldom loses to"], + "packed_relation_substitutes": [["can defeat/can't defeat"], ["is often defeated by/is seldom defeated by"]], + "relation_suffix": "at tennis", + "packed_predicates": ["is older/isn't older", "is younger/isn't younger"], + }, + { + "index": 68, + "orig_sentence": "Ann asked Mary what time the library closes, because [she] had forgotten.", + "entities": ["Ann", "Henry"], + "entity_substitutes": [["Mary", "Linda"], ["Brian", "Michael"]], + "packed_relations": ["asked/didn't ask", "told/didn't tell"], + "packed_relation_substitutes": [["was told by/wasn't told by"], ["was asked by/wasn't asked by"]], + "relation_suffix": "what time the library closes", + "packed_predicates": ["had forgotten/hadn't forgotten", "remembered/didn't remember"], + }, + { + "index": 84, + "orig_sentence": "If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.", + "entities": ["Sam", "Emma"], + "entity_substitutes": [["Paul", "Bush"], ["Susan", "Lucy"]], + "packed_relations": ["succeeded in fooling/failed to fool", "was fooled by/wasn't fooled by"], + "packed_relation_substitutes": [["succeeded in cheating/failed to cheat"], ["was cheated by/wasn't cheated by"]], + "packed_predicates": ["got a lot of money/didn't get a lot of money", "lost a lot of money/didn't lose a lot of money"], + "predicate_dichotomy": False, + "reverse_causal": True + }, + { + "index": 15000, + "orig_sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries later.", + "entities": ["Jack", "Betty"], + 
"entity_substitutes": [["Tom", "Jay"], ["Emily", "Helen"]], + "packed_relations": ["always takes care of/dosen't take care of", "is always taken care of by/isn't taken care of by"], + "packed_relation_substitutes": [["always looks after/dosen't look after"], ["always needs the help of/didn't need the help of"]], + "packed_predicates": ["is older/isn't older", "is younger/isn't younger"], + }, + { + "index": 160, + "orig_sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.", + "entities": ["Betty", "Adele"], + "entity_substitutes": [["Amy", "Cindy"], ["Alberta", "Caroline"]], + "packed_relations": ["replaced/didn't replace", "was changed to/wasn't changed to"], + "packed_relation_substitutes": [["was substituted for/wasn't substituted for"], ["was replaced by/wasn't replaced by"]], + "relation_suffix": "as the actress's new name", + "packed_predicates": ["is easy to pronounce/isn't easy to pronounce", "is hard to pronounce/isn't hard to pronounce"], + }, + { + "index": 1700000, + "orig_sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.", + "entities": ["Germany", "Italy"], + "entity_substitutes": [["Australia", "Japan"], ["Argentina", "Canada"]], + "packed_relations": ["defeated/didn't defeat", "was defeated by/wasn't defeated by"], + "packed_relation_substitutes": [["conquered/didn't conquer"], ["was conquered by/wasn't conquered by"]], + "packed_predicates": ["was more powerful/wasn't more powerful", "was less powerful/wasn't less powerful"], + }, + { + "index": 186, + "orig_sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. 
[They] were very much in the minority", + "entities": ["sponsors", "opponents"], + "entity_substitutes": [["workers", "customers"], ["teachers", "students"]], + "determiner": "the", + "packed_relations": ["were less in number than/were not less in number than", "were more in number than/were not more in number than"], + "packed_relation_substitutes": [["were outnumbered by/were not outnumbered by"], ["outnumbered/didn't outnumber"]], + "packed_predicates": ["were in the minority/were not in the minority", "were in the majority/were not in the majority"], + "reverse_causal": True + }, + { + "index": 188, + "orig_sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .", + "entities": ["cookies", "chips"], + "entity_substitutes": [["apples", "bananas"], ["grapes", "sandwiches"]], + "determiner": "the", + "packed_relations": ["are more popular than/are less popular than", "lose to/don't lose to"], + "packed_relation_substitutes": [["are sold more than/are sold less than"], ["are not as popular as/are as popular as"]], + "packed_predicates": ["should be made more next time/shouldn't be made more next time", "should be made less next time/shouldn't be made less next time"], + "reverse_causal": True + }, + { + "index": 190, + "orig_sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .", + "entities": ["newspapers", "chairs"], + "entity_substitutes": [["cups", "pictures"], ["tables", "benches"]], + "determiner": "the", + "packed_relations": ["could be placed on all/couldn't be placed on all", "could all be covered by/couldn't all be covered by"], + "packed_relation_substitutes": [["could be put on all/couldn't be put on all"], ["could carry all/couldn't carry all"]], + "packed_predicates": ["there were many of/there were not many of", "there were few of/there were not few of"], + 
"prepositive_pred": True, + }, + { + "index": 19600, + "orig_sentence": "Steve follows Fred's example in everything. [He] admires him hugely.", + "entities": ["Steve", "Lucy"], + "entity_substitutes": [["Fred", "George"], ["Lily", "Wendy"]], + "packed_relations": ["follows/doesn't follow", "is followed by/isn't followed by"], + "packed_relation_substitutes": [["imitates/doesn't imitate"], ["is imitated by/isn't imitated by"]], + "relation_suffix": "in everything", + "packed_predicates": ["is bad at making decisions/isn't bad at making decisions", "is good at making decisions/isn't good at making decisions"], + }, + { + "index": 198, + "orig_sentence": "The table won't fit through the doorway because [it] is too wide.", + "entities": ["table", "doorway"], + "entity_substitutes": [["desk", "sofa"], ["corridor", "hallway"]], + "determiner": "the", + "packed_relations": ["will fit through/won't fit through", "will be fitted through by/won't be fitted through by"], + "packed_relation_substitutes": [["will pass through/won't pass through"], ["will be passed through by/won't be passed through by"]], + "packed_predicates": ["is narrow/isn't narrow", "is wide/isn't wide"], + }, + { + "index": 2000000, + "orig_sentence": "Grace was happy to trade me her sweater for my jacket. 
She thinks [it] looks dowdy on her.", + "entities": ["sweater", "jacket"], + "entity_substitutes": [["skirt", "cap"], ["hat", "short"]], + "determiner": "the", + "packed_relations": ["is traded by Grace for/isn't traded by Grace for", "is substituted by Grace for/isn't substituted by Grace for"], + "packed_relation_substitutes": [["is replaced by Grace with/isn't replaced by Grace with"], ["is preferred by Grace to/isn't preferred by Grace to"]], + "packed_predicates": ["looks bad/looks not bad", "looks good/looks not good"], + "predicate_prefix": "she thinks", + }, + { + "index": 226, + "orig_sentence": "Bill passed the half-empty plate to John because [he] was full.", + "entities": ["Bill", "Amy"], + "entity_substitutes": [["Brian", "David"], ["Emma", "Helen"]], + "packed_relations": ["passed the half-empty plate to/didn't pass the half-empty plate to", "received the half-empty plate from/didn't received the half-empty plate from"], + "packed_relation_substitutes": [["gave the half-empty plate to/didn't give the half-empty plate to"], ["took the half-empty plate from/didn't take the half-empty plate from"]], + "packed_predicates": ["was full/wasn't full", "was hungry/wasn't hungry"], + }, + { + "index": 252, + "orig_sentence": "George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.", + "entities": ["George", "Linda"], + "entity_substitutes": [["Eric", "Ted"], ["Cindy", "Lucy"]], + "packed_relations": ["gave the tickets of the play to/didn't give the tickets of the play to", "received the tickets of the play from/didn't receive the tickets of the play from"], + "packed_relation_substitutes": [["sent the tickets of the play to/didn't send the tickets of the play to"], ["took the tickets of the play from/didn't take the tickets of the play from"]], + "packed_predicates": ["wasn't interested in it/was interested in it", "was eager to see it/wasn't eager to see it"], + }, + { + "index": 261, + "orig_sentence": 
def assert_in_bert_vocab(tokens):
    """Assert that every given token is present in the tokenizer vocabulary.

    Each element of *tokens* is handled according to its type:

    * ``str`` (an entity name): its lower-cased form must be in
      ``tokenizer.vocab``.
    * ``tuple`` (a relation pair): it must have exactly two members, and the
      part of each member before the first ``'..'`` must be in
      ``tokenizer.vocab`` (looked up as written, without lower-casing).
    * any other type is ignored.

    Raises:
        AssertionError: on the first failing check. For vocabulary misses the
            message shows how the tokenizer splits the offending word.
    """
    for token in tokens:
        if isinstance(token, str):
            # Entity: looked up lower-cased.
            assert token.lower() in tokenizer.vocab, token + '->' + str(tokenizer.tokenize(token))
            continue
        if not isinstance(token, tuple):
            continue
        # Relation pair such as ('taller..than', 'shorter..than').
        assert len(token) == 2, str(token)
        for packed in token:
            head = packed.split('..')[0]
            assert head in tokenizer.vocab, head + '->' + str(tokenizer.tokenize(head))
def comparative2superlative(comparative_form, structured=False):
    """Turn a comparative ending in "er" into a superlative phrase.

    Args:
        comparative_form: comparative adjective, e.g. ``'taller'``.
        structured: when false, the trailing "er" is replaced by "est"
            (``'taller'`` -> ``'the tallest'``); when true, the comparative is
            kept intact and a separate ``' st'`` piece is appended
            (``'taller'`` -> ``'the taller st'``).

    Raises:
        AssertionError: if *comparative_form* does not end in "er". Raised
            explicitly so the check is not stripped under ``python -O``.
    """
    if not comparative_form.endswith('er'):
        raise AssertionError(comparative_form)
    if structured:
        return 'the ' + comparative_form + ' st'
    return 'the ' + comparative_form[:-2] + 'est'


def make_relational_atoms(relational_template, entities, relations):
    """Render four statements about one entity pair.

    Args:
        relational_template: format string with ``{ent0}``, ``{rel}`` and
            ``{ent1}`` fields.
        entities: list of two entity strings.
        relations: sequence of relation phrases; the first and last are used,
            e.g. ``['taller than', 'shorter than']``.

    Returns:
        A list of four strings, in this order: the pair with "is" + the first
        relation, the swapped pair with "is" + the last relation, the pair
        with "isn't" + the last relation, and the swapped pair with "isn't" +
        the first relation.
    """
    positive = ["is " + r for r in relations]
    negative = ["isn't " + r for r in relations]
    swapped = entities[::-1]
    rows = [
        entities + positive[:1],
        swapped + positive[::-1][:1],
        entities + negative[::-1][:1],
        swapped + negative[:1],
    ]
    return [relational_template.format(ent0=ent0, ent1=ent1, rel=rel)
            for ent0, ent1, rel in rows]


transitive_P_template = '{ent0} {rel} {ent1} .'
transitive_wh_QA_template = '{which} is {pred} ? {ent} .'
transitive_yesno_QA_template = 'is {ent0} {rel} {ent1} ? {ans} .'


def make_transitive(P_template, wh_QA_template, yesno_QA_template, join_template,
                    index=-1, orig_sentence='', entities=["John", "Mary", "Susan"], entity_substitutes=None, determiner="",
                    relations=('taller..than', 'shorter..than'), maybe=True, structured=False,
                    packed_predicates=["pred0/~pred0", "pred1/~pred1"], predicate_substitutes=None,
                    predicate_dichotomy=True, reverse_causal=False):
    """Build "premise premise ||| question answer" strings for three entities.

    Two premises relate pairs of entities; each premise pair is combined with
    each question. The answer word is wrapped by ``mask``.

    Args:
        P_template: premise template (``{ent0}``, ``{rel}``, ``{ent1}``).
        wh_QA_template: wh-question template (``{which}``, ``{pred}``,
            ``{ent}``).
        yesno_QA_template: yes/no-question template (``{ent0}``, ``{rel}``,
            ``{ent1}``, ``{ans}``).
        entities: at least three entity strings. If the first one is all
            lower-case, every entity is prefixed with ``'the '``.
        relations: pair of relation specs. If the first contains ``'..'``
            (``'taller..than'``), the relation phrase is the spec with ``'..'``
            replaced by a space and the predicate is the part before
            ``'..'``; otherwise the spec is split on ``'/'``, first part being
            the relation and last part the predicate.
        maybe: when false the premises are (e0, e1) and (e1, e2) and the
            answers are definite; when true the premises are (e0, e1) and
            (e0, e2) and the answers become ``unknown`` / ``maybe``.
        structured: forwarded to ``comparative2superlative``; when true, only
            combinations whose two premises and question all mention the same
            relation head word are kept.
        join_template, index, orig_sentence, entity_substitutes, determiner,
        packed_predicates, predicate_substitutes, predicate_dichotomy,
        reverse_causal: accepted but not used by this function.

    Returns:
        A list of sentence strings.

    Raises:
        AssertionError: from ``comparative2superlative`` or ``mask``, or when
            ``structured`` is true and a string contains none of the relation
            head words.
    """
    if entities[0].islower():
        entities = ['the ' + e for e in entities]

    # Derive predicates first: `relations` is overwritten right after.
    if '..' in relations[0]:
        predicates = [r.split('..')[0] for r in relations]
        relations = [r.replace('..', ' ') for r in relations]
    else:
        predicates = [r.split('/')[-1] for r in relations]
        relations = [r.split('/')[0] for r in relations]
    predicates = [comparative2superlative(p, structured=structured) for p in predicates]

    P0_entities = [entities[0], entities[1]]
    P1_entities = [entities[0], entities[2]] if maybe else [entities[1], entities[2]]
    P0 = make_relational_atoms(P_template, P0_entities, relations)
    P1 = make_relational_atoms(P_template, P1_entities, relations)

    wh_pronoun = 'which' if entities[0].startswith('the') else 'who'
    last_answer = 'unknown' if maybe else entities[-1]
    wh_rows = [(predicates[0], mask(entities[0])),
               (predicates[-1], mask(last_answer))]
    wh_QA = [wh_QA_template.format(which=wh_pronoun, pred=pred, ent=ent)
             for pred, ent in wh_rows]

    yes_ans = mask('maybe' if maybe else 'yes')
    no_ans = mask('maybe' if maybe else 'no')
    subj = entities[1] if maybe else entities[0]
    obj = entities[-1]
    yesno_rows = [
        (subj, obj, relations[0], yes_ans),
        (subj, obj, relations[-1], no_ans),
        (obj, subj, relations[-1], yes_ans),
        (obj, subj, relations[0], no_ans),
    ]
    yesno_QA = [yesno_QA_template.format(ent0=ent0, ent1=ent1, rel=rel, ans=ans)
                for ent0, ent1, rel, ans in yesno_rows]

    # Premises are paired in both orders.
    Ps = list(product(P0, P1)) + list(product(P1, P0))
    QAs = wh_QA + yesno_QA

    def get_rel(atom):
        """Return the first relation head word ("taller") found in *atom*."""
        for rel in relations:
            head = rel.split()[0]  # "taller than" -> "taller"
            if head in atom:
                return head
        # Explicit raise: a bare `assert False` vanishes under `python -O`,
        # making every atom compare equal (None == None) in the filter below.
        raise AssertionError(atom)

    return [p0 + ' ' + p1 + ' ||| ' + qa
            for (p0, p1), qa in product(Ps, QAs)
            if not structured or get_rel(p0) == get_rel(p1) == get_rel(qa)]
def reverse(l):
    """Return the items of *l* in reverse order.

    A list input yields a new list; any other reversible input yields a tuple.
    """
    flipped = reversed(l)
    if isinstance(l, list):
        return list(flipped)
    return tuple(flipped)


def mask(ent_str):
    """Wrap the head word of an entity string in square brackets.

    ``'John'`` -> ``'[John]'`` and ``'the ball'`` -> ``'the [ball]'``.
    Surrounding whitespace is ignored.

    Raises:
        AssertionError: if *ent_str* is empty, has more than two words, or has
            two words of which the first is not ``'the'``. Raised explicitly
            so the checks are not stripped under ``python -O``.
    """
    words = ent_str.strip().split()
    if len(words) == 1:
        return '[%s]' % words[0]
    if len(words) == 2 and words[0] == 'the':
        return '%s [%s]' % (words[0], words[1])
    raise AssertionError(ent_str)
def make_sentences(index=-1, orig_sentence='', entities=["John", "Mary"], entity_substitutes=None, determiner="",
                   packed_relations=["rel/~rel", "rev_rel/~rev_rel"], packed_relation_substitutes=None,
                   relation_prefix="", relation_suffix="",
                   packed_predicates=["pred0/~pred0", "pred1/~pred1"], predicate_prefix="", prepositive_pred=False,
                   predicate_dichotomy=True, reverse_causal=False, conjunctions=[["because", "so"], ["although", "but"]]):
    """Generate sentences from one frame: a relation clause, a conjunction, a predicate clause.

    Relation clauses (A) are rendered with ``A_template``, predicate clauses
    (B) with one of ``B_templates``; in B the entity is wrapped by ``mask``.

    Args:
        index, orig_sentence: accepted but not used by this function.
        entities: two entity words; both must be in ``tokenizer.vocab`` when
            lower-cased.
        entity_substitutes: optional two lists of two replacement words each,
            one list per entity.
        determiner: text inserted before each entity.
        packed_relations: two ``"positive/negative"`` strings; the first is
            used with the entities as given, the second with them swapped.
        packed_relation_substitutes: optional alternatives, laid out like
            ``packed_relations`` but with a list of candidates in each slot.
            ``None`` (the default) adds no alternatives.
        relation_prefix, relation_suffix: text placed around the A clause.
        packed_predicates: two predicates, one per entity, each either
            ``"positive/negative"`` or a plain string (then no negative form
            exists).
        predicate_prefix: text placed before the B clause.
        prepositive_pred: selects ``B_templates[1]`` instead of
            ``B_templates[0]``.
        predicate_dichotomy: when true, the reversed negative predicates are
            also added as positive statements, and the reversed positive
            predicates as negative ones.
        reverse_causal: picks index 1 instead of 0 in the template lists and
            in each conjunction list.
        conjunctions: ``[causal_options, turning_options]``.

    Returns:
        ``(causal_sentences, turning_sentences, substituted_sentences)``.
        Causal sentences pair A with B and negated A with negated B; turning
        sentences pair A with negated B and negated A with B. The third list
        holds both kinds plus the relation-substitute variants, expanded over
        every entity combination when ``entity_substitutes`` is given.

    Raises:
        AssertionError: if an entity or substitute is not in the vocabulary,
            or the substitutes and entities are not six distinct words.
    """
    assert entities[0].lower() in tokenizer.vocab , entities[0]
    assert entities[1].lower() in tokenizer.vocab , entities[1]

    def form_As(packed_rels):
        """Render positive and negative relation clauses for both entity orders."""
        relations, neg_relations = zip(*[rel.split("/") for rel in packed_rels])
        relations, neg_relations = list(relations), list(neg_relations)

        As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix)
              for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]]
        negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix)
                 for ent0, ent1, rel in [entities + neg_relations[:1], reverse(entities) + reverse(neg_relations)[:1]]]
        return As, negAs

    As, negAs = form_As(packed_relations)

    substituted_As, substituted_negAs = [], []
    # Guarded: the default None used to reach zip(*None) and raise TypeError.
    if packed_relation_substitutes is not None:
        for packed_rel_subs in zip(*packed_relation_substitutes):
            subs_As, subs_negAs = form_As(packed_rel_subs)
            substituted_As += subs_As
            substituted_negAs += subs_negAs

    if "/" in packed_predicates[0]:
        predicates, neg_predicates = zip(*[pred.split("/") for pred in packed_predicates])
        predicates, neg_predicates = list(predicates), list(neg_predicates)
    else:
        predicates, neg_predicates = packed_predicates, []

    B_template = B_templates[int(prepositive_pred)]

    def form_Bs(preds):
        """Render one predicate clause per (entity, predicate) pair."""
        return [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix)
                for ent, pred in zip(entities, preds)]

    Bs = form_Bs(predicates)
    negBs = form_Bs(neg_predicates)
    if predicate_dichotomy:
        Bs += form_Bs(reversed(neg_predicates))
        negBs += form_Bs(reversed(predicates))

    def form_sentences(sentence_template, As, Bs, conj):
        # split/join collapses the runs of spaces left by empty template fields.
        return [" ".join(sentence_template.format(A=A, B=B, conj=conj).split()) for A, B in product(As, Bs)]

    def form_all_sentences(As, negAs, Bs, negBs):
        causal_sentences = []
        causal_conj = conjunctions[0][int(reverse_causal)]
        for causal_template in causal_templates[int(reverse_causal)]:
            for A, B in [(As, Bs), (negAs, negBs)]:
                causal_sentences += form_sentences(causal_template, A, B, causal_conj)

        turning_sentences = []
        turning_conj = conjunctions[1][int(reverse_causal)]
        for turning_template in turning_templates[int(reverse_causal)]:
            for A, B in [(As, negBs), (negAs, Bs)]:
                turning_sentences += form_sentences(turning_template, A, B, turning_conj)

        sentences = causal_sentences + turning_sentences
        return sentences, causal_sentences, turning_sentences

    sentences, causal_sentences, turning_sentences = form_all_sentences(As, negAs, Bs, negBs)
    # Copy so that extending the substituted list never touches `sentences`.
    substituted_sentences = list(sentences)

    if packed_relation_substitutes is not None:
        substituted_sentences += form_all_sentences(substituted_As, substituted_negAs, Bs, negBs)[0]

    if entity_substitutes is not None:
        for sub in entity_substitutes:
            for ent in sub:
                assert ent.lower() in tokenizer.vocab , ent + " not in BERT vocab"
        assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes
        assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6

        # Every (first-entity choice, second-entity choice) pair, originals included.
        entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1]))
        # NOTE(review): plain substring replacement; an entity name occurring
        # inside another word of the sentence would be replaced as well.
        substituted_sentences = [sent.replace(entities[0], sub[0]).replace(entities[1], sub[1])
                                 for sent in substituted_sentences for sub in entity_substitutes]
    return causal_sentences, turning_sentences, substituted_sentences
or from_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) + and s.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)] + + # Get related forms + derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas] + + # filter only the desired pos (consider 'a' and 's' equivalent) + related_noun_lemmas = [l for drf in derivationally_related_forms + for l in drf[1] + if l.synset().name().split('.')[1] == to_pos + or to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) + and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE)] + + # Extract the words from the lemmas + words = [l.name() for l in related_noun_lemmas] + len_words = len(words) + + # Build the result in the form of a list containing tuples (word, probability) + result = [(w, float(words.count(w))/len_words) for w in set(words)] + result.sort(key=lambda w: -w[1]) + + # return all the possibilities sorted by probability + return result + + +print(convert("death", WN_NOUN, WN_VERB)) +# [('die', 0.75), ('end', 0.2), ('decease', 0.05)] + +print(convert("story", WN_NOUN, WN_VERB)) +# [('report', 0.2222222222222222), ('tell', 0.2222222222222222), ('narrate', 0.2222222222222222),... + +print(convert("boring", WN_ADJECTIVE, WN_NOUN)) +# [('tedium', 0.3333333333333333), ('dullness', 0.16666666666666666),... 
+ +print(convert("trouble", WN_NOUN, WN_ADJECTIVE)) +# [('troublous', 0.6666666666666666), ('problematical', 0.3333333333333333)] + +print(convert("solve", WN_VERB, WN_ADJECTIVE_SATELLITE)) +# [('solvent', 0.5), ('workable', 0.5)] + +print(convert("think", WN_VERB, WN_ADJECTIVE)) +# [('cogitative', 0.6666666666666666), ('recollective', 0.3333333333333333)] diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 00000000000000..e47eb548f9a13f --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,7 @@ +FROM pytorch/pytorch:latest + +RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext + +RUN pip install pytorch-pretrained-bert + +WORKDIR /workspace \ No newline at end of file diff --git a/examples.tar b/examples.tar new file mode 100644 index 00000000000000..03e1c603a947ce Binary files /dev/null and b/examples.tar differ diff --git a/examples/child_frames.py b/examples/child_frames.py new file mode 100644 index 00000000000000..0650898fa0d7bc --- /dev/null +++ b/examples/child_frames.py @@ -0,0 +1,307 @@ +frames = \ +[ + { + "index": 2, + "orig_sentence": "The trophy doesn't fit into the brown suitcase because [it] is too large/small.", + "entities": ["trophy", "suitcase"], + "entity_substitutes": [["ball", "toy"], ["bag", "box"]], + "determiner": "the", + "packed_relations": ["doesn't fit into/can fit into", "doesn't hold/can hold"], + "packed_relation_substitutes": [["can't be put into/can be put into"], ["doesn't have enough room for/has enough room for"]], + "packed_predicates": ["is large/isn't large", "is small/isn't small"], + }, + { + "index": 4, + "orig_sentence": "Joan made sure to thank Susan for all the help [she] had recieved/given.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Tiffany"]], + "packed_relations": ["thanked/didn't thank", "took good care of/didn't good care of"], + "packed_relation_substitutes": [["felt grateful to/didn't 
feel grateful to"], ["was appreciated by/wasn't appreciated by"]], + "packed_predicates": ["had received a lot of help/hadn't received a lot of help", "had given a lot of help/hadn't given a lot of help"], + "predicate_dichotomy": False, + }, + { + "index": 4000, + "orig_sentence": "John gave a lot of money to Susan because [he] was very rich/poor.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Linda"]], + "packed_relations": ["gave a lot of money to/didn't give a lot of money to", "received a lot of money from/didn't receive a lot of money from"], + "packed_relation_substitutes": [["subsidized/didn't subsidize"], ["borrowed a lot of money from/didn't borrow any money from"]], + "packed_predicates": ["was rich/wasn't rich", "was poor/wasn't poor"], + }, + { + "index": 10, + "orig_sentence": "The delivery truck zoomed by the school bus because [it] was going so fast/slow.", + "entities": ["truck", "bus"], + "entity_substitutes": [["car", "ambulance"], ["bicycle", "tram"]], + "determiner": "the", + "packed_relations": ["overtook/couldn't overtake", "fell far behind/didn't fall far behind"], + "packed_relation_substitutes": [["zoomed by/didn't pass"], ["was left behind/wasn't left far behind"]], + "packed_predicates": ["was going fast/wasn't going fast", "was going slow/wasn't going slow"], + }, + ## didn't defeated, replace error: didn't defeat -> defeated + { + "index": 12, + "orig_sentence": "Frank felt vindicated/crushed when his longtime rival Bill revealed that [he] was the winner of the competition.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Michael"], ["Mary", "Linda"]], + "packed_relations": ["beat/didn't beat", "lost to/didn't lose to"], + "packed_relation_substitutes": [["defeated/didn't defeat"], ["was defeated by/wasn't defeated by"]], + "relation_suffix": "in the game", + "packed_predicates": ["was happy/wasn't happy", "was sad/wasn't sad"], + "reverse_causal": True + }, + { + 
"index": 16, + "orig_sentence": "The large ball crashed right through the table because [it] was made of steel/styrofoam.", + "entities": ["ball", "board"], + "entity_substitutes": [["bullet", "arrow"], ["shield", "disk"]], + "determiner": "the", + "packed_relations": ["crashed right through/didn't crash through", "failed to block/blocked"], + "packed_relation_substitutes": [["penetrated through/didn't penetrate through"], ["failed to stop/stopped"]], + "packed_predicates": ["was hard/wasn't hard", "was soft/wasn't soft"], + }, + { + "index": 18, + "orig_sentence": "John couldn't see the stage with Billy in front of him because [he] is so short.", + "entities": ["John", "Susan"], + "entity_substitutes": [["David", "Edward"], ["Betty", "Donna"]], + "packed_relations": ["couldn't see the stage behind/could see the stage behind", "blocked the view of/didn't block the view of"], + "packed_relation_substitutes": [["couldn't catch sight of the stage behind/could catch sight of the stage behind"], ["obstructed the sight of/didn't obstruct the sight of"]], + "packed_predicates": ["is short/isn't short", "is tall/isn't tall"], + }, + { + "index": 20, + "orig_sentence": "Tom threw his schoolbag down to Ray after [he] reached the top of the stairs.", + "entities": ["Brian", "Amy"], + "entity_substitutes": [["Charles", "Paul"], ["Emma", "Linda"]], + "packed_relations": ["threw the schoolbag down to/threw the schoolbag up to", "caught the schoolbag thrown down by/caught the schoolbag thrown up by"], + "packed_relation_substitutes": [["cast the schoolbag down to/cast the schoolbag up to"], ["took the schoolbag thrown down by/took the schoolbag thrown up by"]], + "packed_predicates": ["reached the top of the stairs", "reached the bottom of the stairs"], + "conjunctions": [["after", ], ["before", ]] + }, + ## didn't defeated, replace error: didn't defeat -> defeated + { + "index": 22, + "orig_sentence": "Although they ran at about the same speed, Sue beat Sally because [she] had 
such a good start.", + "entities": ["Tom", "Sue"], + "entity_substitutes": [["John", "David"], ["Sally", "Susan"]], + "packed_relations": ["beat/didn't beat", "lost to/didn't lose to"], + "packed_relation_substitutes": [["defeated/didn't defeat"], ["was defeated by/wasn't defeated by"]], + "relation_prefix": "Running at about the same speed,", + "relation_suffix": "in the running race", + "packed_predicates": ["had a good start/didn't have a good start", "had a bad start/didn't have a bad start"], + }, + { + "index": 28, + "orig_sentence": "Anna did a lot better than her good friend Lucy on the test because [she] had studied so hard.", + "entities": ["Anna", "Andy"], + "entity_substitutes": [["Lucy", "Nancy"], ["George", "Frank"]], + "packed_relations": ["did better than/didn't do better than", "did worse than/didn't do worse than"], + "packed_relation_substitutes": [["performed better than/didn't perform better than"], ["performed worse than/didn't perform worse than"]], + "relation_suffix": "on the test", + "packed_predicates": ["had studied hard/hadn't studied hard", "was lazy in doing homework/wasn't lazy in doing homework"], + }, + { + "index": 30, + "orig_sentence": "The firemen arrived after the police because [they] were coming from so far away.", + "entities": ["doctor", "police"], + "entity_substitutes": [["worker", "employee"], ["boss", "administrator"]], + "determiner": "the", + "packed_relations": ["arrived after/didn't arrive after", "arrived before/didn't arrive before"], + "packed_relation_substitutes": [["reached here after/didn't reach here after"], ["reached here before/didn't reach here before"]], + "packed_predicates": ["came from far away/didn't come from far away", "came from a close place/didn't come from a close place"], + }, + { + "index": 32000, + "orig_sentence": "Frank was upset with Tom because the toaster [he] had bought from him didn't work.", + "entities": ["Betty", "Henry"], + "entity_substitutes": [["Amy", "Linda"], ["Bush", 
"Frank"]], + "packed_relations": ["was upset with/was pleased with", "was hated by/was loved by"], + "packed_relation_substitutes": [["hated/liked"], ["was disliked by/was liked by"]], + "packed_predicates": ["had bought didn't work/had bought worked well", "had sold didn't work/had sold worked well"], + "predicate_prefix": "the toaster", + "predicate_dichotomy": False, + }, + { + "index": 36, + "orig_sentence": "The sack of potatoes had been placed above the bag of flour, so [it] had to be moved first", + "entities": ["potatoes", "flour"], + "entity_substitutes": [["candy", "rice"], ["beans", "noodles"]], + "determiner": "the bag of", + "packed_relations": ["had been placed above/hadn't been placed above", "had been placed below/hadn't been placed below"], + "packed_relation_substitutes": [["had been put above/hadn't been put above"], ["had been put below/hadn't been put below"]], + "packed_predicates": ["had to be moved first/couldn't be moved first", "had to be moved later/couldn't be moved later"], + "reverse_causal": True + }, + { + "index": 38, + "orig_sentence": "Pete envies Martin although [he] is very successful.", + "entities": ["Peter", "Mandy"], + "entity_substitutes": [["Martin", "Paul"], ["Cindy", "Emma"]], + "packed_relations": ["envied/didn't envy", "was envied by/wasn't envied by"], + "packed_relation_substitutes": [["was jealous of/wasn't jealous of"], ["was admired by/wasn't admired by"]], + "packed_predicates": ["failed/didn't fail", "was successful/wasn't successful"], + }, + { + "index": 42, + "orig_sentence": "I poured water from the bottle into the cup until [it] was empty.", + "entities": ["bottle", "cup"], + "entity_substitutes": [["bowl", "bucket"], ["tube", "container"]], + "determiner": "the", + "packed_relations": ["was filled with water from/leaked into", "leaked into/was filled with water from"], + "packed_relation_substitutes": [["was suffused with water from/dripped water into"], ["dripped water into/was suffused with water 
from"]], + "packed_predicates": ["was empty", "was full"], + "conjunctions": [["after", ], ["before", ]] + }, + { + "index": 50, + "orig_sentence": "Joe's uncle can still beat him at tennis, even though [he] is 30 years younger.", + "entities": ["Joe", "Amy"], + "entity_substitutes": [["David", "Charles"], ["Betty", "Cindy"]], + "packed_relations": ["can beat/can't beat", "often loses to/seldom loses to"], + "packed_relation_substitutes": [["can defeat/can't defeat"], ["is often defeated by/is seldom defeated by"]], + "relation_suffix": "at tennis", + "packed_predicates": ["is older/isn't older", "is younger/isn't younger"], + }, + { + "index": 68, + "orig_sentence": "Ann asked Mary what time the library closes, because [she] had forgotten.", + "entities": ["Ann", "Henry"], + "entity_substitutes": [["Mary", "Linda"], ["Brian", "Michael"]], + "packed_relations": ["asked/didn't ask", "told/didn't tell"], + "packed_relation_substitutes": [["was told by/wasn't told by"], ["was asked by/wasn't asked by"]], + "relation_suffix": "what time the library closes", + "packed_predicates": ["had forgotten/hadn't forgotten", "remembered/didn't remember"], + }, + { + "index": 84, + "orig_sentence": "If the con artist has succeeded in fooling Sam, [he] would have gotten a lot of money.", + "entities": ["Sam", "Emma"], + "entity_substitutes": [["Paul", "Bush"], ["Susan", "Lucy"]], + "packed_relations": ["succeeded in fooling/failed to fool", "was fooled by/wasn't fooled by"], + "packed_relation_substitutes": [["succeeded in cheating/failed to cheat"], ["was cheated by/wasn't cheated by"]], + "packed_predicates": ["got a lot of money/didn't get a lot of money", "lost a lot of money/didn't lose a lot of money"], + "predicate_dichotomy": False, + "reverse_causal": True + }, + { + "index": 15000, + "orig_sentence": "Jackson was greatly influenced by Arnold, though [he] lived two centuries later.", + "entities": ["Jack", "Betty"], + "entity_substitutes": [["Tom", "Jay"], ["Emily", 
"Helen"]], + "packed_relations": ["always takes care of/dosen't take care of", "is always taken care of by/isn't taken care of by"], + "packed_relation_substitutes": [["always looks after/dosen't look after"], ["always needs the help of/didn't need the help of"]], + "packed_predicates": ["is older/isn't older", "is younger/isn't younger"], + }, + { + "index": 160, + "orig_sentence": "The actress used to be named Terpsichore, but she changed it to Tina a few years ago, because she figured [it] was too hard to pronounce.", + "entities": ["Betty", "Adele"], + "entity_substitutes": [["Amy", "Cindy"], ["Alberta", "Caroline"]], + "packed_relations": ["replaced/didn't replace", "was changed to/wasn't changed to"], + "packed_relation_substitutes": [["was substituted for/wasn't substituted for"], ["was replaced by/wasn't replaced by"]], + "relation_suffix": "as the actress's new name", + "packed_predicates": ["is easy to pronounce/isn't easy to pronounce", "is hard to pronounce/isn't hard to pronounce"], + }, + { + "index": 1700000, + "orig_sentence": "In July, Kamtchatka declared war on Yakutsk. Since Yakutsk's army was much better equipped and ten times larger, [they] were defeated within weeks.", + "entities": ["Germany", "Italy"], + "entity_substitutes": [["Australia", "Japan"], ["Argentina", "Canada"]], + "packed_relations": ["defeated/didn't defeat", "was defeated by/wasn't defeated by"], + "packed_relation_substitutes": [["conquered/didn't conquer"], ["was conquered by/wasn't conquered by"]], + "packed_predicates": ["was more powerful/wasn't more powerful", "was less powerful/wasn't less powerful"], + }, + { + "index": 186, + "orig_sentence": "When the sponsors of the bill got to the town hall, they were surprised to find that the room was full of opponents. 
[They] were very much in the minority", + "entities": ["sponsors", "opponents"], + "entity_substitutes": [["workers", "customers"], ["teachers", "students"]], + "determiner": "the", + "packed_relations": ["were less in number than/were not less in number than", "were more in number than/were not more in number than"], + "packed_relation_substitutes": [["were outnumbered by/were not outnumbered by"], ["outnumbered/didn't outnumber"]], + "packed_predicates": ["were in the minority/were not in the minority", "were in the majority/were not in the majority"], + "reverse_causal": True + }, + { + "index": 188, + "orig_sentence": "Everyone really loved the oatmeal cookies; only a few people liked the chocolate chip cookies. Next time, we should make more of [them] .", + "entities": ["cookies", "chips"], + "entity_substitutes": [["apples", "bananas"], ["grapes", "sandwiches"]], + "determiner": "the", + "packed_relations": ["are more popular than/are less popular than", "lose to/don't lose to"], + "packed_relation_substitutes": [["are sold more than/are sold less than"], ["are not as popular as/are as popular as"]], + "packed_predicates": ["should be made more next time/shouldn't be made more next time", "should be made less next time/shouldn't be made less next time"], + "reverse_causal": True + }, + { + "index": 190, + "orig_sentence": "We had hoped to place copies of our newsletter on all the chairs in the auditorium, but there were simply not enough of [them] .", + "entities": ["newspapers", "chairs"], + "entity_substitutes": [["cups", "pictures"], ["tables", "benches"]], + "determiner": "the", + "packed_relations": ["could be placed on all/couldn't be placed on all", "could all be covered by/couldn't all be covered by"], + "packed_relation_substitutes": [["could be put on all/couldn't be put on all"], ["could carry all/couldn't carry all"]], + "packed_predicates": ["there were many of/there were not many of", "there were few of/there were not few of"], + 
"prepositive_pred": True, + }, + { + "index": 19600, + "orig_sentence": "Steve follows Fred's example in everything. [He] admires him hugely.", + "entities": ["Steve", "Lucy"], + "entity_substitutes": [["Fred", "George"], ["Lily", "Wendy"]], + "packed_relations": ["follows/doesn't follow", "is followed by/isn't followed by"], + "packed_relation_substitutes": [["imitates/doesn't imitate"], ["is imitated by/isn't imitated by"]], + "relation_suffix": "in everything", + "packed_predicates": ["is bad at making decisions/isn't bad at making decisions", "is good at making decisions/isn't good at making decisions"], + }, + { + "index": 198, + "orig_sentence": "The table won't fit through the doorway because [it] is too wide.", + "entities": ["table", "doorway"], + "entity_substitutes": [["desk", "sofa"], ["corridor", "hallway"]], + "determiner": "the", + "packed_relations": ["will fit through/won't fit through", "will be fitted through by/won't be fitted through by"], + "packed_relation_substitutes": [["will pass through/won't pass through"], ["will be passed through by/won't be passed through by"]], + "packed_predicates": ["is narrow/isn't narrow", "is wide/isn't wide"], + }, + { + "index": 2000000, + "orig_sentence": "Grace was happy to trade me her sweater for my jacket. 
She thinks [it] looks dowdy on her.", + "entities": ["sweater", "jacket"], + "entity_substitutes": [["skirt", "cap"], ["hat", "short"]], + "determiner": "the", + "packed_relations": ["is traded by Grace for/isn't traded by Grace for", "is substituted by Grace for/isn't substituted by Grace for"], + "packed_relation_substitutes": [["is replaced by Grace with/isn't replaced by Grace with"], ["is preferred by Grace to/isn't preferred by Grace to"]], + "packed_predicates": ["looks bad/looks not bad", "looks good/looks not good"], + "predicate_prefix": "she thinks", + }, + { + "index": 226, + "orig_sentence": "Bill passed the half-empty plate to John because [he] was full.", + "entities": ["Bill", "Amy"], + "entity_substitutes": [["Brian", "David"], ["Emma", "Helen"]], + "packed_relations": ["passed the half-empty plate to/didn't pass the half-empty plate to", "received the half-empty plate from/didn't received the half-empty plate from"], + "packed_relation_substitutes": [["gave the half-empty plate to/didn't give the half-empty plate to"], ["took the half-empty plate from/didn't take the half-empty plate from"]], + "packed_predicates": ["was full/wasn't full", "was hungry/wasn't hungry"], + }, + { + "index": 252, + "orig_sentence": "George got free tickets to the play, but he gave them to Eric, even though [he] was particularly eager to see it.", + "entities": ["George", "Linda"], + "entity_substitutes": [["Eric", "Ted"], ["Cindy", "Lucy"]], + "packed_relations": ["gave the tickets of the play to/didn't give the tickets of the play to", "received the tickets of the play from/didn't receive the tickets of the play from"], + "packed_relation_substitutes": [["sent the tickets of the play to/didn't send the tickets of the play to"], ["took the tickets of the play from/didn't take the tickets of the play from"]], + "packed_predicates": ["wasn't interested in it/was interested in it", "was eager to see it/wasn't eager to see it"], + }, + { + "index": 261, + "orig_sentence": 
"Kirilov ceded the presidency to Shatov because [he] was less popular.", + "entities": ["James", "Amy"], + "entity_substitutes": [["Robert", "Jack"], ["Donna", "Emily"]], + "packed_relations": ["ceded the presidency to/didn't cede the presidency to", "took over the presidency from/didn't take over the presidency from"], + "packed_relation_substitutes": [["gave the presidency to/didn't give the presidency to"], ["got the presidency from/didn't get the presidency from"]], + "packed_predicates": ["was notorious/was not notorious", "was popular/wasn't popular"], + }, +] diff --git a/examples/child_generator.py b/examples/child_generator.py new file mode 100644 index 00000000000000..5d71d8be387042 --- /dev/null +++ b/examples/child_generator.py @@ -0,0 +1,128 @@ +import os +import json +import itertools +from itertools import product, chain + +from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig +from child_frames import frames + + +CONFIG_NAME = 'bert_config.json' +BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/' +tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt')) + + +A_template = "{rel_prefix} {dt} {ent0} {rel} {dt} {ent1} {rel_suffix}" +B_templates = ["{pred_prefix} {dt} {ent} {pred}", "{pred_prefix} {pred} {dt} {ent}"] + +# causal_templates = [["{A} because {B}."],# "{B} so {A}."], +# ["{A} so {B}."],# "{B} because {A}."] +# ] +# turning_templates = [["{A} although {B}."],# "{B} but {A}."], +# ["{A} but {B}."],# "{B} although {A}."] +# ] + +causal_templates = [["{A} ||| {conj} {B}."],# "{B} so {A}."], + ["{A} ||| {conj} {B}."],# "{B} because {A}."] + ] +turning_templates = [["{A} ||| {conj} {B}."],# "{B} but {A}."], + ["{A} ||| {conj} {B}."],# "{B} although {A}."] + ] + + +def reverse(l): + return list(reversed(l)) + + +def mask(ent_str): + tokens = ent_str.strip().split() + if len(tokens) == 1: + return '[%s]' % tokens[0] + elif 
len(tokens) == 2: + assert tokens[0] == 'the', ent_str + return '%s [%s]' % (tokens[0], tokens[1]) + else: + assert False, ent_str + + +def make_sentences(index=-1, orig_sentence='', entities=["John", "Mary"], entity_substitutes=None, determiner="", + packed_relations=["rel/~rel", "rev_rel/~rev_rel"], packed_relation_substitutes=None, + relation_prefix="", relation_suffix="", + packed_predicates=["pred0/~pred0", "pred1/~pred1"], predicate_prefix="", prepositive_pred=False, + predicate_dichotomy=True, reverse_causal=False, conjunctions=[["because", "so"], ["although", "but"]]): + assert entities[0].lower() in tokenizer.vocab , entities[0] + assert entities[1].lower() in tokenizer.vocab , entities[1] + + def form_As(packed_rels): + relations, neg_relations = zip(*[rel.split("/") for rel in packed_rels]) + relations, neg_relations = list(relations), list(neg_relations) + + As = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) + for ent0, ent1, rel in [entities + relations[:1], reverse(entities) + reverse(relations)[:1]]] + negAs = [A_template.format(dt=determiner, ent0=ent0, ent1=ent1, rel=rel, rel_prefix=relation_prefix, rel_suffix=relation_suffix) + for ent0, ent1, rel in [entities + neg_relations[:1], reverse(entities) + reverse(neg_relations)[:1]]] + return As, negAs + + As, negAs = form_As(packed_relations) + + substituted_As, substituted_negAs = [], [] + for packed_rel_subs in zip(*packed_relation_substitutes): + subs_As, subs_negAs = form_As(packed_rel_subs) + substituted_As += subs_As + substituted_negAs += subs_negAs + + if "/" in packed_predicates[0]: + predicates, neg_predicates = zip(*[pred.split("/") for pred in packed_predicates]) + predicates, neg_predicates = list(predicates), list(neg_predicates) + else: + predicates, neg_predicates = packed_predicates, [] + + B_template = B_templates[int(prepositive_pred)] + Bs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred, 
pred_prefix=predicate_prefix) + for ent, pred in zip(entities, predicates)] + negBs = [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) + for ent, pred in zip(entities, neg_predicates)] + if predicate_dichotomy: + Bs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) + for ent, pred in zip(entities, reversed(neg_predicates))] + negBs += [B_template.format(dt=determiner, ent=mask(ent), pred=pred, pred_prefix=predicate_prefix) + for ent, pred in zip(entities, reversed(predicates))] + + def form_sentences(sentence_template, As, Bs, conj): + return [" ".join(sentence_template.format(A=A, B=B, conj=conj).split()) for A, B in product(As, Bs)] + + def form_all_sentences(As, negAs, Bs, negBs): + causal_sentences = [] + causal_conj = conjunctions[0][int(reverse_causal)] + for causal_template in causal_templates[int(reverse_causal)]: + for A, B in [(As, Bs), (negAs, negBs)]: + causal_sentences += form_sentences(causal_template, A, B, causal_conj) + + turning_sentences = [] + turning_conj = conjunctions[1][int(reverse_causal)] + for turning_template in turning_templates[int(reverse_causal)]: + for A, B in [(As, negBs), (negAs, Bs)]: + turning_sentences += form_sentences(turning_template, A, B, turning_conj) + + sentences = causal_sentences + turning_sentences + return sentences, causal_sentences, turning_sentences + + sentences, causal_sentences, turning_sentences = form_all_sentences(As, negAs, Bs, negBs) + # substituted_sentences = sentences + + if packed_relation_substitutes is not None: + substituted_sentences = form_all_sentences(substituted_As, substituted_negAs, Bs, negBs)[0] + + substituted_sent_groups = list(zip(sentences, substituted_sentences)) + + if entity_substitutes is not None: + for sub in entity_substitutes: + for ent in sub: + assert ent.lower() in tokenizer.vocab , ent + " not in BERT vocab" + assert len(set(chain.from_iterable(entity_substitutes))) == 4, entity_substitutes + 
assert len(set(chain.from_iterable(entity_substitutes)).union(set(entities))) == 6 + + entity_substitutes = list(itertools.product(entities[:1] + entity_substitutes[0], entities[1:] + entity_substitutes[1])) + substituted_sent_groups = [[sent.replace(entities[0], sub[0]).replace(entities[1], sub[1]) + for sent in sent_group for sub in entity_substitutes] for sent_group in substituted_sent_groups] + return causal_sentences, turning_sentences, substituted_sent_groups \ No newline at end of file diff --git a/examples/extract_features.py b/examples/extract_features.py index abe7fdffe7dbec..9d05d7905d0fa4 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -168,7 +168,7 @@ def read_examples(input_file): """Read a list of `InputExample`s from an input file.""" examples = [] unique_id = 0 - with open(input_file, "r") as reader: + with open(input_file, "r", encoding='utf-8') as reader: while True: line = reader.readline() if not line: @@ -199,6 +199,7 @@ def main(): "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") ## Other parameters + parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
Sequences longer " @@ -209,7 +210,6 @@ def main(): default=-1, help = "local_rank for distributed training on gpus") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") @@ -227,7 +227,7 @@ def main(): layer_indexes = [int(x) for x in args.layers.split(",")] - tokenizer = BertTokenizer.from_pretrained(args.bert_model) + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) examples = read_examples(args.input_file) diff --git a/examples/run_child_finetuning.py b/examples/run_child_finetuning.py new file mode 100644 index 00000000000000..e960126b85d415 --- /dev/null +++ b/examples/run_child_finetuning.py @@ -0,0 +1,531 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import logging +import argparse +from tqdm import tqdm, trange +import itertools + +import numpy as np +import torch +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForPreTraining, BertForMaskedLM +from pytorch_pretrained_bert.optimization import BertAdam + +from child_generator import make_sentences +from child_frames import frames + +from torch.utils.data import Dataset +import random + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + + +def rejoin_tokens(tokens): + new_tokens = [] + while len(tokens) > 0: + t = tokens.pop(0) + if t == "[": + t1 = tokens.pop(0) + t2 = tokens.pop(0) + assert t2 == "]", t2 + new_tokens.append(t + t1 + t2) + else: + new_tokens.append(t) + return 
new_tokens + + +class CHILDDataset(Dataset): + def __init__(self, tokenizer, one_sent=False, max_seq_len=None, dev_percent=-1): + self.tokenizer = tokenizer + self.one_sent = one_sent + self.max_seq_len = max_seq_len + + if dev_percent == -1: + causal_lines, turning_lines, subs_lines = [], [], [] + for frame in frames: + causal_sent, turning_sent, subs_sent = make_sentences(**frame) + causal_lines += causal_sent + turning_lines += turning_sent + subs_lines += subs_sent + train_lines = causal_lines + turning_lines + dev_lines = list(set(subs_lines) - set(train_lines)) + self.all_lines = train_lines + dev_lines + self.n_dev = len(dev_lines) + else: + self.all_lines = list(itertools.chain.from_iterable( + [make_sentences(**frame)[-1] for frame in frames])) + random.shuffle(self.all_lines) + self.n_dev = int(round(len(self.all_lines) * dev_percent)) + + n_all = len(self.all_lines) + self.n_train = n_all - self.n_dev + + if type(self.all_lines[0]) == list: + n_substitutes = len(self.all_lines[0]) + assert all(len(substitutes) == n_substitutes for substitutes in self.all_lines) + print('flattening all_lines: %d * %d = %d' % + (n_all, n_substitutes, n_all * n_substitutes)) + self.all_lines = list(itertools.chain.from_iterable(self.all_lines)) + self.n_dev *= n_substitutes + self.n_train *= n_substitutes + + self.examples = [] + cur_id = 0 + for line in self.all_lines: + t1, t2, is_next_label = self.split_sent(line) + + tokens_a = rejoin_tokens(self.tokenizer.tokenize(t1)) + tokens_b = rejoin_tokens(self.tokenizer.tokenize(t2)) if t2 is not None else None + + example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label) + self.examples.append(example) + cur_id += 1 + + if self.max_seq_len is None: + self.max_seq_len = max([len(example.tokens_a) + len(example.tokens_b) + 3 + if example.tokens_b is not None else len(example.tokens_a) + 2 + for example in self.examples]) + print('max_seq_len =', self.max_seq_len) + + self.features = 
def mask_word(tokens, tokenizer):
    """Replace bracketed tokens with ``"[MASK]"`` and build the LM label list.

    A token counts as bracketed when it starts with ``"["`` and ends with
    ``"]"``. Such a token is overwritten in ``tokens`` (in place) by
    ``"[MASK]"`` and its label is the vocabulary id of the text between the
    brackets. Every other token gets the label ``-1``.

    Args:
        tokens: Mutable list of token strings; modified in place.
        tokenizer: Object exposing a ``vocab`` mapping from token to id.

    Returns:
        A ``(tokens, labels)`` tuple: the same list object that was passed in
        and a list of labels of equal length.

    Raises:
        KeyError: If the text inside the brackets is not in ``tokenizer.vocab``.
    """
    lm_targets = []

    for position, piece in enumerate(tokens):
        bracketed = piece.startswith("[") and piece.endswith("]")
        if not bracketed:
            # Ordinary token: nothing to predict at this position.
            lm_targets.append(-1)
            continue
        # Overwrite first, then look up the id of the word inside the brackets.
        tokens[position] = "[MASK]"
        lm_targets.append(tokenizer.vocab[piece[1:-1]])

    return tokens, lm_targets
+ segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b is not None and len(tokens_b) > 0: + t2_masked, t2_label = mask_word(tokens_b, tokenizer) + lm_label_ids += (t2_label + [-1]) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(lm_label_ids) == max_seq_length + + if example.guid < -5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("LM label: %s " % (lm_label_ids)) + logger.info("Is next sentence label: %s " % (example.is_next)) + + features = InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + lm_label_ids=lm_label_ids, + is_next=example.is_next) + return features + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--dev_percent", + default=-1, + type=float, + help="") + parser.add_argument("--one_sent", + action='store_true', + help="") + + ## Required parameters + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + + ## Other parameters + 
parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=3e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. 
True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumualte before performing a backward/update pass.") + parser.add_argument('--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type = float, default = 0, + help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + if 
os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + os.makedirs(args.output_dir, exist_ok=True) + + BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/' + tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'), do_lower_case=args.do_lower_case) + + #train_examples = None + num_train_steps = None + if args.do_train: + print("Generating CHILD Dataset") + child_dataset = CHILDDataset(tokenizer, one_sent=args.one_sent, dev_percent=args.dev_percent) + train_features = child_dataset.get_train_features() + num_train_steps = int( + len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) + + # Prepare model + model = BertForMaskedLM.from_pretrained(BERT_DIR) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + 
if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_steps) + + def validate(model, eval_dataloader): + model.eval() + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + + # for batch in tqdm(eval_dataloader, desc="Evaluating"): + for batch in eval_dataloader: + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch + with torch.no_grad(): + tmp_eval_loss = model(input_ids, segment_ids, input_mask, lm_label_ids) + logits = model(input_ids, segment_ids, input_mask) + + logits = logits.detach().cpu().numpy() + lm_label_ids = lm_label_ids.to('cpu').numpy() + tmp_eval_accuracy = accuracy(logits, lm_label_ids) + + eval_loss += tmp_eval_loss.mean().item() + eval_accuracy += tmp_eval_accuracy + + nb_eval_examples += input_ids.size(0) + nb_eval_steps += 1 + + eval_loss = eval_loss / nb_eval_steps + eval_accuracy = eval_accuracy / nb_eval_examples + result = {'loss': eval_loss, + 'acc': eval_accuracy} + + # logger.info("***** Eval results *****") + for key in sorted(result.keys()): + # logger.info(" %s = %s", key, str(result[key])) + print(" %s = %.3f" % (key, result[key]), end='') + + + global_step = 0 + if args.do_train: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + all_lm_label_ids = 
torch.tensor([f.lm_label_ids for f in train_features], dtype=torch.long) + all_is_next = torch.tensor([f.is_next for f in train_features], dtype=torch.long) + train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lm_label_ids, all_is_next) + + if args.local_rank == -1: + train_sampler = RandomSampler(train_dataset) + else: + #TODO: check if this works with current data generator from disk that relies on file.__next__ + # (it doesn't return item back by index) + train_sampler = DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + if args.do_eval: + eval_features = child_dataset.get_dev_features() + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_lm_label_ids = torch.tensor([f.lm_label_ids for f in eval_features], dtype=torch.long) + all_is_next = torch.tensor([f.is_next for f in eval_features], dtype=torch.long) + eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lm_label_ids, all_is_next) + + eval_sampler = SequentialSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + + # logger.info("Epoch 0. Evaluating on train set...") + print("Epoch 0. 
Train:", end='') + validate(model, train_dataloader) + # logger.info("Evaluating on valid set...") + print(" Valid:", end='') + validate(model, eval_dataloader) + print() + + # for epoch in trange(int(args.num_train_epochs), desc="Epoch"): + for epoch in range(int(args.num_train_epochs)): + model.train() + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + # for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + for step, batch in enumerate(train_dataloader): + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch + loss = model(input_ids, segment_ids, input_mask, lm_label_ids) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16: + # modify learning rate with special warm up BERT uses + # if args.fp16 is False, BertAdam is used that handles this automatically + lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + if args.do_eval: + # logger.info("Epoch %d. Evaluating on train set..." % (epoch + 1)) + print("Epoch %d. 
def accuracy(out, labels):
    """Count the sequences whose labelled positions are all predicted correctly.

    The prediction at each position is the argmax of ``out`` over its last
    axis. Positions whose label is ``-1`` are ignored. A sequence counts as
    correct only if every remaining position matches its label.

    Args:
        out: Array of scores; the last axis is reduced with argmax.
        labels: Array of target ids compared against the argmax, with ``-1``
            marking positions to ignore.

    Returns:
        The number of fully correct sequences (a count, not a fraction).
    """
    predictions = np.argmax(out, axis=-1)
    ignored = labels == -1
    # A position is acceptable when it is ignored or predicted correctly.
    position_ok = ignored | (predictions == labels)
    sequence_ok = np.all(position_ok, axis=-1)
    return sequence_ok.sum()
@@ -35,7 +36,7 @@ from pytorch_pretrained_bert.optimization import BertAdam from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO) logger = logging.getLogger(__name__) @@ -90,7 +91,7 @@ def get_labels(self): @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" - with open(input_file, "r") as f: + with open(input_file, "r", encoding='utf-8') as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: @@ -196,9 +197,7 @@ def _create_examples(self, lines, set_type): def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): """Loads a data file into a list of `InputBatch`s.""" - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i + label_map = {label : i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): @@ -207,8 +206,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer tokens_b = None if example.text_b: tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: # Modifies `tokens_a` and `tokens_b` in place so that the total # length is less than the specified length. 
# Account for [CLS], [SEP], [SEP] with "- 3" @@ -216,7 +213,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer else: # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] + tokens_a = tokens_a[:(max_seq_length - 2)] # The convention in BERT is: # (a) For sequence pairs: @@ -236,22 +233,12 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer # For classification tasks, the first vector (corresponding to [CLS]) is # used as as the "sentence vector". Note that this only makes sense because # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) + tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + segment_ids = [0] * len(tokens) if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) + tokens += tokens_b + ["[SEP]"] + segment_ids += [1] * (len(tokens_b) + 1) input_ids = tokenizer.convert_tokens_to_ids(tokens) @@ -260,10 +247,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) + padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length @@ -309,34 +296,10 @@ def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) -def copy_optimizer_params_to_model(named_params_model, named_params_optimizer): - """ Utility function for optimize_on_cpu and 16-bits training. 
def warmup_linear(x, warmup=0.002):
    """Return the learning-rate multiplier for training progress ``x``.

    Args:
        x: Fraction of training completed, e.g. ``global_step / total_steps``.
        warmup: Fraction of training over which the multiplier ramps up.

    Returns:
        ``x / warmup`` while ``x < warmup``; afterwards ``1.0 - x``, never
        lower than ``0.0``.
    """
    if x < warmup:
        return x / warmup
    # The total step count is truncated to an int, so the real number of
    # optimizer steps can exceed it slightly; clamp so the learning rate
    # never turns negative when x goes past 1.
    return max(0.0, 1.0 - x)
Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, @@ -359,7 +323,7 @@ def main(): default=None, type=str, required=True, - help="The output directory where the model checkpoints will be written.") + help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", @@ -369,13 +333,14 @@ def main(): "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", - default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", - default=False, action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, @@ -398,32 +363,28 @@ def main(): help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument('--seed', - type=int, + parser.add_argument('--seed', + type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, - help="Number of updates steps to accumualte before performing a backward/update pass.") - parser.add_argument('--optimize_on_cpu', - default=False, - action='store_true', - help="Whether to perform optimization and keep the optimizer averages on CPU") + help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', - type=float, default=128, - help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + type=float, default=0, + help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() @@ -433,18 +394,23 @@ def main(): "mrpc": MrpcProcessor, } + num_labels_task = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + } + if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: + torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - if args.fp16: - logger.info("16-bits training currently not supported in distributed training") - args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) - logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( @@ -461,7 +427,7 @@ def main(): if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) @@ -471,9 +437,10 @@ def main(): raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() + num_labels = num_labels_task[task_name] label_list = processor.get_labels() - tokenizer = BertTokenizer.from_pretrained(args.bert_model) + tokenizer = 
BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None @@ -483,40 +450,57 @@ def main(): len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model - model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) + model = BertForSequenceClassification.from_pretrained(args.bert_model, + cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), + num_labels = num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - output_device=args.local_rank) + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer - if args.fp16: - param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ - for n, param in model.named_parameters()] - elif args.optimize_on_cpu: - param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ - for n, param in model.named_parameters()] - else: - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'gamma', 'beta'] + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + 
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=t_total) + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=t_total) global_step = 0 + nb_tr_steps = 0 + tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) @@ -542,39 +526,40 @@ def main(): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - loss, _ = model(input_ids, segment_ids, input_mask, label_ids) + loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
- if args.fp16 and args.loss_scale != 1.0: - # rescale loss for fp16 training - # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html - loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps - loss.backward() + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16 or args.optimize_on_cpu: - if args.fp16 and args.loss_scale != 1.0: - # scale down gradients for fp16 training - for param in model.parameters(): - if param.grad is not None: - param.grad.data = param.grad.data / args.loss_scale - is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) - if is_nan: - logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") - args.loss_scale = args.loss_scale / 2 - model.zero_grad() - continue - optimizer.step() - copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) - else: - optimizer.step() - model.zero_grad() + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() global_step += 1 + # Save a trained model + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) + + # Load a trained model that you have fine-tuned + model_state_dict = torch.load(output_model_file) + model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels) + model.to(device) + if args.do_eval and (args.local_rank 
== -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( @@ -594,14 +579,16 @@ def main(): model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 - for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: + + for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): - tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) + tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) + logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() @@ -615,11 +602,11 @@ def main(): eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples - + loss = tr_loss/nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, - 'loss': tr_loss/nb_tr_steps} + 'loss': loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py new file mode 100644 index 00000000000000..35a2f797c7e3de --- /dev/null +++ b/examples/run_lm_finetuning.py @@ -0,0 +1,649 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging +import argparse +from tqdm import tqdm, trange + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler +from torch.utils.data.distributed import DistributedSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForPreTraining +from pytorch_pretrained_bert.optimization import BertAdam + +from torch.utils.data import Dataset +import random + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + + +class BERTDataset(Dataset): + def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True): + self.vocab = tokenizer.vocab + self.tokenizer = tokenizer + self.seq_len = seq_len + self.on_memory = on_memory + self.corpus_lines = corpus_lines # number of non-empty lines in input corpus + self.corpus_path = corpus_path + self.encoding = encoding + self.current_doc = 0 # to avoid random sentence from same doc + + # for loading samples directly from file + self.sample_counter = 0 # used to keep track of full epochs on file + self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair + + # for 
loading samples in memory + self.current_random_doc = 0 + self.num_docs = 0 + self.sample_to_doc = [] # map sample index to doc and line + + # load samples into memory + if on_memory: + self.all_docs = [] + doc = [] + self.corpus_lines = 0 + with open(corpus_path, "r", encoding=encoding) as f: + for line in tqdm(f, desc="Loading Dataset", total=corpus_lines): + line = line.strip() + if line == "": + self.all_docs.append(doc) + doc = [] + #remove last added sample because there won't be a subsequent line anymore in the doc + self.sample_to_doc.pop() + else: + #store as one sample + sample = {"doc_id": len(self.all_docs), + "line": len(doc)} + self.sample_to_doc.append(sample) + doc.append(line) + self.corpus_lines = self.corpus_lines + 1 + + # if last row in file is not empty + if self.all_docs[-1] != doc: + self.all_docs.append(doc) + self.sample_to_doc.pop() + + self.num_docs = len(self.all_docs) + + # load samples later lazily from disk + else: + if self.corpus_lines is None: + with open(corpus_path, "r", encoding=encoding) as f: + self.corpus_lines = 0 + for line in tqdm(f, desc="Loading Dataset", total=corpus_lines): + if line.strip() == "": + self.num_docs += 1 + else: + self.corpus_lines += 1 + + # if doc does not end with empty line + if line.strip() != "": + self.num_docs += 1 + + self.file = open(corpus_path, "r", encoding=encoding) + self.random_file = open(corpus_path, "r", encoding=encoding) + + def __len__(self): + # last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0. 
+ return self.corpus_lines - self.num_docs - 1 + + def __getitem__(self, item): + cur_id = self.sample_counter + self.sample_counter += 1 + if not self.on_memory: + # after one epoch we start again from beginning of file + if cur_id != 0 and (cur_id % len(self) == 0): + self.file.close() + self.file = open(self.corpus_path, "r", encoding=self.encoding) + + t1, t2, is_next_label = self.random_sent(item) + + # tokenize + tokens_a = self.tokenizer.tokenize(t1) + tokens_b = self.tokenizer.tokenize(t2) + + # combine to one sample + cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label) + + # transform sample to features + cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer) + + cur_tensors = (torch.tensor(cur_features.input_ids), + torch.tensor(cur_features.input_mask), + torch.tensor(cur_features.segment_ids), + torch.tensor(cur_features.lm_label_ids), + torch.tensor(cur_features.is_next)) + + return cur_tensors + + def random_sent(self, index): + """ + Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences + from one doc. With 50% the second sentence will be a random one from another doc. + :param index: int, index of sample. + :return: (str, str, int), sentence 1, sentence 2, isNextSentence Label + """ + t1, t2 = self.get_corpus_line(index) + if random.random() > 0.5: + label = 0 + else: + t2 = self.get_random_line() + label = 1 + + assert len(t1) > 0 + assert len(t2) > 0 + return t1, t2, label + + def get_corpus_line(self, item): + """ + Get one sample from corpus consisting of a pair of two subsequent lines from the same doc. + :param item: int, index of sample. 
+ :return: (str, str), two subsequent sentences from corpus + """ + t1 = "" + t2 = "" + assert item < self.corpus_lines + if self.on_memory: + sample = self.sample_to_doc[item] + t1 = self.all_docs[sample["doc_id"]][sample["line"]] + t2 = self.all_docs[sample["doc_id"]][sample["line"]+1] + # used later to avoid random nextSentence from same doc + self.current_doc = sample["doc_id"] + return t1, t2 + else: + if self.line_buffer is None: + # read first non-empty line of file + while t1 == "" : + t1 = self.file.__next__().strip() + t2 = self.file.__next__().strip() + else: + # use t2 from previous iteration as new t1 + t1 = self.line_buffer + t2 = self.file.__next__().strip() + # skip empty rows that are used for separating documents and keep track of current doc id + while t2 == "" or t1 == "": + t1 = self.file.__next__().strip() + t2 = self.file.__next__().strip() + self.current_doc = self.current_doc+1 + self.line_buffer = t2 + + assert t1 != "" + assert t2 != "" + return t1, t2 + + def get_random_line(self): + """ + Get random line from another document for nextSentence task. + :return: str, content of one line + """ + # Similar to original tf repo: This outer loop should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document we're processing. 
+ for _ in range(10): + if self.on_memory: + rand_doc_idx = random.randint(0, len(self.all_docs)-1) + rand_doc = self.all_docs[rand_doc_idx] + line = rand_doc[random.randrange(len(rand_doc))] + else: + rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000) + #pick random line + for _ in range(rand_index): + line = self.get_next_line() + #check if our picked random line is really from another doc like we want it to be + if self.current_random_doc != self.current_doc: + break + return line + + def get_next_line(self): + """ Gets next line of random_file and starts over when reaching end of file""" + try: + line = self.random_file.__next__().strip() + #keep track of which document we are currently looking at to later avoid having the same doc as t1 + if line == "": + self.current_random_doc = self.current_random_doc + 1 + line = self.random_file.__next__().strip() + except StopIteration: + self.random_file.close() + self.random_file = open(self.corpus_path, "r", encoding=self.encoding) + line = self.random_file.__next__().strip() + return line + + +class InputExample(object): + """A single training/test example for the language model.""" + + def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + tokens_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + tokens_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.tokens_a = tokens_a + self.tokens_b = tokens_b + self.is_next = is_next # nextSentence + self.lm_labels = lm_labels # masked words for language model + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.is_next = is_next + self.lm_label_ids = lm_label_ids + + +def random_word(tokens, tokenizer): + """ + Masking some random tokens for Language Model task with probabilities as in the original BERT paper. + :param tokens: list of str, tokenized sentence. + :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) + :return: (list of str, list of int), masked tokens and related labels for LM prediction + """ + output_label = [] + + for i, token in enumerate(tokens): + prob = random.random() + # mask token with 15% probability + if prob < 0.15: + prob /= 0.15 + + # 80% randomly change token to mask token + if prob < 0.8: + tokens[i] = "[MASK]" + + # 10% randomly change token to random token + elif prob < 0.9: + tokens[i] = random.choice(list(tokenizer.vocab.items()))[0] + + # -> rest 10% randomly keep current token + + # append current token to output (we will predict these later) + try: + output_label.append(tokenizer.vocab[token]) + except KeyError: + # For unknown words (should not occur with BPE vocab) + output_label.append(tokenizer.vocab["[UNK]"]) + logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token)) + else: + # no masking token (will be ignored by loss function later) + output_label.append(-1) + + return tokens, output_label + + +def convert_example_to_features(example, max_seq_length, tokenizer): + """ + Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with + IDs, LM labels, input_mask, CLS and SEP tokens etc. 
+ :param example: InputExample, containing sentence input as strings and is_next label + :param max_seq_length: int, maximum length of sequence. + :param tokenizer: Tokenizer + :return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training) + """ + tokens_a = example.tokens_a + tokens_b = example.tokens_b + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + + t1_random, t1_label = random_word(tokens_a, tokenizer) + t2_random, t2_label = random_word(tokens_b, tokenizer) + # concatenate lm labels and account for CLS, SEP, SEP + lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1]) + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambigiously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
+ tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + assert len(tokens_b) > 0 + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(lm_label_ids) == max_seq_length + + if example.guid < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("LM label: %s " % (lm_label_ids)) + logger.info("Is next sentence label: %s " % (example.is_next)) + + features = InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + lm_label_ids=lm_label_ids, + is_next=example.is_next) + return features + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--train_file", + default=None, + type=str, + required=True, + help="The input train corpus.") + parser.add_argument("--bert_model", default=None, type=str, required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-base-multilingual, 
bert-base-chinese.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + + ## Other parameters + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + action='store_true', + help="Whether to run training.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=8, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=3e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--on_memory", + action='store_true', + help="Whether to load train samples into memory or use disk") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. 
True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumualte before performing a backward/update pass.") + parser.add_argument('--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type = float, default = 0, + help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + if 
os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + os.makedirs(args.output_dir, exist_ok=True) + + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + + #train_examples = None + num_train_steps = None + if args.do_train: + print("Loading Train Dataset", args.train_file) + train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, + corpus_lines=None, on_memory=args.on_memory) + num_train_steps = int( + len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) + + # Prepare model + model = BertForPreTraining.from_pretrained(args.bert_model) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + 
else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_steps) + + global_step = 0 + if args.do_train: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + + if args.local_rank == -1: + train_sampler = RandomSampler(train_dataset) + else: + #TODO: check if this works with current data generator from disk that relies on file.__next__ + # (it doesn't return item back by index) + train_sampler = DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + model.train() + for _ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch + loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + # Save a trained model + logger.info("** ** * Saving fine - tuned model ** ** * ") + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def accuracy(out, labels): + outputs = np.argmax(out, axis=1) + return np.sum(outputs == labels) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/run_squad.py b/examples/run_squad.py index e3213189bfba92..39e9c501996d5c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -1,5 +1,6 @@ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,14 +39,14 @@ from pytorch_pretrained_bert.optimization import BertAdam from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO) logger = logging.getLogger(__name__) class SquadExample(object): - """A single training/test example for simple sequence classification.""" + """A single training/test example for the Squad dataset.""" def __init__(self, qas_id, @@ -107,7 +108,7 @@ def __init__(self, def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r") as reader: + with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] def is_whitespace(c): @@ -669,34 +670,10 @@ def _compute_softmax(scores): probs.append(score / total_sum) return probs -def copy_optimizer_params_to_model(named_params_model, named_params_optimizer): - """ Utility function for optimize_on_cpu and 16-bits training. 
- Copy the parameters optimized on CPU/RAM back to the model on GPU - """ - for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): - if name_opti != name_model: - logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) - raise ValueError - param_model.data.copy_(param_opti.data) - -def set_optimizer_params_grad(named_params_optimizer, named_params_model, test_nan=False): - """ Utility function for optimize_on_cpu and 16-bits training. - Copy the gradient of the GPU parameters to the CPU/RAMM copy of the model - """ - is_nan = False - for (name_opti, param_opti), (name_model, param_model) in zip(named_params_optimizer, named_params_model): - if name_opti != name_model: - logger.error("name_opti != name_model: {} {}".format(name_opti, name_model)) - raise ValueError - if param_model.grad is not None: - if test_nan and torch.isnan(param_model.grad).sum() > 0: - is_nan = True - if param_opti.grad is None: - param_opti.grad = torch.nn.Parameter(param_opti.data.new().resize_(*param_opti.data.size())) - param_opti.grad.data.copy_(param_model.grad.data) - else: - param_opti.grad = None - return is_nan +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x def main(): parser = argparse.ArgumentParser() @@ -704,9 +681,10 @@ def main(): ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " - "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, - help="The output directory where the model checkpoints will be written.") + help="The output directory where the model checkpoints and predictions will be written.") ## 
Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") @@ -720,8 +698,8 @@ def main(): parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") - parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") - parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") @@ -736,15 +714,14 @@ def main(): parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") - parser.add_argument("--verbose_logging", default=False, action='store_true', + parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", - default=False, action='store_true', help="Whether not to use CUDA when available") - parser.add_argument('--seed', - type=int, + parser.add_argument('--seed', + type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', @@ -752,24 +729,20 @@ def main(): default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", - default=True, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument('--optimize_on_cpu', - default=False, - action='store_true', - help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', - default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', - type=float, default=128, - help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + type=float, default=0, + help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() @@ -777,14 +750,12 @@ def main(): device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: + torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') - if args.fp16: - logger.info("16-bits training currently not supported in distributed training") - args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) - logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}".format( + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: @@ -811,11 +782,11 @@ def main(): raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) - tokenizer = BertTokenizer.from_pretrained(args.bert_model) + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None @@ -828,41 +799,61 @@ def main(): # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) + if args.fp16: model.half() model.to(device) if args.local_rank != -1: - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], - 
output_device=args.local_rank) + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer - if args.fp16: - param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ - for n, param in model.named_parameters()] - elif args.optimize_on_cpu: - param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ - for n, param in model.named_parameters()] - else: - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'gamma', 'beta'] + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] + t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=t_total) + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = 
FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( - args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) + list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: @@ -878,7 +869,7 @@ def main(): if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: - train_features = pickle.dump(train_features, writer) + pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) @@ -906,33 +897,33 @@ def main(): loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
- if args.fp16 and args.loss_scale != 1.0: - # rescale loss for fp16 training - # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html - loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps - loss.backward() + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16 or args.optimize_on_cpu: - if args.fp16 and args.loss_scale != 1.0: - # scale down gradients for fp16 training - for param in model.parameters(): - if param.grad is not None: - param.grad.data = param.grad.data / args.loss_scale - is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True) - if is_nan: - logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling") - args.loss_scale = args.loss_scale / 2 - model.zero_grad() - continue - optimizer.step() - copy_optimizer_params_to_model(model.named_parameters(), param_optimizer) - else: - optimizer.step() - model.zero_grad() + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() global_step += 1 + # Save a trained model + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) + + # Load a trained model that you have fine-tuned + model_state_dict = torch.load(output_model_file) + model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) + model.to(device) + if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( 
class SquadExample(object):
    """
    A single training/test example for the Squad dataset.
    For examples without an answer, the start and end position are -1.

    Attributes mirror the constructor arguments one-to-one; nothing is
    validated or copied here.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        """Return a one-line, human-readable summary of the example.

        Optional fields are included whenever they are set (not None). Each
        field is tested on its own value, so a legitimate position of 0 is
        still printed.
        """
        parts = [
            "qas_id: %s" % (self.qas_id),
            "question_text: %s" % (self.question_text),
            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
        ]
        if self.start_position is not None:
            parts.append("start_position: %d" % (self.start_position))
        if self.end_position is not None:
            parts.append("end_position: %d" % (self.end_position))
        if self.is_impossible is not None:
            parts.append("is_impossible: %r" % (self.is_impossible))
        return ", ".join(parts)


class InputFeatures(object):
    """A single set of features of data.

    Plain attribute container: every constructor argument is stored
    unchanged under the attribute of the same name.
    """

    def __init__(self,
                 unique_id,
                 example_index,
                 doc_span_index,
                 tokens,
                 token_to_orig_map,
                 token_is_max_context,
                 input_ids,
                 input_mask,
                 segment_ids,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
        self.unique_id = unique_id
        self.example_index = example_index
        self.doc_span_index = doc_span_index
        self.tokens = tokens
        self.token_to_orig_map = token_to_orig_map
        self.token_is_max_context = token_is_max_context
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadExample.

    Args:
        input_file: path to a SQuAD-format JSON file with a top-level
            "data" list and, optionally, a "version" string.
        is_training: when True, answer annotations are read and converted
            to word-level start/end positions; when False, the positions
            stay None.

    Returns:
        A list of SquadExample in file order. In training mode, examples
        whose answer text cannot be found in the whitespace-tokenized
        paragraph are skipped (a warning is logged).

    Raises:
        ValueError: in training mode, when an answerable question does not
            carry exactly one answer.
    """
    with open(input_file, "r", encoding='utf-8') as reader:
        source = json.load(reader)
    input_data = source["data"]
    # A file without a "version" field is handled like a non-v2.0 file
    # instead of failing with KeyError before any example is read.
    version = source.get("version")

    def is_whitespace(c):
        # 0x202F is NARROW NO-BREAK SPACE.
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            # char_to_word_offset[i] is the index of the word that
            # character i belongs to (whitespace maps to the preceding word).
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if version == "v2.0":
                        is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1) and (not is_impossible):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
                        # Only add answers where the text can be exactly recovered from the
                        # document. If this CAN'T happen it's likely due to weird Unicode
                        # stuff so we will just skip the example.
                        #
                        # Note that this means for training mode, every example is NOT
                        # guaranteed to be preserved.
                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            logger.warning("Could not find answer: '%s' vs. '%s'",
                                           actual_text, cleaned_answer_text)
                            continue
                    else:
                        # Unanswerable question: sentinel positions, empty text.
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length, is_training):
    """Convert examples into a list of `InputFeatures`, one per document span.

    Each example's document is WordPiece-tokenized and, when it does not fit
    next to the question, cut into overlapping spans (sliding window with
    stride `doc_stride`). Every span becomes one feature laid out as
    ``[CLS] question [SEP] span [SEP]`` and zero-padded to `max_seq_length`.

    Args:
        examples: iterable of objects exposing `question_text`, `doc_tokens`,
            `is_impossible`, and (for training) `start_position`,
            `end_position`, `orig_answer_text`.
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        max_seq_length: length every feature is padded to.
        doc_stride: step between the starts of consecutive spans.
        max_query_length: question tokens beyond this count are dropped.
        is_training: when True, answer positions are computed per span.

    Returns:
        A list of InputFeatures. In training mode, spans that do not fully
        contain the answer (and all spans of impossible examples) get
        start/end position 0, i.e. the [CLS] token.

    Raises:
        ValueError: if the question leaves no room for document tokens.
    """

    unique_id = 1000000000

    # Defined once: the type does not depend on the example.
    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
        "DocSpan", ["start", "length"])

    features = []
    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        # Two-way alignment between whitespace words and WordPiece tokens.
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training and example.is_impossible:
            tok_start_position = -1
            tok_end_position = -1
        if is_training and not example.is_impossible:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
        if max_tokens_for_doc <= 0 and all_doc_tokens:
            # With no room for document tokens the window below would never
            # advance; fail loudly instead of looping forever.
            raise ValueError(
                "max_seq_length ({}) is too small for a question of {} tokens".format(
                    max_seq_length, len(query_tokens)))

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            start_position = None
            end_position = None
            if is_training and not example.is_impossible:
                # For training, if our document chunk does not contain an
                # annotation we point both positions at [CLS] (index 0).
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                # doc_start/doc_end are WordPiece offsets, so they must be
                # compared with the WordPiece answer positions, not with the
                # whitespace-word positions stored on the example.
                out_of_span = not (tok_start_position >= doc_start and
                                   tok_end_position <= doc_end)
                if out_of_span:
                    start_position = 0
                    end_position = 0
                else:
                    # +2 skips [CLS] and the first [SEP] around the question.
                    doc_offset = len(query_tokens) + 2
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset

            if is_training and example.is_impossible:
                start_position = 0
                end_position = 0

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(tokens))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
                logger.info("token_is_max_context: %s" % " ".join([
                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
                ]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info(
                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                if is_training and example.is_impossible:
                    logger.info("impossible example")
                if is_training and not example.is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info(
                        "answer: %s" % (answer_text))

            features.append(
                InputFeatures(
                    unique_id=unique_id,
                    example_index=example_index,
                    doc_span_index=doc_span_index,
                    tokens=tokens,
                    token_to_orig_map=token_to_orig_map,
                    token_is_max_context=token_is_max_context,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=example.is_impossible))
            unique_id += 1

    return features
def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""

    # SQuAD answers are annotated at character level and first projected onto
    # whitespace-separated words. After WordPiece tokenization a tighter span
    # inside that word range often matches the answer exactly, e.g.
    #
    #   Question: What year was John Smith born?
    #   Context: The leader was John Smith (1895-1943).
    #   Answer: 1895
    #
    # The word-level answer is "(1895-1943)." but the tokens are
    # "( 1895 - 1943 ) .", so the exact answer "1895" can be selected.
    #
    # This does not always work:
    #
    #   Question: What country is the top exporter of electronics?
    #   Context: The Japanese electronics industry is the largest in the world.
    #   Answer: Japan
    #
    # "Japan" is a character sub-span of "Japanese", which the tokenizer does
    # not split, so the original span ("Japanese") is kept. This is fairly
    # rare in SQuAD, but does happen.
    target = " ".join(tokenizer.tokenize(orig_answer_text))

    # Candidates are visited from the earliest start and, for each start,
    # from the latest end; the first exact match wins.
    matches = (
        (begin, finish)
        for begin in range(input_start, input_end + 1)
        for finish in range(input_end, begin - 1, -1)
        if " ".join(doc_tokens[begin:(finish + 1)]) == target
    )
    return next(matches, (input_start, input_end))
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""

    # With a sliding window, one token can show up in several spans, e.g.
    #
    #   Doc: the man went to the store and bought a gallon of milk
    #   Span A: the man went to the
    #   Span B: to the store and bought
    #   Span C: and bought a gallon of
    #   ...
    #
    # 'bought' is scored in both B and C. Only the span giving it the most
    # context should count, where context is the *minimum* of the number of
    # tokens to its left and to its right inside the span. Here that is
    # span C (1 left, 3 right) rather than span B (4 left, 0 right).
    top_score = None
    top_span_index = None
    for (candidate_index, span) in enumerate(doc_spans):
        last = span.start + span.length - 1
        if not (span.start <= position <= last):
            # The token is not inside this span.
            continue
        left_context = position - span.start
        right_context = last - position
        # The small length term separates spans with equal min-context.
        candidate_score = min(left_context, right_context) + 0.01 * span.length
        if top_score is None or candidate_score > top_score:
            top_score = candidate_score
            top_span_index = candidate_index

    return cur_span_index == top_span_index
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, verbose_logging,
                      is_version2, null_score_diff_threshold):
    """Write final predictions to the json file and log-odds of null if needed.

    Args:
        all_examples: examples in the order used to build the features
            (features refer to them through `example_index`).
        all_features: features exposing `example_index`, `unique_id`,
            `tokens`, `token_to_orig_map`, `token_is_max_context`.
        all_results: results exposing `unique_id`, `start_logits`,
            `end_logits`.
        n_best_size: number of start/end candidates considered per feature
            and maximum number of entries kept per example.
        max_answer_length: longest answer, in tokens, that is accepted.
        do_lower_case: forwarded to `get_final_text`.
        output_prediction_file: path of the JSON mapping id -> best text.
        output_nbest_file: path of the JSON mapping id -> n-best list.
        output_null_log_odds_file: path of the JSON mapping id -> null score
            minus best non-null score; written only when `is_version2`.
        verbose_logging: forwarded to `get_final_text`.
        is_version2: when True, the empty answer competes with real spans.
        null_score_diff_threshold: the empty answer is predicted when the
            score difference exceeds this value.
    """
    logger.info("Writing predictions to: %s" % (output_prediction_file))
    logger.info("Writing nbest to: %s" % (output_nbest_file))

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])

    # Defined once: the type does not depend on the example.
    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score

        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if is_version2:
                feature_null_score = result.start_logits[0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if is_version2:
            # The null answer is represented by span (0, 0), i.e. [CLS].
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))

        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
                tok_text = " ".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                orig_text = " ".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
                if final_text in seen_predictions:
                    continue
                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if is_version2:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="", start_logit=null_start_logit,
                        end_logit=null_end_logit))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not is_version2:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        elif best_non_null_entry is None:
            # Every candidate was the empty answer, so there is nothing to
            # compare the null score against: predict "" and record the null
            # score itself rather than dereferencing None.
            scores_diff_json[example.qas_id] = score_null
            all_predictions[example.qas_id] = ""
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null - best_non_null_entry.start_logit - (
                best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    if is_version2:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text.

    `orig_text` is the span of original (whitespace-tokenized) text that
    corresponds to the predicted span; `pred_text` is the normalized,
    de-tokenized prediction. Example:

        pred_text = steve smith
        orig_text = Steve Smith's

    Returning `orig_text` would keep the unwanted "'s"; returning
    `pred_text` would return normalized text. The wanted result is
    "Steve Smith", obtained through a character-to-character alignment
    between the two strings. Whenever that alignment cannot be established,
    `orig_text` is returned unchanged.
    """

    def _strip_spaces(text):
        # Returns the text without spaces and a map from each index in the
        # stripped text to the index of the same character in `text`.
        kept = [(pos, ch) for (pos, ch) in enumerate(text) if ch != " "]
        ns_to_s_map = collections.OrderedDict(
            (ns_pos, s_pos) for (ns_pos, (s_pos, _)) in enumerate(kept))
        ns_text = "".join(ch for (_, ch) in kept)
        return (ns_text, ns_to_s_map)

    # Tokenize `orig_text` with the basic tokenizer and locate the prediction
    # inside the re-joined result.
    basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(basic_tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info(
                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    # Equal length after removing spaces is taken to mean the characters are
    # aligned one-to-one; otherwise the heuristic has failed.
    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
                        orig_ns_text, tok_ns_text)
        return orig_text

    # Inverse of tok_ns_to_s_map: index in `tok_text` -> index without spaces.
    tok_s_to_ns_map = {
        s_pos: ns_pos for (ns_pos, s_pos) in tok_ns_to_s_map.items()}

    def _project(tok_position):
        # Index in `tok_text` -> index in `orig_text`, or None if unmapped.
        ns_position = tok_s_to_ns_map.get(tok_position)
        if ns_position is None:
            return None
        return orig_ns_to_s_map.get(ns_position)

    orig_start_position = _project(start_position)
    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = _project(end_position)
    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    return orig_text[orig_start_position:(orig_end_position + 1)]
def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list.

    Returns the indexes of the `n_best_size` largest values in `logits`,
    best first; ties keep their original order. Fewer indexes are returned
    when `logits` is shorter, and none when `n_best_size` is not positive.
    """
    ranked = sorted(enumerate(logits), key=lambda pair: pair[1], reverse=True)
    # max(..., 0) keeps a negative size from turning into a "drop the tail"
    # slice.
    return [index for (index, _) in ranked[:max(n_best_size, 0)]]


def _compute_softmax(scores):
    """Compute softmax probability over raw logits.

    Returns a list of probabilities aligned with `scores`, or an empty list
    when `scores` is empty.
    """
    if not scores:
        return []

    # Subtracting the maximum keeps every exponent <= 0, so exp() cannot
    # overflow.
    max_score = max(scores)
    exp_scores = [math.exp(score - max_score) for score in scores]
    total_sum = math.fsum(exp_scores)
    return [value / total_sum for value in exp_scores]


def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier with linear warm-up then linear decay.

    Args:
        x: training progress, as a fraction of the total number of steps.
        warmup: fraction of training spent warming up.

    Returns:
        `x / warmup` while `x < warmup`, and `1.0 - x` afterwards.
    """
    if x < warmup:
        return x / warmup
    return 1.0 - x
Questions longer than this will " + "be truncated to this length.") + parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") + parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") + parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " + "of training.") + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + parser.add_argument("--verbose_logging", default=False, action='store_true', + help="If true, all of the warnings related to data processing will be printed. 
" + "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--no_cuda", + default=False, + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--fp16', + default=False, + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type=float, default=0, + help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid 
gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if args.do_train: + if not args.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if args.do_predict: + if not args.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory () already exists and is not empty.") + os.makedirs(args.output_dir, exist_ok=True) + + tokenizer = BertTokenizer.from_pretrained(args.bert_model) + + train_examples = None + num_train_steps = None + if args.do_train: + train_examples = read_squad_examples( + input_file=args.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) + + # Prepare model + model = BertForQuestionAnswering.from_pretrained(args.bert_model, + cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) + + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + 
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + + t_total = num_train_steps + if args.local_rank != -1: + t_total = t_total // torch.distributed.get_world_size() + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=t_total) + + global_step = 0 + if args.do_train: + cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( + args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) + train_features = None + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) + except: + train_features = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=True) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving train features into cached file %s", cached_train_features_file) + with open(cached_train_features_file, "wb") as writer: + 
pickle.dump(train_features, writer) + logger.info("***** Running training *****") + logger.info(" Num orig examples = %d", len(train_examples)) + logger.info(" Num split examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) + all_is_impossibles = torch.tensor([int(f.is_impossible) for f in train_features], dtype=torch.long) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions, all_is_impossibles) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + model.train() + for _ in trange(int(args.num_train_epochs), desc="Epoch"): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + if n_gpu == 1: + batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self + input_ids, input_mask, segment_ids, start_positions, end_positions, _ = batch + loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + if (step + 1) % args.gradient_accumulation_steps == 0: + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + # Save a trained model + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) + + # Load a trained model that you have fine-tuned + model_state_dict = torch.load(output_model_file) + model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict) + model.to(device) + + if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + eval_examples = read_squad_examples( + input_file=args.predict_file, is_training=False) + eval_features = convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=False) + + logger.info("***** Running predictions *****") + logger.info(" Num orig examples = %d", len(eval_examples)) + logger.info(" Num split examples = %d", len(eval_features)) + logger.info(" Batch size = %d", args.predict_batch_size) + + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) 
+ eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) + + model.eval() + all_results = [] + logger.info("Start evaluating") + for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): + if len(all_results) % 1000 == 0: + logger.info("Processing example: %d" % (len(all_results))) + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + with torch.no_grad(): + batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) + for i, example_index in enumerate(example_indices): + start_logits = batch_start_logits[i].detach().cpu().tolist() + end_logits = batch_end_logits[i].detach().cpu().tolist() + eval_feature = eval_features[example_index.item()] + unique_id = int(eval_feature.unique_id) + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + output_prediction_file = os.path.join(args.output_dir, "predictions.json") + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") + write_predictions(eval_examples, eval_features, all_results, + args.n_best_size, args.max_answer_length, + args.do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, args.verbose_logging, True, args.null_score_diff_threshold) + + +if __name__ == "__main__": + main() diff --git a/examples/run_swag.py b/examples/run_swag.py new file mode 100644 index 00000000000000..3fb87ae3e77882 --- /dev/null +++ b/examples/run_swag.py @@ -0,0 +1,536 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +import logging +import os +import argparse +import random +from tqdm import tqdm, trange +import csv + +import numpy as np +import torch +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForMultipleChoice +from pytorch_pretrained_bert.optimization import BertAdam +from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE + +logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt = '%m/%d/%Y %H:%M:%S', + level = logging.INFO) +logger = logging.getLogger(__name__) + + +class SwagExample(object): + """A single training/test example for the SWAG dataset.""" + def __init__(self, + swag_id, + context_sentence, + start_ending, + ending_0, + ending_1, + ending_2, + ending_3, + label = None): + self.swag_id = swag_id + self.context_sentence = context_sentence + self.start_ending = start_ending + self.endings = [ + ending_0, + ending_1, + ending_2, + ending_3, + ] + self.label = label + + def __str__(self): + return self.__repr__() + + def __repr__(self): + l = [ + f"swag_id: {self.swag_id}", + f"context_sentence: {self.context_sentence}", + f"start_ending: {self.start_ending}", + f"ending_0: 
{self.endings[0]}", + f"ending_1: {self.endings[1]}", + f"ending_2: {self.endings[2]}", + f"ending_3: {self.endings[3]}", + ] + + if self.label is not None: + l.append(f"label: {self.label}") + + return ", ".join(l) + + +class InputFeatures(object): + def __init__(self, + example_id, + choices_features, + label + + ): + self.example_id = example_id + self.choices_features = [ + { + 'input_ids': input_ids, + 'input_mask': input_mask, + 'segment_ids': segment_ids + } + for _, input_ids, input_mask, segment_ids in choices_features + ] + self.label = label + + +def read_swag_examples(input_file, is_training): + with open(input_file, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + lines = list(reader) + + if is_training and lines[0][-1] != 'label': + raise ValueError( + "For training, the input file must contain a label column." + ) + + examples = [ + SwagExample( + swag_id = line[2], + context_sentence = line[4], + start_ending = line[5], # in the swag dataset, the + # common beginning of each + # choice is stored in "sent2". + ending_0 = line[7], + ending_1 = line[8], + ending_2 = line[9], + ending_3 = line[10], + label = int(line[11]) if is_training else None + ) for line in lines[1:] # we skip the line with the column names + ] + + return examples + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + is_training): + """Loads a data file into a list of `InputBatch`s.""" + + # Swag is a multiple choice task. To perform this task using Bert, + # we will use the formatting proposed in "Improving Language + # Understanding by Generative Pre-Training" and suggested by + # @jacobdevlin-google in this issue + # https://github.com/google-research/bert/issues/38. + # + # Each choice will correspond to a sample on which we run the + # inference. 
For a given Swag example, we will create the 4 + # following inputs: + # - [CLS] context [SEP] choice_1 [SEP] + # - [CLS] context [SEP] choice_2 [SEP] + # - [CLS] context [SEP] choice_3 [SEP] + # - [CLS] context [SEP] choice_4 [SEP] + # The model will output a single value for each input. To get the + # final decision of the model, we will run a softmax over these 4 + # outputs. + features = [] + for example_index, example in enumerate(examples): + context_tokens = tokenizer.tokenize(example.context_sentence) + start_ending_tokens = tokenizer.tokenize(example.start_ending) + + choices_features = [] + for ending_index, ending in enumerate(example.endings): + # We create a copy of the context tokens in order to be + # able to shrink it according to ending_tokens + context_tokens_choice = context_tokens[:] + ending_tokens = start_ending_tokens + tokenizer.tokenize(ending) + # Modifies `context_tokens_choice` and `ending_tokens` in + # place so that the total length is less than the + # specified length. Account for [CLS], [SEP], [SEP] with + # "- 3" + _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) + + tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"] + segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ padding = [0] * (max_seq_length - len(input_ids)) + input_ids += padding + input_mask += padding + segment_ids += padding + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + choices_features.append((tokens, input_ids, input_mask, segment_ids)) + + label = example.label + if example_index < 5: + logger.info("*** Example ***") + logger.info(f"swag_id: {example.swag_id}") + for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features): + logger.info(f"choice: {choice_idx}") + logger.info(f"tokens: {' '.join(tokens)}") + logger.info(f"input_ids: {' '.join(map(str, input_ids))}") + logger.info(f"input_mask: {' '.join(map(str, input_mask))}") + logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}") + if is_training: + logger.info(f"label: {label}") + + features.append( + InputFeatures( + example_id = example.swag_id, + choices_features = choices_features, + label = label + ) + ) + + return features + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + +def accuracy(out, labels): + outputs = np.argmax(out, axis=1) + return np.sum(outputs == labels) + +def select_field(features, field): + return [ + [ + choice[field] + for choice in feature.choices_features + ] + for feature in features + ] + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .csv files (or other data files) for the task.") + parser.add_argument("--bert_model", default=None, type=str, required=True, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " + "bert-base-multilingual-cased, bert-base-chinese.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + + ## Other parameters + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Set this flag if you are using an uncased model.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=8, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument('--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type=float, default=0, + help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + os.makedirs(args.output_dir, exist_ok=True) + + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) + + train_examples = None + num_train_steps = None + if args.do_train: + train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) + num_train_steps = int( + len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) + + # Prepare model + model = BertForMultipleChoice.from_pretrained(args.bert_model, + 
cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), + num_choices=4) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + t_total = num_train_steps + if args.local_rank != -1: + t_total = t_total // torch.distributed.get_world_size() + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=t_total) + + global_step = 0 + if args.do_train: + train_features = convert_examples_to_features( + train_examples, tokenizer, args.max_seq_length, True) + logger.info("***** Running 
training *****") + logger.info(" Num examples = %d", len(train_examples)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) + all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) + all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) + if args.local_rank == -1: + train_sampler = RandomSampler(train_data) + else: + train_sampler = DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + model.train() + for _ in trange(int(args.num_train_epochs), desc="Epoch"): + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + loss = model(input_ids, segment_ids, input_mask, label_ids) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. 
+ if args.fp16 and args.loss_scale != 1.0: + # rescale loss for fp16 training + # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html + loss = loss * args.loss_scale + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + if (step + 1) % args.gradient_accumulation_steps == 0: + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + # Save a trained model + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + torch.save(model_to_save.state_dict(), output_model_file) + + # Load a trained model that you have fine-tuned + model_state_dict = torch.load(output_model_file) + model = BertForMultipleChoice.from_pretrained(args.bert_model, + state_dict=model_state_dict, + num_choices=4) + model.to(device) + + if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training = True) + eval_features = convert_examples_to_features( + eval_examples, tokenizer, args.max_seq_length, True) + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) + all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) + all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), 
dtype=torch.long) + all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) + # Run prediction for full data + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + model.eval() + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + label_ids = label_ids.to(device) + + with torch.no_grad(): + tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) + logits = model(input_ids, segment_ids, input_mask) + + logits = logits.detach().cpu().numpy() + label_ids = label_ids.to('cpu').numpy() + tmp_eval_accuracy = accuracy(logits, label_ids) + + eval_loss += tmp_eval_loss.mean().item() + eval_accuracy += tmp_eval_accuracy + + nb_eval_examples += input_ids.size(0) + nb_eval_steps += 1 + + eval_loss = eval_loss / nb_eval_steps + eval_accuracy = eval_accuracy / nb_eval_examples + + result = {'eval_loss': eval_loss, + 'eval_accuracy': eval_accuracy, + 'global_step': global_step, + 'loss': tr_loss/nb_tr_steps} + + output_eval_file = os.path.join(args.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + main() diff --git "a/likunlin-\344\274\230\345\214\226-Copy1.ipynb" "b/likunlin-\344\274\230\345\214\226-Copy1.ipynb" new file mode 100644 index 00000000000000..5e459999f3fce0 --- /dev/null +++ "b/likunlin-\344\274\230\345\214\226-Copy1.ipynb" @@ -0,0 +1,1396 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n", + "04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.modeling - loading archive file 
/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "04/16/2019 09:11:27 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case:在标记化时将文本转换为小写。默认= True\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13579\n", + "['i', 'ari', '##ve', 'home', '.']\n" + ] + } + ], + "source": [ + "print(tokenizer.vocab['doubts'])\n", + "print(tokenizer.tokenize(\"I arive home.\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BertForPreTraining:\n", + "Outputs:\n", + " if `masked_lm_labels` and `next_sentence_label` are not `None`:\n", + " Outputs the total_loss which is the sum of the masked language modeling loss and the next\n", + " sentence classification loss.\n", + " if `masked_lm_labels` or `next_sentence_label` is `None`:\n", + 
" Outputs a tuple comprising\n", + " - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and\n", + " - the next sentence classification logits of shape [batch_size, 2]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from_pretrained:\n", + "Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.\n", + "Download and cache the pre-trained model file if needed." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "04/16/2019 09:34:51 - INFO - examples.extract_features - tokens: [CLS] i love you . hello everybody . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[103, 1045, 2293, 103, 1012, 7592, 103, 1012, 102]\n", + "[101, 103, 2293, 2017, 103, 7592, 7955, 103, 102]\n", + "[101, 1045, 103, 2017, 1012, 103, 7955, 1012, 103]\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0mmasked_feature_copies\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatches\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcopy_and_mask_feature\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m 
\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmasked_feature_copies\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#结果[101, 1045, 2293, 103, 102]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "import re\n", + "def convert_text_to_examples(text): #把每一行的句子变成一个实例,一个实例中包含text_a,text_b(text_b目前是没用的)\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'\n", + " \n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.\n", + " text_b = m.group(2) #匹配的第二句,比如I love you.\n", + " \n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "#疑问,当text是一行的时候,line是一个个字母 -> text是[\"***\"]的形式\n", + "#print(convert_text_to_examples({\"I love you.\",\"hello everybody.\"})[0].text_a)\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " #把实例变成一个特征\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a) #tokenizer的作用是\n", + " #print(example.unique_id) #*****************************\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " 
input_type_ids = [] #segment embedding\n", + " if append_special_tokens: #输入参数中默认为true\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + " #print(tokens) #*******************************\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号\n", + " input_mask = [1] * len(input_ids) \n", + " #print(input_ids)#***********************************\n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,\n", + " tokens=tokens,\n", + " input_ids=input_ids,#字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask, #一堆1\n", + " input_type_ids=input_type_ids)) #第0类和第1类,对text_a,text_b的区分,本代码中全都是零\n", + " return features\n", + " \n", + "\n", + "\n", + "def copy_and_mask_feature(feature, step, masked_tokens=None): #step参数用来表示每多少个单词mask一次\n", + " import copy\n", + " tokens = feature.tokens\n", + " len_token = len(tokens)\n", + " if len_token 0\n", + " masked_feature_copies = []\n", + 
" for i in batches: #用[mask]依次掩盖每一个位置\n", + " feature_copy = copy.deepcopy(feature)\n", + " masked_pos = i\n", + " while masked_pos < len_token:\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_pos = masked_pos + step\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, batches\n", + "\n", + "#examples = convert_text_to_examples({\"I love you.Hello everybody.\"})\n", + "#features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + "#masked_feature_copies, batches = copy_and_mask_feature(features[0],3)\n", + "#for i in range(0,5):\n", + "# print(masked_feature_copies[i].input_ids) #结果[101, 1045, 2293, 103, 102]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG\n", + "\n", + "def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): #输出结果的函数,要最高概率topk个输出\n", + " def print_pair(token, prob, end_str='', hit_mark=' '):\n", + " if i < firstk:\n", + " # token = token.replace('', '').replace('\\n', '/n')\n", + " print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)\n", + " \n", + " ret = None\n", + " for i in range(len(tokens)):\n", + " ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item() #这个probs是该字符串第i个位置上填上词典上各个词的概率,prob_是词典上原来天的这个词的概率\n", + " print_pair(tokens[i], prob_, end_str='\\t')\n", + " values, indices = probs[i].topk(topk)\n", + " #print(values, indices)\n", + " #print(\"****************************************************************************************************\")\n", + " top_pairs = []\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " hit_mark = '*' if ind == ind_ else ' '\n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print_pair(token, prob, 
hit_mark=hit_mark, end_str='' if j < topk - 1 else '\\n')\n", + " top_pairs.append((token, prob))\n", + " if tokens[i] == \"[MASK]\":\n", + " ret = top_pairs\n", + " return ret #返回的这是个啥" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import colored\n", + "from colored import stylize\n", + "\n", + "def show_abnormals(tokens, probs, show_suggestions=False):\n", + " def gap2color(gap):\n", + " if gap <= 5:\n", + " return 'yellow_1'\n", + " elif gap <= 10:\n", + " return 'orange_1'\n", + " else:\n", + " return 'red_1'\n", + " \n", + " def print_token(token, suggestion, gap):\n", + " if gap == 0:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " if show_suggestions and gap > 5:\n", + " print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')\n", + " # print('/' + suggestion, end=' ')\n", + " # print('%.2f' % gap, end=' ')\n", + " \n", + " avg_gap = 0.\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " ind_ = tokenizer.vocab[tokens[i]]\n", + " prob_ = probs[i][ind_].item()\n", + " top_prob = probs[i].max().item()\n", + " top_ind = probs[i].argmax().item()\n", + " gap = math.log(top_prob) - math.log(prob_) #计算两个词之间的差距\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " print_token(tokens[i], suggestion, gap)\n", + " avg_gap += gap\n", + " avg_gap /= (len(tokens) - 2)\n", + " print()\n", + " print('平均gap:'+ str(avg_gap))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "['[CLS]', 'last', 'week', 'i', 'went', 'to', 'the', 'theatre', '.', 'i', 'had', 'a', 'very', 'good', 'seat', '.', 'the', 'play', 'was', 'very', 'interesting', '.', '[SEP]']\n", + "[[101, 2197, 2733, 1045, 2253, 2000, 1996, 3004, 1012, 102], [101, 1045, 2018, 1037, 2200, 2204, 2835, 1012, 102], [101, 1996, 2377, 2001, 2200, 5875, 1012, 102]]\n", + "[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6]]\n", + "[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]]\n", + "['Last week I went to the theatre.', ' I had a very good seat.', ' The play was very interesting.']\n" + ] + } + ], + "source": [ + "analyzed_cache = {}\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG\n", + "#print (lemma('gave'))\n", + "#print (lexeme('production'))\n", + "#print (conjugate(verb='give',tense=PRESENT,number=SG))\n", + "def process_text(text): \n", + "#处理输入文本,包括将文本按句子分成若干token,得出原来text中index位置的单词在x句子的y位置,还得出各个句子类别码\n", + " token =[]\n", + " token0 = tokenizer.tokenize(text)\n", + " token.append('[CLS]')\n", + " for i in token0:\n", + " token.append(i)\n", + " token.append('[SEP]')\n", + " print(token)\n", + " in_sentence = [[0,0]] \n", + " sentence_n = 0\n", + " index = 1\n", + " for i in range(1,len(token)-1):\n", + " in_sentence.append([sentence_n,index]) #每个token中的词在所在句中的位置表示出来,以及该位置在哪一句中\n", + " index = index + 1 #比如,位置i这个词在第sentence句的index位置上\n", + " if token[i] == '.':\n", + " sentence_n = sentence_n + 1\n", + " index = 1\n", + " sentences = text.split(\".\")\n", + " sentences.remove('')\n", + "\n", + " sen_token = []\n", + " input_ids_sen = []\n", + " input_type_ids_sen = []\n", + " for i,sentence in enumerate(sentences):\n", + " sentence = sentence + '.'\n", + " sentences[i] = sentences[i] + '.'\n", + " token = []\n", + " input_type_ids = []\n", + " tokens = 
tokenizer.tokenize(sentence)\n", + " token.append('[CLS]')\n", + " input_type_ids.append(0) \n", + " for i in tokens:\n", + " token.append(i)\n", + " input_type_ids.append(0) \n", + " token.append('[SEP]') \n", + " input_type_ids.append(0)\n", + " input_ids_sen.append(tokenizer.convert_tokens_to_ids(token))\n", + " input_type_ids_sen.append(input_type_ids)\n", + " #input_ids_sen = torch.tensor(input_ids_sen)\n", + " #input_type_ids_sen = torch.tensor(input_type_ids_sen)\n", + " return input_ids_sen,input_type_ids_sen,in_sentence,sentences\n", + "text = \"Last week I went to the theatre. I had a very good seat. The play was very interesting.\"\n", + "input_ids_sen,input_type_ids_sen,in_sentence,sentences = process_text(text)\n", + "print(input_ids_sen)\n", + "print(in_sentence)\n", + "print(input_type_ids_sen)\n", + "print(sentences)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这个函数是在该位置上的单词可能性很低时才使用,不会把原来就较为合理的面目全非" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "是否用不定式:\n", + "用to的可能性0.00036304089007899165\n", + "可能性最大的词概率0.23709583282470703\n", + "是否用被动或进行时:\n", + "[('was', 0.0002793713065329939), ('am', 4.049863855470903e-05), ('were', 1.306664398725843e-05), ('been', 4.840642304770881e-06), ('be', 1.453689151276194e-06), ('are', 7.996850399649702e-07), ('is', 5.958298174846277e-07), ('being', 9.706550230248467e-09)]\n", + "had 0.8573063611984253\n", + "was 0.0002793713065329939\n", + "不是被动\n", + "[('was', 0.9590925574302673), ('am', 0.006898669525980949), ('were', 0.0016424404457211494), ('been', 0.0004373548727016896), ('is', 0.00035717932041734457), ('be', 3.4134478482883424e-05), ('are', 2.2988733689999208e-05), ('being', 3.1775894626662193e-07)]\n", + "was 0.9590925574302673\n", + "was 0.9590925574302673\n", + "判断其他语法:\n", + "need_be == 1\n", + "['go', 'goes', 'going', 'went', 'gone']\n", + 
"[2175, 3632, 2183, 2253, 2908]\n", + "{'go': 0.00043932811240665615, 'goes': 0.00012179886834928766, 'going': 0.6597349047660828, 'went': 0.00122930109500885, 'gone': 0.002755501540377736}\n", + "going\n", + "was going\n" + ] + }, + { + "data": { + "text/plain": [ + "'was going'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import copy\n", + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "def analyse_V(index):\n", + "#这是一个处理动词语法问题的函数,输入为问题词在text的token中的下标index\n", + "\n", + "#******************************************初始数据处理**************************************************************************\n", + " need_to = 0 #表示是否需要变为不定式形式,0表示不需要,1表示需要\n", + " need_be = 0 #表示是否需要变为被动语态0表示不需要,1表示需要\n", + " \n", + " sentence_id = in_sentence[index][0]\n", + " id_in_sen = in_sentence[index][1]\n", + " wordV = input_ids_sen[sentence_id][id_in_sen]\n", + " wordV = tokenizer.ids_to_tokens[wordV]\n", + " \n", + " input_ids = copy.deepcopy(input_ids_sen[sentence_id])\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "#*****************************************判断语法应不应该是不定式抑或是被动语态**************************************************************\n", + " '''\n", + " input_ids1 = copy.deepcopy(input_ids)\n", + " input_ids1.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_type_ids1 = copy.deepcopy(input_type_ids)\n", + " input_type_ids1.append(0)\n", + " \n", + " T_input_ids1 = torch.tensor([input_ids1], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_ids1 = T_input_ids1.to(device) #拿去GPU\n", + "\n", + " T_input_type_ids1 = torch.tensor([input_type_ids1], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_type_ids1 = T_input_type_ids1.to(device) \n", + " \n", + " mlm_logits1, _ = model(T_input_ids1, T_input_type_ids1)\n", + " mlm_probs1 = F.softmax(mlm_logits1, 
dim=-1)\n", + " reduced_mlm_probs1 = mlm_probs1[0][id_in_sen]\n", + " '''\n", + "#**************************************判断是不是不定式********************* \n", + " input_ids1 = copy.deepcopy(input_ids)\n", + " input_ids1.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_ids1[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)]\n", + " input_type_ids1 = copy.deepcopy(input_type_ids)\n", + " input_type_ids1.append(0)\n", + " \n", + " T_input_ids1 = torch.tensor([input_ids1], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_ids1 = T_input_ids1.to(device) #拿去GPU\n", + "\n", + " T_input_type_ids1 = torch.tensor([input_type_ids1], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_type_ids1 = T_input_type_ids1.to(device) \n", + " \n", + " mlm_logits1, _ = model(T_input_ids1, T_input_type_ids1)\n", + " mlm_probs1 = F.softmax(mlm_logits1, dim=-1)\n", + " reduced_mlm_probs1 = mlm_probs1[0][id_in_sen]\n", + " \n", + " prob_to = float(reduced_mlm_probs1[tokenizer.vocab[\"to\"]])\n", + " top_prob1 = reduced_mlm_probs1.max().item()\n", + " print(\"是否用不定式:\")\n", + " print(\"用to的可能性\"+str(prob_to))\n", + " print(\"可能性最大的词概率\"+str(top_prob1))\n", + " gap1 = math.log(top_prob1) - math.log(prob_to)\n", + " if gap1 < 1:\n", + " need_to = 1 \n", + "#**************************************判断是不是被动语态或者进行时******************* \n", + " print(\"是否用被动或进行时:\")\n", + " input_ids3 = copy.deepcopy(input_ids)\n", + " input_ids3.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_ids3_ = copy.deepcopy(input_ids3)\n", + " input_ids3[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)]\n", + " input_ids3_[id_in_sen + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]\n", + " input_type_ids3 = copy.deepcopy(input_type_ids)\n", + " input_type_ids3.append(0)\n", + " \n", + " T_input_ids3 = torch.tensor([input_ids3], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_ids3 = 
T_input_ids3.to(device) #拿去GPU\n", + " T_input_ids3_ = torch.tensor([input_ids3_], dtype=torch.long)\n", + " T_input_ids3_ = T_input_ids3_.to(device)\n", + "\n", + " T_input_type_ids3 = torch.tensor([input_type_ids3], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_type_ids3 = T_input_type_ids3.to(device) \n", + " \n", + " mlm_logits3, _ = model(T_input_ids3, T_input_type_ids3)\n", + " mlm_logits3_,_ = model(T_input_ids3_, T_input_type_ids3)\n", + " mlm_probs3 = F.softmax(mlm_logits3, dim=-1)\n", + " reduced_mlm_probs3 = mlm_probs3[0][id_in_sen]\n", + " mlm_probs3_= F.softmax(mlm_logits3_, dim=-1)\n", + " reduced_mlm_probs3_ = mlm_probs3_[0][id_in_sen]\n", + " \n", + " list_be = lexeme('be')\n", + " list_be = lexeme('be')[:8]\n", + "\n", + " list_be_id = tokenizer.convert_tokens_to_ids(list_be)\n", + " list_be_prob = {}\n", + " for word,word_id in zip(list_be,list_be_id):\n", + " list_be_prob.update({word:float(reduced_mlm_probs3[word_id].data)})\n", + " prob_ord3 = sorted(list_be_prob.items(),key = lambda x:x[1],reverse = True)\n", + " print(prob_ord3)\n", + " top_ind3 = reduced_mlm_probs3.argmax().item()\n", + " top_prob3 = reduced_mlm_probs3.max().item()\n", + " print(tokenizer.ids_to_tokens[top_ind3],top_prob3)\n", + " print(prob_ord3[0][0],prob_ord3[0][1])\n", + " top_prob_be = prob_ord3[0][1]\n", + " gap3 = math.log(top_prob3) - math.log(top_prob_be)\n", + " if gap3 < 1:\n", + " need_be = 1 \n", + " be_ = prob_ord3[0][0]\n", + " else:\n", + " print('不是被动')\n", + "#*******************************************是不是现在分词******************************** \n", + " list_be_prob = {}\n", + " for word,word_id in zip(list_be,list_be_id):\n", + " list_be_prob.update({word:float(reduced_mlm_probs3_[word_id].data)})\n", + " prob_ord3 = sorted(list_be_prob.items(),key = lambda x:x[1],reverse = True)\n", + " print(prob_ord3)\n", + " top_ind3 = reduced_mlm_probs3_.argmax().item()\n", + " top_prob3 = reduced_mlm_probs3_.max().item()\n", + " 
print(tokenizer.ids_to_tokens[top_ind3],top_prob3)\n", + " print(prob_ord3[0][0],prob_ord3[0][1])\n", + " top_prob_be = prob_ord3[0][1]\n", + " gap3 = math.log(top_prob3) - math.log(top_prob_be)\n", + " if gap3 < 1:\n", + " need_be = 1 \n", + " be_ = prob_ord3[0][0] \n", + "#*************************************************判断其他语法******************************************************************\n", + " print(\"判断其他语法:\")\n", + " if need_to == 0 and need_be == 0:\n", + " input_ids[id_in_sen] = tokenizer.vocab[\"[MASK]\"]\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "\n", + " T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_ids = T_input_ids.to(device) #拿去GPU\n", + " T_input_type_ids = T_input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(T_input_ids, T_input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + " reduced_mlm_probs = mlm_probs[0][id_in_sen]\n", + "\n", + " list_word = lexeme(wordV)\n", + " #list_word = [word]\n", + "\n", + " list_word_id = tokenizer.convert_tokens_to_ids(list_word)\n", + " print(list_word)\n", + " print(list_word_id) \n", + " list_word_prob = {}\n", + " for word,word_id in zip(list_word,list_word_id):\n", + " list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)})\n", + " print(list_word_prob)\n", + " prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True)\n", + "\n", + " top_ind = reduced_mlm_probs.argmax().item()\n", + " top_prob = reduced_mlm_probs.max().item()\n", + " top_prob_thisV = prob_ord[0][1]\n", + " gap = math.log(top_prob) - math.log(top_prob_thisV)\n", + " \n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " sentence = copy.deepcopy(sentences[sentence_id])\n", + " sentence = tokenizer.tokenize(sentence)\n", + " sentence[id_in_sen - 1] = suggestion\n", + " sentence_tag = 
nltk.pos_tag(sentence)\n", + " \n", + " suggestion_tag = sentence_tag[id_in_sen - 1][1]\n", + " #print(sentence_tag[id_in_sen - 1][0])\n", + " print(suggestion_tag)\n", + " \n", + " if gap < 5 or suggestion_tag.find(\"V\")==-1:\n", + " suggestion = prob_ord[0][0]\n", + " \n", + " \n", + "\n", + " \"\"\"”values, indices = reduced_mlm_probs.topk(topk)\n", + " for j in range(topk):\n", + " ind, prob = indices[j].item(), values[j].item()\n", + " \n", + " token = tokenizer.ids_to_tokens[ind]\n", + " print(token,prob)\"\"\"\n", + " elif need_to == 1:\n", + " input_ids2 = copy.deepcopy(input_ids)\n", + " input_ids2.insert(id_in_sen,tokenizer.vocab[\"to\"])\n", + " input_ids2[id_in_sen + 1] = tokenizer.vocab[\"[MASK]\"]\n", + " T_input_ids2 = torch.tensor([input_ids2], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_ids2 = T_input_ids2.to(device) #拿去GPU\n", + " \n", + " input_type_ids2 = copy.deepcopy(input_type_ids1)\n", + " T_input_type_ids2 = torch.tensor([input_type_ids2], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_type_ids2 = T_input_type_ids2.to(device) \n", + " mlm_logits2, _ = model(T_input_ids2, T_input_type_ids2)\n", + " mlm_probs2 = F.softmax(mlm_logits2, dim=-1)\n", + " reduced_mlm_probs2 = mlm_probs2[0][id_in_sen + 1]\n", + " \n", + " thisV = conjugate(verb = wordV,tense=PRESENT,person = 1)\n", + " print(thisV)\n", + " #list_word = [wordV]\n", + " thisV_id = tokenizer.vocab[thisV]\n", + " \n", + " top_ind2 = reduced_mlm_probs2.argmax().item()\n", + " top_prob2 = reduced_mlm_probs2.max().item()\n", + " prob_thisV2 = reduced_mlm_probs2[thisV_id]\n", + " gap = math.log(top_prob2) - math.log(prob_thisV2)\n", + " \n", + " suggestion = tokenizer.ids_to_tokens[top_ind2]\n", + " sentence = copy.deepcopy(sentences[sentence_id])\n", + " sentence = tokenizer.tokenize(sentence)\n", + " sentence.insert(id_in_sen - 1,'to')\n", + " sentence[id_in_sen] = suggestion\n", + " print(\"sentence是:\",sentence)\n", + " sentence_tag = 
nltk.pos_tag(sentence)\n", + " \n", + " suggestion_tag = sentence_tag[id_in_sen][1]\n", + " if gap < 5 or suggestion_tag.find(\"V\")== -1:\n", + " suggestion = 'to '+ thisV\n", + " else:\n", + " suggestion = 'to '+ tokenizer.ids_to_tokens[top_ind2]\n", + " elif need_be == 1:#********************************处理需要be动词的时态*****************\n", + " print(\"need_be == 1\")\n", + " input_ids3 = copy.deepcopy(input_ids1)\n", + " input_ids3[id_in_sen] = tokenizer.vocab[be_]\n", + " input_ids3[id_in_sen + 1] = tokenizer.vocab[\"[MASK]\"]\n", + " T_input_ids3 = torch.tensor([input_ids3], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_ids3 = T_input_ids3.to(device) #拿去GPU\n", + " \n", + " input_type_ids3 = copy.deepcopy(input_type_ids1)\n", + " T_input_type_ids3 = torch.tensor([input_type_ids3], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_type_ids3 = T_input_type_ids3.to(device)\n", + " mlm_logits3, _ = model(T_input_ids3, T_input_type_ids3)\n", + " mlm_probs3 = F.softmax(mlm_logits3, dim=-1)\n", + " reduced_mlm_probs3 = mlm_probs3[0][id_in_sen + 1]\n", + " \n", + " list_word3 = lexeme(wordV)\n", + " #list_word = [wordV]\n", + " list_word_id3 = tokenizer.convert_tokens_to_ids(list_word3)\n", + " print(list_word3)\n", + " print(list_word_id3) \n", + " list_word_prob3 = {}\n", + " for word,word_id in zip(list_word3,list_word_id3):\n", + " list_word_prob3.update({word:float(reduced_mlm_probs3[word_id].data)})\n", + " print(list_word_prob3)\n", + " prob_ord3 = sorted(list_word_prob3.items(),key = lambda x:x[1],reverse = True)\n", + "\n", + " top_ind3 = reduced_mlm_probs3.argmax().item()\n", + " top_prob3 = reduced_mlm_probs3.max().item()\n", + " top_prob_thisV3 = prob_ord3[0][1]\n", + " gap = math.log(top_prob3) - math.log(top_prob_thisV3)\n", + " print(tokenizer.ids_to_tokens[top_ind3])\n", + " \n", + " suggestion = tokenizer.ids_to_tokens[top_ind3]\n", + " sentence = copy.deepcopy(sentences[sentence_id])\n", + " sentence = 
tokenizer.tokenize(sentence)\n", + " sentence.insert(id_in_sen -1,be_)\n", + " sentence[id_in_sen] = suggestion\n", + " #print(\"sentence是:\",sentence)\n", + " sentence_tag = nltk.pos_tag(sentence)\n", + " \n", + " suggestion_tag = sentence_tag[id_in_sen][1]\n", + " if gap < 5 or suggestion_tag.find(\"VB\")== -1:\n", + " suggestion = be_ + ' ' + prob_ord3[0][0]\n", + " else:\n", + " suggestion = be_ + ' ' + tokenizer.ids_to_tokens[top_ind3]\n", + " print(suggestion)\n", + " return suggestion\n", + " \n", + "analyse_V(4)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "ename": "IndentationError", + "evalue": "unexpected indent (, line 49)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m49\u001b[0m\n\u001b[0;31m T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n" + ] + } + ], + "source": [ + "from pattern.en import article,referenced,pluralize, singularize\n", + "def analyse_N(index):\n", + "#******************************************初始数据处理**************************************************************************\n", + " need_DT = 0 #表示是否需要在前面加冠词\n", + " prob_N = 0 #表示这个名词的单复数中最高的概率 \n", + " sentence_id = in_sentence[index][0]\n", + " id_in_sen = in_sentence[index][1]\n", + " wordN = input_ids_sen[sentence_id][id_in_sen]\n", + " wordN = tokenizer.ids_to_tokens[wordN]\n", + " \n", + " input_ids = copy.deepcopy(input_ids_sen[sentence_id])\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "#*****************************************若一个词有问题************************************************************************* \n", + " input_ids[id_in_sen] = tokenizer.vocab[\"[MASK]\"]\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "\n", + " T_input_ids = 
torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_ids = T_input_ids.to(device) #拿去GPU\n", + " T_input_type_ids = T_input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(T_input_ids, T_input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + " reduced_mlm_probs = mlm_probs[0][id_in_sen]\n", + " \n", + " N_ = singularize(wordN)\n", + " N_s= pluralize(wordN)\n", + " N_id = tokenizer.vocab[N_]\n", + " N_s_id = tokenizer.vocab[N_s]\n", + " if(reduced_mlm_probs[N_id] > reduced_mlm_probs[N_s_id]):\n", + " suggestion = N_\n", + " prob_N = reduced_mlm_probs[N_id]\n", + " else:\n", + " suggestion = N_s\n", + " prob_N = reduced_mlm_probs[N_s_id]\n", + " \n", + " top_ind = reduced_mlm_probs.argmax().item()\n", + " top_prob = reduced_mlm_probs.max().item()\n", + " \n", + " gap = math.log(top_prob)- math.log(prob_N)\n", + " if gap > 6.5: #我觉得代词的阈值应该回比名词小一点\n", + " need_DT = 1 #不见棺材不落泪,认为缺冠词 \n", + " \n", + " \n", + " input_ids.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_ids.insert[id_in_sen + 1] = tokenizer.vocab[suggestion]\n", + " input_type_ids.append(0)\n", + " \n", + " T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_ids = T_input_ids.to(device) #拿去GPU\n", + " T_input_type_ids = T_input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(T_input_ids, T_input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + " reduced_mlm_probs = mlm_probs[0][id_in_sen]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "\n", + "def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=20):\n", + " step = 7\n", + " if text[0] in 
analyzed_cache: #分析过的缓存\n", + " features, mlm_probs = analyzed_cache[text[0]]\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " tokens = features[0].tokens \n", + " else:\n", + " examples = convert_text_to_examples(text)\n", + " features = convert_examples_to_features(examples, tokenizer, print_info=False)\n", + " given_mask = \"[MASK]\" in features[0].tokens\n", + " if not given_mask or masked_tokens is not None:\n", + " assert len(features) == 1\n", + " features, batches = copy_and_mask_feature(features[0],step, masked_tokens=masked_tokens)\n", + " #print(len(features))\n", + "\n", + " input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) #把input_ids增加了一个维度,变成[n_features,sequence_len]\n", + " #这里的n_features实际上是句子有多少批训练\n", + " \n", + " input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " input_ids = input_ids.to(device) #拿去GPU\n", + " input_type_ids = input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(input_ids, input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1) #最后一维,也就是vocab 换算成概率和为百分之百\n", + " #print(mlm_probs.size())#这里实验的是torch.Size([5, 5, 30522])\n", + " tokens = features[0].tokens #为了输出,[mask]在input_ids里面表示出来,features的token都一样\n", + " #print(tokens)\n", + " if not given_mask or masked_tokens is not None:\n", + " bsz, seq_len, vocab_size = mlm_probs.size() #三个维度分别是batch_size, sequence_length, vocab_size\n", + " assert bsz == len(batches)\n", + " # reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size)\n", + " # for i in range(seq_len):\n", + " # reduced_mlm_probs[0, i] = mlm_probs[i, i]\n", + " reduced_mlm_probs = torch.Tensor(1, len(tokens), vocab_size)\n", + " for i in batches:\n", + " pos = i\n", + " while pos < len(tokens):\n", + " reduced_mlm_probs[0, pos] = mlm_probs[i, pos]\n", + " pos = pos + step\n", + " mlm_probs = reduced_mlm_probs #压缩一下大小,节约不必要浪费的空间(只需要第i个batch里面[mask]位置的词汇表概率即可)\n", + " #tokens = [tokens[i] 
for i in masked_positions]\n", + " \n", + " analyzed_cache[text[0]] = (features, mlm_probs)\n", + " \n", + " top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs) #传入的probs是二维的\n", + " #print(top_pairs) #******************************\n", + " if not given_mask:\n", + " show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)\n", + " #return top_pairs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'analyze_text' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"I hate you.\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;31m#text =[\"Last week I go to the zoo. I had a very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. 
I got very angry.\"]#因为外面有中括号,所以是二维的\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0manalyze_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_firstk_probs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m#print(analyzed_cache)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0mtime_end\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'analyze_text' is not defined" + ] + } + ], + "source": [ + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "# text = [\"Last week I went to the theatre. I had a very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "# text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. 
At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "import time\n", + "time_start=time.time()\n", + "text = [\"I hate you.\"]\n", + "#text =[\"Last week I go to the zoo. I had a very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. I got very angry.\"]#因为外面有中括号,所以是二维的\n", + "analyze_text(text, show_firstk_probs=100)\n", + "#print(analyzed_cache)\n", + "time_end=time.time()\n", + "print('time cost',time_end-time_start,'s')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "01/03/2019 17:10:45 - INFO - examples.extract_features - tokens: [CLS] the trophy doesn ' t fit into the brown suitcase because the [MASK] is too large . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 2 | . 1 | ) 1 | the 1 | , 1 | \" \n", + " 100 | the \t*100 | the 0 | his 0 | a 0 | its 0 | her \n", + " 97 | trophy \t* 97 | trophy 0 | cup 0 | prize 0 | trophies 0 | competition \n", + " 100 | doesn \t*100 | doesn 0 | can 0 | does 0 | won 0 | didn \n", + " 100 | ' \t*100 | ' 0 | t 0 | \" 0 | = 0 | ` \n", + " 100 | t \t*100 | t 0 | not 0 | s 0 | n 0 | to \n", + " 100 | fit \t*100 | fit 0 | fits 0 | sit 0 | get 0 | fitting \n", + " 100 | into \t*100 | into 0 | in 0 | inside 0 | onto 0 | within \n", + " 100 | the \t*100 | the 0 | her 0 | his 0 | a 0 | my \n", + " 100 | brown \t*100 | brown 0 | black 0 | green 0 | blue 0 | plastic \n", + " 95 | suitcase \t* 95 | suitcase 3 | bag 1 | luggage 0 | backpack 0 | trunk \n", + " 100 | because \t*100 | because 0 | as 0 | since 0 | due 0 | . 
\n", + " 100 | the \t*100 | the 0 | its 0 | his 0 | it 0 | her \n", + " 0 | [MASK] \t 21 | suitcase 19 | bag 6 | box 2 | luggage 2 | case \n", + " 99 | is \t* 99 | is 1 | was 0 | being 0 | has 0 | it \n", + " 100 | too \t*100 | too 0 | very 0 | extra 0 | overly 0 | more \n", + " 87 | large \t* 87 | large 11 | big 1 | small 1 | huge 0 | larger \n", + " 100 | . \t*100 | . 0 | ; 0 | , 0 | ! 0 | ' \n", + " 0 | [SEP] \t 35 | . 8 | ) 5 | , 4 | ( 3 | it \n" + ] + } + ], + "source": [ + "text = [\"The trophy doesn't fit into the brown suitcase because the _ is too large.\"]\n", + "# text = [\"Mary beat John in the match because _ was very strong.\"]\n", + "features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)\n", + "input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)\n", + "input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)\n", + "mlm_logits, _ = model(input_ids, input_type_ids)\n", + "mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "tokens = features[0].tokens\n", + "top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.',\n", + " 'Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.']" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. 
Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " \"Mary came before/after John. _ was late/early .\",\n", + " # yes / no\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . \",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. 
_ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + " if a not in tokenizer.vocab:\n", + " ce\n", + " print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2, 'fit 
into:large/small', False),\n", + " (4, 'thank:receive/give', False),\n", + " (6, 'call:successful available', True),\n", + " (8, 'ask:repeat answer', False),\n", + " (10, 'zoom by:fast/slow', False),\n", + " (12, 'vindicated/crushed:be the winner', False),\n", + " (14, 'lift:weak heavy', False),\n", + " (16, 'crash through:[hard]/[soft]', False),\n", + " (18, '[block]:short/tall', False),\n", + " (20, 'down to:top/bottom', False),\n", + " (22, 'beat:good/bad', False),\n", + " (24, 'roll off:anchored level', False),\n", + " (26, 'above/below', False),\n", + " (28, 'better/worse:study hard', False),\n", + " (30, 'after/before:far away', False),\n", + " (32, 'be upset with:buy from not work/sell not work', True),\n", + " (34, '?yell at comfort:upset', False),\n", + " (36, 'above/below:moved first', False),\n", + " (38, 'although/because', False),\n", + " (40, 'bully:punish rescue', False),\n", + " (42, 'pour:empty/full', False),\n", + " (44, 'know:nosy indiscreet', False),\n", + " (46, 'explain:convince/understand', True),\n", + " (48, '?know tell:so/because', True),\n", + " (50, 'beat:younger/older', False),\n", + " (56, 'clog:cleaned removed', True),\n", + " (58, '?immediately follow:short delayed', False),\n", + " (60, '?between:see see around', True),\n", + " (64, 'but/and', False),\n", + " (66, 'clean:put in the trash put in the drawer', False),\n", + " (68, 'because/but', False),\n", + " (70, 'out of:handy lighter', False),\n", + " (72, 'put:tall high', False),\n", + " (74, 'show:good famous', True),\n", + " (76, 'pay for:generous grateful', False),\n", + " (78, 'but', False),\n", + " (80, 'if', False),\n", + " (82, 'if', False),\n", + " (84, 'fool:get/lose', False),\n", + " (88, 'wait:impatient cautious', False),\n", + " (90, 'give birth:woman baby', True),\n", + " (92, '?stop normal/stop abnormal:strange', False),\n", + " (96, 'eat:hungry tasty', False),\n", + " (98, 'put ... into filled with ... 
:get in/get out', False),\n", + " (100, 'up:at the bottom/at the top', False),\n", + " (102, 'crash through:removed repaired', False),\n", + " (104, 'stab:taken to the police station taken to the hospital', False),\n", + " (106, 'hear ... humming and whistling:annoyed/annoying', True),\n", + " (108, 'see ... juggling watermelons:impressed/impressive', True),\n", + " (114, 'tell lies: truthful skeptical', True),\n", + " (130, 'but:disappointed', True),\n", + " (132, 'visit:invite come out/invite come in', True),\n", + " (134, 'take classes from:eager known to speak it fluently', False),\n", + " (138, 'cover:out gone', True),\n", + " (144, 'tuck:work sleep', True),\n", + " (150, 'influence:later/earlier', False),\n", + " (152, 'can not cut:thick small', False),\n", + " (154, 'attack:kill guard', False),\n", + " (156, 'attack:bold nervous', False),\n", + " (160, 'change:hard:easy', False),\n", + " (166, 'alive:is/was', False),\n", + " (168, 'infant:twelve years old twelve months old', False),\n", + " (170, 'better equipped and large:defeated/victorious', False),\n", + " (178, 'interview:persistent cooperative', False),\n", + " (186, 'be full of:minority/majority', False),\n", + " (188, 'like over:more/fewer', False),\n", + " (190, 'place on all:not enough/too many', True),\n", + " (192, 'stick:leave have', True),\n", + " (196, 'follow:admire/influence', True),\n", + " (198, 'fit through:wide/narrow', False),\n", + " (200, 'trade:dowdy/great', False),\n", + " (202, 'hire/hire oneself to:take care of', True),\n", + " (204, 'promise/order', False),\n", + " (208, 'mother:education place', True),\n", + " (210, 'knock:get an answer/answer', True),\n", + " (212, 'pay:receive/deliver', False),\n", + " (218, '?', False),\n", + " (220, 'say check:move take', False),\n", + " (222, '?', False),\n", + " (224, 'give a life:drive alone walk', False),\n", + " (226, 'pass the plate:full/hungry', False),\n", + " (228, 'pass:turn over turn next', False),\n", + " (232, 'stretch pat', 
True),\n", + " (234, 'accept share', False),\n", + " (236, 'speak:break silence break concentration', False),\n", + " (240, 'carry:leg ache leg dangle', True),\n", + " (242, 'carry:in arms in bassinet', False),\n", + " (244, 'hold:against chest against will', True),\n", + " (250, 'stop', False),\n", + " (252, 'even though/because/not', False),\n", + " (255, 'give:not hungry/hungry', False),\n", + " (259, 'ask for a favor:refuse/be refused`', False),\n", + " (261, 'cede:less popular/more popular', False),\n", + " (263, 'not pass although:see open/open', True),\n", + " (271, 'suspect regret', True)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, 
heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'BertSelfAttention' object has no attribute 'attention_probs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m raise 
AttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0;32m--> 518\u001b[0;31m type(self).__name__, name))\n\u001b[0m\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'BertSelfAttention' object has no attribute 'attention_probs'" + ] + } + ], + "source": [ + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git "a/likunlin-\344\274\230\345\214\226.ipynb" "b/likunlin-\344\274\230\345\214\226.ipynb" new file mode 100644 index 00000000000000..f5932b2e393d71 --- /dev/null +++ "b/likunlin-\344\274\230\345\214\226.ipynb" @@ -0,0 +1,4690 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "from IPython.core.interactiveshell import InteractiveShell\n", + "InteractiveShell.ast_node_interactivity = 'all'" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import nltk\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n", + "03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/\n", + "03/21/2019 18:04:54 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " 
\"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case:在标记化时将文本转换为小写。默认= True\n", + "#tokenizer.tokenize = nltk.word_tokenize\n", + "model = BertForPreTraining.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BertForPreTraining:\n", + "Outputs:\n", + " if `masked_lm_labels` and `next_sentence_label` are not `None`:\n", + " Outputs the total_loss which is the sum of the masked language modeling loss and the next\n", + " sentence classification loss.\n", + " if `masked_lm_labels` or `next_sentence_label` is `None`:\n", + " Outputs a tuple comprising\n", + " - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and\n", + " - the next sentence classification logits of shape [batch_size, 2]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from_pretrained:\n", + "Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.\n", + "Download and cache the pre-trained model file if needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import re\n", + "def convert_text_to_examples(text): \n", + " '''功能:\n", + " 把输入的文本变成一个实例,一个实例中包含text_a,text_b(text_b用于是否为上下句的任务,该任务不使用此功能)\n", + " 输入:\n", + " text:一个列表结构,列表中包含原始文本字符串,由于仅完成mlm任务,所以text列表中仅包含一个字符串,就是待检查的字符串\n", + " 输出:\n", + " example:实例,其中包含:\n", + " unique_id:此任务仅用到0\n", + " text_a:text列表内的字符串\n", + " text_b:此任务下该变量为None\n", + " '''\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'\n", + " \n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.\n", + " text_b = m.group(2) #匹配的第二句,比如I love you.\n", + " \n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "#print(convert_text_to_examples(['I love you. 
The cat is so cute.'])[0].text_a)\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " '''功能:\n", + " 把实例变成一个特征列表\n", + " 输入:\n", + " examples:实例,convert_text_to_examples()函数的输出\n", + " tokenizer:BERT的tokenizer,用于将文本进行各种处理,它可以把一个text转变成tokens,把tokens变成每个token在词典中的编号以及逆运算\n", + " append_special_tokens:是否允许在生成的tokens中加入特殊符号,也就是[CLS]、[MASK]和[SEP],默认为True\n", + " replace_mask:不明\n", + " print_info:不明\n", + " 输出:\n", + " features:每一个feature包含:\n", + " unique_id:编号,目前实现的功能features里面仅有一个feature\n", + " tokens=tokens,tokens:是形如['i','love','you','.']的一个列表\n", + " input_ids=input_ids:字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask:一堆1\n", + " input_type_ids=input_type_ids)):对text_a,text_b的区分,用于上下句任务,对于本任务,该参数为一个列表,其中包含token长度个的0\n", + " '''\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a) #tokenize的作用是把\"i love you.\"变成['i','love','you','.']\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " input_type_ids = [] #segment embedding\n", + " if append_special_tokens: #输入参数中默认为true\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号\n", + " input_mask = [1] * len(input_ids) 
\n", + " \n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,#编号,目前实现的功能features里面仅有一个feature\n", + " tokens=tokens,#形如['i','love','you','.']的一个列表\n", + " input_ids=input_ids,#字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask, #一堆1\n", + " input_type_ids=input_type_ids)) #第0类和第1类,对text_a,text_b的区分,本代码中全都是零\n", + " return features \n", + "\n", + "def copy_and_mask_feature(feature, step, masked_tokens=None): \n", + " '''\n", + " 功能:\n", + " 输入feature生成训练的批次数以及mask好的训练素材\n", + " 输入:\n", + " feature:convert_examples_to_features函数的输出\n", + " step:两个[mask]位置的步长\n", + " masked_tokens:默认为None,在程序中没有使用\n", + " '''\n", + " import copy\n", + " tokens = feature.tokens\n", + " len_token = len(tokens)\n", + " if len_token 0\n", + " masked_feature_copies = []\n", + " for i in batches: #用[mask]依次掩盖每一个位置\n", + " feature_copy = copy.deepcopy(feature)\n", + " masked_pos = i\n", + " while masked_pos < len_token:\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_pos = masked_pos + step\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, batches\n", + "\n", + "#masked_feature_copies, batches = copy_and_mask_feature(features[0],3)\n", + "#print(masked_feature_copies[0].input_ids) #结果[101, 1045, 2293, 103, 102]\n", + "#print(batches) #结果是一个range(0,5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'print(input_ids_sen)\\nprint(in_sentence)\\nprint(input_type_ids_sen)\\nprint(sentences)\\nprint(entire_ids)\\nprint(entire_type_ids)\\n#input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = None'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "analyzed_cache = {}\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG\n", + "#print (lemma('gave'))\n", + "#print (lexeme('production'))\n", + "#print (conjugate(verb='give',tense=PRESENT,number=SG))\n", + "def process_text(text): \n", + " '''\n", + " 功能:\n", + " 处理输入文本,将文本按句子分成若干token,得出原来text中index位置的单词在x句子的y位置,还得出各个句子类别码\n", + " 输入:\n", + " text:文本字符串,注意区别\n", + " 输出:\n", + " input_ids_sen:二维列表,第一维列表的元素是每个句子的input_ids列表\n", + " input_type_ids_sen:二维列表,第一维列表的元素是每个句子的input_type_ids列表\n", + " in_sentence:通过这个二维数组可以很方便的通过在完整text中的下标找到这个下标所在的句子和在句子中的下标\n", + " sentences:字符串列表,列表中每一个元素是一个句子字符串\n", + " entire_ids:整个text的input_ids\n", + " entire_type_ids:整个text的input_type_ids\n", + " '''\n", + " token =[]\n", + " entire_type_ids = []\n", + " token0 = tokenizer.tokenize(text)\n", + " token.append('[CLS]')\n", + " entire_type_ids.append(0)\n", + " for i in token0:\n", + " token.append(i)\n", + " entire_type_ids.append(0)\n", + " token.append('[SEP]')\n", + " entire_type_ids.append(0)\n", + " \n", + " entire_ids = tokenizer.convert_tokens_to_ids(token)\n", + " in_sentence = [[0,0]] \n", + " sentence_n = 0\n", + " index = 1\n", + " for i in range(1,len(token)-1):\n", + " in_sentence.append([sentence_n,index]) #每个token中的词在所在句中的位置表示出来,以及该位置在哪一句中\n", + " index = index + 1 #比如,位置i这个词在第sentence句的index位置上\n", + " if token[i] == '.':\n", + " sentence_n = sentence_n + 1\n", + " index = 1\n", + " sentences = text.split(\".\")\n", + " \n", + " sen_token = []\n", + " input_ids_sen = []\n", + " input_type_ids_sen = []\n", 
+ " for i,sentence in enumerate(sentences):\n", + " sentence = sentence + '.'\n", + " sentences[i] = sentences[i] + '.'\n", + " token = []\n", + " input_type_ids = []\n", + " tokens = tokenizer.tokenize(sentence)\n", + " token.append('[CLS]')\n", + " input_type_ids.append(0) \n", + " for i in tokens:\n", + " token.append(i)\n", + " input_type_ids.append(0) \n", + " token.append('[SEP]') \n", + " input_type_ids.append(0)\n", + " input_ids_sen.append(tokenizer.convert_tokens_to_ids(token))\n", + " input_type_ids_sen.append(input_type_ids)\n", + " return input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids\n", + "#text = [\"Last week I went to the theatre. I had an very good a seat.The play were very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "#text = [\"Last week I went to the theatre. I had very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "#text = [\"The question is more easy than that one.\"]\n", + "text = [\"Last week I went to the theater. There are many person . Luckily I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. 
I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "\n", + "input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = process_text(text[0])\n", + "'''print(input_ids_sen)\n", + "print(in_sentence)\n", + "print(input_type_ids_sen)\n", + "print(sentences)\n", + "print(entire_ids)\n", + "print(entire_type_ids)\n", + "#input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = None'''" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def get_word(index):\n", + " '''\n", + " 输入:\n", + " index:在完整text中的位置\n", + " 输出\n", + " word:该位置上的单词\n", + " '''\n", + " word_id = entire_ids[index]\n", + " word = tokenizer.ids_to_tokens[word_id]\n", + " return word\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import copy\n", + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "def give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold):\n", + " '''\n", + " 功能:\n", + " 给出指定文本指定位置的推荐用词\n", + " 输入:\n", + " input_ids_:要分析的文本的input_ids\n", + " input_type_ids_:要分析的文本的的input_type_ids\n", + " id_in_sen:要分析的文本中[MASK]的位置下标,也就是需要给出建议用词的位置\n", + " alternative_word:推荐的备选词范围\n", + " threshold:阈值\n", + " 输出:\n", + " suggestion:推荐\n", + " need:推荐的是否是备选词中的词\n", + " suggestion_prob:推荐词填在id_in_sen位置的概率\n", + " top_of_alternative:备选词中最值得推荐的词\n", + " '''\n", + " input_ids = copy.deepcopy(input_ids_)\n", + " input_type_ids = copy.deepcopy(input_type_ids_)\n", + " word0 = input_ids[id_in_sen]\n", + " word0 = tokenizer.ids_to_tokens[word0]\n", + " list_word_id = []\n", 
+ " \n", + " input_ids[id_in_sen] = tokenizer.vocab[\"[MASK]\"]\n", + " T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_ids = T_input_ids.to(device) #拿去GPU\n", + " T_input_type_ids = T_input_type_ids.to(device)\n", + "\n", + " mlm_logits, _ = model(T_input_ids, T_input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + " reduced_mlm_probs = mlm_probs[0][id_in_sen]\n", + "\n", + " top_ind = reduced_mlm_probs.argmax().item()\n", + " top_prob = reduced_mlm_probs.max().item() \n", + " \n", + " list_word = []\n", + " \n", + " top_of_alternative = None\n", + " if len(alternative_word)>0:\n", + " list_word_prob = {}\n", + " for word in alternative_word:\n", + " try:\n", + " list_word_id.append(tokenizer.vocab[word])\n", + " list_word.append(word)\n", + " except KeyError:\n", + " pass\n", + " #print(list_word_id) \n", + " #print(list_word)\n", + " for word,word_id in zip(list_word,list_word_id):\n", + " list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)})\n", + " prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True)\n", + " #print(prob_ord)\n", + " #print(tokenizer.ids_to_tokens[top_ind],top_prob)\n", + " #print(prob_ord[0][0],prob_ord[0][1])\n", + " top_prob_word = prob_ord[0][1]\n", + " top_of_alternative = prob_ord[0][0]\n", + " gap = math.log(top_prob) - math.log(top_prob_word)\n", + " if gap < threshold:\n", + " suggestion = prob_ord[0][0]\n", + " suggestion_prob = prob_ord[0][1]\n", + " need = 1\n", + " else:\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " suggestion_prob = top_prob\n", + " need = 0\n", + " #print(\"gap = \" + str(gap))\n", + " #print(prob_ord)\n", + " else:\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " suggestion_prob = top_prob\n", + " need = 0\n", + " \n", + " return 
suggestion,need,suggestion_prob,top_of_alternative \n", + "\n", + "#返回变量5\n", + "#suggestion -> 最值得推荐的词\n", + "#need -> 是否需要可选词中的一个\n", + "#suggestion_prob ->最值得推荐的词的概率\n", + "#top_of_alternative -> 可选词中最值得推荐的\n", + "#suggestion,need,suggestion_prob,top_of_alternative = give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from spacy.lemmatizer import Lemmatizer\n", + "from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES\n", + "from pattern.en import comparative, superlative\n", + "from pattern.en import suggest\n", + "from nltk.stem.lancaster import LancasterStemmer\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import SnowballStemmer\n", + "import enchant\n", + "d = enchant.Dict(\"en_US\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n", + "totally time cost 0.2184145450592041 s\n" + ] + } + ], + "source": [ + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "#分情况讨论,如果新词比旧的词长,或者是短\n", + "def word_convert(word,new_word,Stemmer):\n", + " '''\n", + " 功能:\n", + " 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically\n", + " 输入:\n", + " word:需要变形的词\n", + " new_word:猜想的变形\n", + " 输出:\n", + " suggest_word:推荐的正确变形\n", + " '''\n", + " suggest_word = None\n", + " word_stem = Stemmer().stem(word)\n", + " suggest_ = new_word\n", + " \n", + " suggest_list = suggest(suggest_)\n", + "\n", + " if len(word) 0.95):# or word_[1] > 0.95 :\n", + " suggest_word = word_[0]\n", + " break \n", + " if word_[1] < 0.001:\n", + " break\n", + " stem_list = []\n", + " for stemmer in stemmers:\n", + " suggest_stem = 
stemmer.stem(word_[0])\n", + " if flag == 1 and suggest_stem[:-1] in word_stem and word_stem[:3] in suggest_stem[:3]: #一般是去后缀\n", + " suggest_word = word_[0]\n", + " break\n", + " elif flag == 0 and word_stem in suggest_stem and word_[0][-1:] in suggest_[-1:]: #一般是加后缀,后缀一定要一样\n", + " suggest_word = word_[0]\n", + " break\n", + " \n", + " if suggest_word != None:\n", + " break\n", + " return suggest_word \n", + "\n", + "import time\n", + "time_start=time.time()\n", + "for i in range(1):\n", + " print(word_convert(\"dark\",\"darkment\",PorterStemmer))\n", + "time_end=time.time()\n", + "print('totally time cost',time_end-time_start,'s')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "beaus\n", + "totally time cost 0.0006299018859863281 s\n" + ] + } + ], + "source": [ + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "def word_convert(word,new_word,Stemmer):\n", + " '''\n", + " 说明;\n", + " 与上面的区别是使用的拼写改错算法不同,上面那个平均速度慢,但更符合我的要求,这个平均速度更快\n", + " 功能:\n", + " 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically\n", + " 输入:\n", + " word:需要变形的词\n", + " new_word:猜想的变形\n", + " Stemmer:词根提取器\n", + " 输出:\n", + " suggest_word:推荐的正确变形\n", + " '''\n", + " if d.check(new_word)==True: #如果发现new_word拼写正确,则直接返回\n", + " return new_word\n", + " else:\n", + " suggest_word = None\n", + " word_stem = Stemmer().stem(word)\n", + " suggest_ = new_word\n", + " suggest_list = d.suggest(suggest_) #可能的正确单词列表\n", + "\n", + " if len(word)death,success->succeed无能为力" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "successfully\n", + "basic\n" + ] + } + ], + "source": [ + "\n", + "def 
adj_to_adv(word):\n", + " suggest_word = None\n", + " if(word == \"good\"):\n", + " return \"well\"\n", + " else:\n", + " suggest_ = word + 'ly'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " return suggest_word\n", + "#如果形容词副词同形,那么他会返回none,但是不影响计算,因为形容词副词同形啊\n", + "print(adj_to_adv(\"successful\"))\n", + "\n", + "def adv_to_adj(word):\n", + " suggest_word = None\n", + " if(word == \"well\"):\n", + " return \"good\" \n", + " elif word[-2:] == 'ly':\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " return suggest_word\n", + "print(adv_to_adj(\"basically\"))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['interested', 'interest']\n" + ] + } + ], + "source": [ + "def adj_to_anything(word):#形容词变成其他词性\n", + " suggest_word = None\n", + " suggest_list = []\n", + " if word[-1:] == 'y': #举例 healthy->health\n", + " suggest_ = word[:-1]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ful':#举例 successful->success\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ive': #举例 active -> act\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ed': #举例 interested->interest->interesting\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " suggest_ = suggest_ + 'ing'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != 
None:\n", + " suggest_list.append(suggest_word) \n", + " \n", + " elif word[-3:] == 'ing':#举例 interesting->interest->interested\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " suggest_ = suggest_ + 'ed'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " \n", + " elif word[-4:] == 'less': #举例 careless -> care\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ly': #举例: friendly -> friend , lovely -> love\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " \n", + " elif word[-1:] == 't': #举例 different -> difference\n", + " suggest_ = word[:-1]\n", + " suggest_ = suggest_ + 'ce'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ous': #举例 dangerous -> danger\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'al': #举例 original -> origin\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-4:] == 'able':\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'en': #举例 woolen -> wool\n", + " suggest_ = word[:-2]\n", + " suggest_word = 
word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ic': \n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " elif word[-3:] == 'ish':\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'and'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer) \n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ese':\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'a'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer) \n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ian':\n", + " suggest_ = word[:-1]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'y'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " if suggest_word == None:\n", + " HouZhui_list = ['ment','ness','tion','ture','sion','ty','y','tive','sive']\n", + " for HouZhui in HouZhui_list:\n", + " suggest_ = word + HouZhui\n", + " new_word = word_convert(word,suggest_,PorterStemmer)\n", + " if new_word != None:\n", + " suggest_word = new_word\n", + " suggest_list.append(suggest_word)\n", + " suggest_list = list(set(suggest_list)) \n", + " return suggest_list\n", + "\n", + "print(adj_to_anything('interesting'))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'import 
time\\ntime_start=time.time()\\nprint(N_to_anything(\"success\"))\\ntime_end=time.time()\\nprint(\\'time cost\\',time_end-time_start,\\'s\\')'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def N_to_anything(word):#名词变成其他词性\n", + " suggest_list = []\n", + " list_HouZhui = ['y','ful','tive','sive','ed','ing','less','ly','ous','al','able','en','tic','ish','ance','er','or']\n", + " list_QianZhui = ['a']\n", + " if word[-4:] in ['ment','ness','tion','ture','sion','tive','sive']:\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " else:\n", + " for HouZhui in list_HouZhui:\n", + " suggest_ = word + HouZhui\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " for QianZhui in list_QianZhui:\n", + " suggest_ = QianZhui + word\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " if word[-2:] == 'ce':\n", + " suggest_ = word[:-2]\n", + " suggest_ = suggest_ + 't'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " elif word[-4:] == 'land':\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = suggest_ + 'lish'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " #print(suggest_list)\n", + " suggest_list = list(set(suggest_list))\n", + " return suggest_list\n", + "'''import time\n", + "time_start=time.time()\n", + "print(N_to_anything(\"success\"))\n", + "time_end=time.time()\n", + "print('time 
cost',time_end-time_start,'s')'''" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['succeeder', 'succeeds', 'succeeded']\n", + "time cost 0.654491662979126 s\n" + ] + } + ], + "source": [ + "def V_to_anything(word):#动词变成其他词性\n", + " suggest_word = None\n", + " suggest_list = []\n", + "\n", + " HouZhui_list = ['ful','tive','sive','ed','less','ly','ous','al','able','en','tic','ish','ance','tion','sion','ment','er','or','ee']\n", + " for HouZhui in HouZhui_list:\n", + " suggest_ = word + HouZhui\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + "\n", + " suggest_list = list(set(suggest_list))\n", + " return suggest_list\n", + "\n", + "time_start=time.time()\n", + "print(V_to_anything('succeed'))\n", + "time_end=time.time()\n", + "print('time cost',time_end-time_start,'s') " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n 功能:\\n 生成形容词,副词关联词表\\n 输入:\\n word:形容词/副词\\n 输出:\\n list_word:为没有添加词的其他形式,包括三音节以下词的比较级最高级\\n list_word2:为三音节及以上的词的比较级最高级,如果输入形容词比较级最高级没有more/most,该列表为空\\n 说明:\\n 由于三音节形容词/副词的比较级,最高级为more/most+原形容词/副词,所以特别把形容词/副词和其他词性变形区分出来\\n'" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(['difficult', 'difficulty', 'difficultly'], ['more difficult', 'most difficult'])\n", + "(['early', 'ear', 'earliest', 'earlier'], [])\n" + ] + } + ], + "source": [ + "'''\n", + " 功能:\n", + " 生成形容词,副词关联词表\n", + " 输入:\n", + " word:形容词/副词\n", + " 输出:\n", + " list_word:为没有添加词的其他形式,包括三音节以下词的比较级最高级\n", + " list_word2:为三音节及以上的词的比较级最高级,如果输入形容词比较级最高级没有more/most,该列表为空\n", + " 说明:\n", + " 由于三音节形容词/副词的比较级,最高级为more/most+原形容词/副词,所以特别把形容词/副词和其他词性变形区分出来\n", + "'''\n", + "\n", + "def 
build_like_word_adj(word): #创建类似形容词列表\n", + " list_word = []\n", + " list_word2 = [] #把比较级最高级带more的放在这里\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " word_er = comparative(i)\n", + " if \"more\" in word_er: #把比较级带more,most的词放在另一个列表list_word2\n", + " list_word2.append(word_er)\n", + " else:\n", + " list_word.append(word_er)\n", + " word_est = superlative(i)\n", + " if \"most\" in word_est:\n", + " list_word2.append(word_est)\n", + " else:\n", + " list_word.append(word_est)\n", + " word_adv = adj_to_adv(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " list_N = adj_to_anything(word)\n", + " for N in list_N:\n", + " list_word.append(N)\n", + " \n", + " list_word = list(set(list_word))\n", + " return list_word,list_word2\n", + "\n", + "def build_like_word_adv(word): #创建类似形容词列表\n", + " list_word = []\n", + " list_word2 = []\n", + " list_special = ['however','seldom','often','never','otherwise']\n", + " if word in list_special:\n", + " list_word = [word]\n", + " list_word2 = []\n", + " else:\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " word_er = comparative(i)\n", + " if \"more\" in word_er:\n", + " list_word2.append(word_er)\n", + " else:\n", + " list_word.append(word_er)\n", + " word_est = superlative(i)\n", + " if \"most\" in word_est:\n", + " list_word2.append(word_est)\n", + " else:\n", + " list_word.append(word_est)\n", + " word_adv = adv_to_adj(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " list_word = list(set(list_word))\n", + " return list_word,list_word2\n", + "\n", + "\n", + "\n", + "print(build_like_word_adj(\"difficult\"))\n", + "print(build_like_word_adv(\"early\"))\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n 功能:\\n 
根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置\\n pre_training_input_in_sentence得到检查位置所在句子的信息\\n pre_training_input_entire得到检查位置所在句子的信息\\n 输入:\\n index:在完整text中的位置\\n 输出:\\n input_ids:\\n input_type_ids:\\n id_in_sen:检查位置在句子中的下标\\n index:检查位置在完整text中的下标,其实就是输入的下标\\n'" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the\n", + "[101, 2197, 2733, 1045, 2253, 2000, 1996, 4258, 1012, 102]\n", + "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", + "6\n" + ] + } + ], + "source": [ + "'''\n", + " 功能:\n", + " 根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置\n", + " pre_training_input_in_sentence得到检查位置所在句子的信息\n", + " pre_training_input_entire得到检查位置所在句子的信息\n", + " 输入:\n", + " index:在完整text中的位置\n", + " 输出:\n", + " input_ids:\n", + " input_type_ids:\n", + " id_in_sen:检查位置在句子中的下标\n", + " index:检查位置在完整text中的下标,其实就是输入的下标\n", + "'''\n", + "def pre_training_input_in_sentence(index): \n", + " sentence_id = in_sentence[index][0]\n", + " id_in_sen = in_sentence[index][1]\n", + " word = input_ids_sen[sentence_id][id_in_sen]\n", + " word = tokenizer.ids_to_tokens[word]\n", + " input_ids = copy.deepcopy(input_ids_sen[sentence_id])\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "\n", + " return word,input_ids,input_type_ids,id_in_sen\n", + "\n", + "def pre_training_input_entire(index): \n", + " word = entire_ids[index]\n", + " word = tokenizer.ids_to_tokens[word]\n", + " input_ids = copy.deepcopy(entire_ids)\n", + " input_type_ids = copy.deepcopy(entire_type_ids)\n", + "\n", + " return word,input_ids,input_type_ids,index\n", + "\n", + "word,input_ids,input_type_ids,index = pre_training_input_in_sentence(6)\n", + "print(word)\n", + "print(input_ids)\n", + "print(input_type_ids)\n", + "print(index)\n", + "#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 102]\n", + "#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 1045, 2018, 1037, 
2200, 2204, 2835, 1012, 1996, 2377, 2001, 2200, 5875, 1012, 102]" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "分析各种词性系列函数\n", + " 功能:对第一遍检查得出的有问题的位置的单词,根据不同的词性进行不同步骤的分析\n", + " 输入:\n", + " index:在原文中的错误位置\n", + " 输出:\n", + " 给出的修改建议,修改建议不局限于错误位置" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'week'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import copy\n", + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "def analyse_V(index):\n", + "#这是一个处理动词语法问题的函数,输入为问题词在text的token中的下标index\n", + " need_to_will = need_be = 0\n", + " list_be = lexeme('be')\n", + " list_be = lexeme('be')[:8]\n", + " #**************************************判断是不是动词其他形式************************\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " if wordV in list_be:\n", + " list_word = list_be\n", + " else:\n", + " list_word = lexeme(wordV)\n", + " list_others = V_to_anything(conjugate(verb=wordV,tense=PRESENT,person = 1))\n", + " for other in list_others:\n", + " list_word.append(other)\n", + " #print(\"list_word = \",list_word)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " suggestion0,need,_,_= give_suggestion(input_ids,input_type_ids,index,list_word,5)\n", + " if need == 1 and suggestion0 != wordV:\n", + " return suggestion0\n", + " \n", + " else:#**************************************判断是不是缺介词***************************\n", + " wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen + 1,tokenizer.vocab['at'])#就随便插入一个东西,占位子\n", + " input_type_ids.append(0)\n", + " list_IN = 
[\"at\",\"in\",\"on\",\"by\",\"for\",\"from\",\"with\",\"about\",\"against\",\"along\",\"among\",\"around\",\"as\",\"before\",\"behind\",\"below\",\"beside\",\"between\",\"during\",\"besides\",\"into\",\"near\",\"over\",\"through\",\"under\",\"without\",\"after\",\"above\",\"of\"]\n", + " suggestion4,need_IN,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_IN,1)\n", + " if need_IN == 1:\n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[suggestion4]\n", + " list_word = lexeme(wordV)\n", + " suggestion44,need,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,3)\n", + " if need == 1:\n", + " suggestion = suggestion44 + ' ' +suggestion4\n", + " return suggestion\n", + " #**************************************判断是不是不定式或者将来时*************************** \n", + " #print(\"是否用不定式或将来时\")\n", + " wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen,tokenizer.vocab['to'])#就随便插入一个东西,占位子\n", + " input_type_ids.append(0)\n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)]\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " suggestion_to_will,need_to_will,prob0,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,[\"to\",\"will\"],1)\n", + " if need_to_will == 1:\n", + " list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1),conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]\n", + " suggestion,need0,_,prob00= give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_word,5) \n", + " \n", + " #**********************************判断是不是被动语态或者进行时******************* \n", + "\n", + " #********************是不是被动语态**************** \n", + " #print(\"是不是被动语态\")\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab['be'])#就随便插入一个东西,占位子\n", + " try:\n", + " input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)]\n", + " 
input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " suggestion1,need_be1,prob1,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)\n", + " except KeyError:\n", + " need_be1 = 0\n", + " #********************是不是现在分词**************** \n", + " #print(\"是不是进行时\")\n", + " try:\n", + " input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]\n", + " suggestion2,need_be2,prob2,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " except KeyError:\n", + " need_be2 = 0\n", + " \n", + " #if need_be1 == 1 or need_be2 == 1:\n", + " #print(\"需要be\")\n", + " #***************************选择是不定式还是被动语态还是进行时****************************\n", + " prob_max = 0\n", + " if need_to_will == 1:\n", + " prob_max = max(prob_max,prob0)\n", + " if need_be1 == 1:\n", + " prob_max = max(prob_max,prob1)\n", + " if need_be2 == 1:\n", + " prob_max = max(prob_max,prob2)\n", + "\n", + " if need_to_will == 1 and prob_max == prob0:\n", + " need_be = 0\n", + " if need_be1 == 1 and prob_max == prob1:\n", + " need_to_will = 0\n", + " need_be = 1\n", + " be_ = suggestion1\n", + " if need_be2 == 1 and prob_max == prob2:\n", + " need_to_will = 0\n", + " need_be = 1\n", + " be_ = suggestion2\n", + " #*************************************************处理各种语法******************************************************************\n", + " if need_to_will == 1:\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab[suggestion_to_will])\n", + " input_type_ids.append(0)\n", + " list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1),conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]\n", + " suggestion,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5)\n", + " return 'to ' + suggestion\n", + "\n", + " elif need_be == 1:\n", + " 
#********************************被动语态或者进行时*****************\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab[be_])\n", + " input_type_ids.append(0)\n", + " list_word = lexeme(wordV)\n", + " suggestion,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5)\n", + " suggestion = be_ + ' '+ suggestion\n", + " else:\n", + " #*****************************************判断该位置是不是动词的其他时态**************************************************************\n", + " suggestion = suggestion0\n", + "\n", + " return suggestion\n", + " \n", + "analyse_V(2)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "was unwilling\n" + ] + } + ], + "source": [ + "def analyse_adj(index):\n", + " #这是一个处理形容词语法问题的函数,输入为问题词在text的token中的下标index\n", + " wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_entire(index) \n", + " list_word,list_word2 = build_like_word_adj(wordADJ)\n", + " #print(list_word)\n", + " suggestion0,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,5)\n", + " \n", + " if need_adj == 1 and suggestion0 != wordADJ:#判断是不是形容词其他变形\n", + " return suggestion0\n", + " elif get_word(index - 1) in ['more','most'] and len(list_word2) == 0:\n", + " #判断是不是比较级使用错误,如果该形容词比较级/最高级不需要加more/most,但是前面有more/most\n", + " wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) \n", + " del input_ids[id_in_sen - 1]\n", + " del input_type_ids[0]\n", + " suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,6)\n", + " return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3\n", + " elif get_word(index + 1) in ['##er','##est','##r','##st'] and len(list_word2) != 0:\n", + " #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est\n", + " 
wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) \n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[wordADJ]\n", + " suggestion4,need_bijiao,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,['more','most'],2)\n", + " if need_bijiao == 1:\n", + " input_ids[id_in_sen] = tokenizer.vocab[suggestion4]\n", + " suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,6)\n", + " return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ suggestion4 + ' ' + suggestion5 \n", + " else:#检查形容词前面是否需要加冠词或者是需要more,most的比较级,最高级抑或是be动词\n", + " #print(\"缺冠词或者没用比较级\")\n", + " wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) \n", + " input_ids.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " list_DT = ['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] \n", + " suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)\n", + " if need_DT == 1:\n", + " wordADJ,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab[suggestion])\n", + " input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,6) \n", + " return suggestion + ' ' + suggestion2\n", + " else:\n", + " return suggestion0\n", + "print(analyse_adj(78))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ",\n" + ] + } + ], + "source": [ + "def analyse_adv(index):\n", + " #这是一个处理形容词语法问题的函数,输入为问题词在text的token中的下标index\n", + " need_DT = 0\n", + " need_douhao = 0\n", + " wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_entire(index)\n", + " list_word,list_word2 = 
build_like_word_adv(wordADV)\n", + " suggestion0,need_adv,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,3.5)\n", + " if need_adv == 1 and suggestion0 != wordADV:\n", + " return suggestion0\n", + " elif get_word(index - 1) in ['more','most'] and len(list_word2) == 0:\n", + " #判断是不是比较级使用错误,如果该形容词比较级/最高级不需要加more/most,但是前面有more/most\n", + " wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) \n", + " del input_ids[id_in_sen - 1]\n", + " del input_type_ids[0]\n", + " suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,5)\n", + " return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3\n", + " elif get_word(index + 1) in ['##er','##est','##r','##st'] and len(list_word2) != 0:\n", + " #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est\n", + " wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) \n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[wordADV]\n", + " suggestion4,need_bijiao,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,['more','most'],2)\n", + " if need_bijiao == 1:\n", + " input_ids[id_in_sen] = tokenizer.vocab[suggestion4]\n", + " suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,5)\n", + " return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ suggestion4 + ' ' + suggestion5 \n", + " else:#检查形容词前面是否需要加冠词或者是需要more,most的比较级,最高级,be动词\n", + " wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " list_DT = ['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] \n", + " suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)\n", + " if need_DT == 1:\n", + " #print(\"需要冠词\")\n", + " 
wordADV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab[suggestion])\n", + " input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,3) \n", + " return suggestion + ' ' + suggestion2\n", + " else:\n", + " #副词后面可能缺少逗号,比如 Luckily,I won the game.\n", + " wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen + 1,tokenizer.vocab[\",\"])\n", + " input_type_ids.append(0)\n", + " suggestion3,need_douhao,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,2)\n", + " if need_douhao == 1:\n", + " return suggestion3 + ' ,'\n", + " else:\n", + " return suggestion0\n", + "print(analyse_adv(5))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wanted\n" + ] + } + ], + "source": [ + "from pattern.en import article,referenced,pluralize, singularize\n", + "import nltk\n", + "def analyse_N(index):\n", + " #这是一个处理名词语法问题的函数,输入为问题词在text的token中的下标index\n", + "#******************************************初始数据处理**************************************************************************\n", + " need_DT = 0 #表示是否需要在前面加冠词 \n", + " wordN,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " word_tag = nltk.pos_tag([wordN])\n", + " if word_tag[0][1] == \"NN\":\n", + " N_ = wordN\n", + " N_s= pluralize(wordN)\n", + " else:\n", + " N_ = singularize(wordN)\n", + " N_s= wordN\n", + " list_word = [N_,N_s]\n", + " list_others = N_to_anything(N_)\n", + " for other in list_others:\n", + " list_word.append(other)\n", + " #print(list_word)\n", + "#*****************************************判断是否需要冠词或者代词************************************************************************ \n", + " \n", + " 
input_ids.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_type_ids.append(0)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " list_DT = ['the','a','an','this','that','these','those','some','any','all']\n", + " suggestion,need_DT,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT,1)\n", + " if need_DT == 0:#不需要冠词\n", + " #print(\"不需要冠词\")\n", + " wordN,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " suggestion,need_DT,_,top_of_list_word = give_suggestion(input_ids,input_type_ids,index,list_word,7)\n", + " return suggestion\n", + " elif need_DT == 1:\n", + " wordN,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab[suggestion])\n", + " input_type_ids.append(0)\n", + " suggestion2,_,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,7)\n", + " return suggestion + ' ' + suggestion2\n", + "\n", + "print(analyse_N(78))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n 这是一个相关代词的词典,容易混淆的词放在一个列表中\\n\\n'" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'he': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'his': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'him': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'himself': ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose'], 'who': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'whom': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'whose': ['that', 'which', 'who', 'whom', 'whose', 'as'], 'she': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'her': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'herself': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 'hers': ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose'], 
"""
Lookup table of related pronouns.

Pronouns that are easily confused with each other are kept together in one
list, and every pronoun is mapped to the list it belongs to.
"""
# Personal pronoun families (each also lists who/whom/whose).
like_he = ['he', 'his', 'him', 'himself', 'who', 'whom', 'whose']
like_she = ['she', 'her', 'herself', 'hers', 'who', 'whom', 'whose']
like_it = ['it', 'its', 'itself', 'who', 'whom', 'whose']
like_i = ['i', 'me', 'my', 'myself', 'mine']
like_you = ['you', 'your', 'yourself', 'yourselves']
like_we = ['we', 'us', 'our', 'ours', 'ourselves']
like_they = ['they', 'them', 'their', 'theirs']

# Demonstratives.
like_this = ['this', 'these']
like_that = ['that', 'those']

# Interrogative pronouns.
pronoun_Question = ['who', 'whom', 'whose', 'which', 'what',
                    'whoever', 'whichever', 'whatever']
# Relative pronouns.
pronoun_relation = ['that', 'which', 'who', 'whom', 'whose', 'as']

# Quantity / indefinite words.
like_some = ['some', 'any']
like_few = ['few', 'little']
like_many = ['many', 'much']
like_other = ['another', 'other']

# Order matters: a word that appears in several groups ends up mapped to the
# LAST group that lists it (e.g. 'who' and 'that' map to pronoun_relation).
pronoun = [
    like_he, like_she, like_it, like_i, like_you, like_we, like_they,
    like_this, like_that,
    pronoun_Question, pronoun_relation,
    like_some, like_few, like_many, like_other,
]

# Every key shares the group list object itself as its value (no copies).
pronoun_dictionary = {member: group for group in pronoun for member in group}
print(pronoun_dictionary)
\u001b[0mgive_suggestion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0minput_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlist_word\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuggestion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0manalyse_pronoun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m14\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36manalyse_pronoun\u001b[0;34m(index)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m#这是一个处理代词语法问题的函数,输入为问题词在text的token中的下标index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mwordPROP\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0minput_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpre_training_input_entire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mlist_word\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpronoun_dictionary\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mwordPROP\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0msuggestion\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m=\u001b[0m 
import math

import colored
import nltk
import spacy
from colored import stylize
from pattern import en
from pattern.en import (
    conjugate, lemma, lexeme,
    FUTURE, INFINITIVE, PAST, PRESENT, PROGRESSIVE, SG,
)

# Prepositions (plus 'to') considered by the preposition checks below.
# Kept as an ordered tuple: the order is the order in which candidates are
# handed to give_suggestion / judge_and_suggestion.
_PREPOSITIONS = (
    "at", "in", "on", "by", "for", "from", "with", "about", "against",
    "along", "among", "around", "as", "before", "behind", "below", "beside",
    "between", "during", "besides", "into", "near", "over", "through",
    "under", "without", "after", "above", "of", "to",
)

# Groups of determiners that can be swapped for one another.
_DT_GROUPS = (
    ('some', 'any'),
    ('this', 'that', 'these', 'those'),
    ('the', 'a', 'an'),
    ('another', 'other'),
)


def _confusable_determiners(word):
    """Return the determiner group containing *word* as a new list, or []."""
    for group in _DT_GROUPS:
        if word in group:
            return list(group)
    return []


def analyse_pronoun(index):
    """Suggest a replacement for the pronoun at token position *index*.

    The candidates are the related pronouns listed for the word in
    ``pronoun_dictionary``.  When the word has no entry (the tagger can send
    non-pronouns here; the notebook itself crashed with KeyError: 'night'),
    the word is returned unchanged, which callers treat as "no suggestion".
    """
    wordPROP, input_ids, input_type_ids, index = pre_training_input_entire(index)
    list_word = pronoun_dictionary.get(wordPROP)
    if list_word is None:
        return wordPROP
    suggestion, _, _, _ = give_suggestion(input_ids, input_type_ids, index, list_word, 3)
    return suggestion


print(analyse_pronoun(14))

# Note: many adverbs do not end in "-ly", e.g. the comparatives more/most used
# before three-syllable adjectives, or frequency adverbs such as often, seldom
# and never.  These are rarely misused, so they are not handled for now.


def analyse_DT(index):
    """Check the determiner at token position *index*.

    Returns a better determiner from the same group, the string
    "去掉 <word>" (i.e. "remove <word>") when none of the group fits, or the
    word itself when it belongs to no known group (e.g. 'all', 'every',
    'per'), in which case the language model is not queried at all.
    """
    wordDT, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    list_word = _confusable_determiners(wordDT)
    if not list_word:
        return wordDT
    suggestion0, need_DT, _, _ = give_suggestion(input_ids, input_type_ids, id_in_sen, list_word, 1)
    if need_DT == 1:
        return suggestion0
    return "去掉 " + get_word(index)


print(analyse_DT(25))


def analyse_IN(index):
    """Check whether the preposition at token position *index* should change.

    Returns the suggested preposition when give_suggestion reports that one
    is needed; otherwise "去掉 <word>" ("remove <word>") if the current word
    is a known preposition, else whatever give_suggestion proposed.
    """
    wordIN, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    # Pass a fresh list, as the original did, in case the callee mutates it.
    list_word = list(_PREPOSITIONS)
    suggestion0, need_IN, _, _ = give_suggestion(input_ids, input_type_ids, id_in_sen, list_word, 3)
    if need_IN == 1 or wordIN not in _PREPOSITIONS:
        return suggestion0
    return "去掉 " + get_word(index)


print(analyse_IN(76))


def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20):
    """Print the language-model output per token, for debugging.

    For each position this prints the original token with its probability,
    followed by the *topk* most probable vocabulary entries ('*' marks the
    entry equal to the original token).  Only the first *firstk* positions
    are printed.

    Returns the list of (token, probability) pairs of the last position whose
    token is "[MASK]", or None if there is none.
    """
    def print_pair(token, prob, end_str='', hit_mark=' '):
        # `i` is the position currently processed by the loop below.
        if i < firstk:
            print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)

    ret = None
    for i in range(len(tokens)):
        ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]]
        # probs[i] holds the probability of every vocabulary entry at
        # position i; prob_ is the probability of the token actually there.
        prob_ = probs[i][ind_].item()
        print_pair(tokens[i], prob_, end_str='\t')
        values, indices = probs[i].topk(topk)
        top_pairs = []
        for j in range(topk):
            ind, prob = indices[j].item(), values[j].item()
            hit_mark = '*' if ind == ind_ else ' '
            token = tokenizer.ids_to_tokens[ind]
            print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\n')
            top_pairs.append((token, prob))
        if tokens[i] == "[MASK]":
            ret = top_pairs
    return ret


# ---------------------------------------------------------------------------
# judge_*_and_suggestion: a cheap pre-check done before the analyse_* helpers.
# It asks whether one of the words related to the original word is so much
# more probable at this position that the original should be replaced.
# ---------------------------------------------------------------------------

def judge_and_suggestion(prob, original, list_word, threhold):
    """Decide whether *original* should be replaced by a word of *list_word*.

    Args:
        prob: probabilities at this position, indexable by vocabulary id.
        original: the word currently at this position.
        list_word: candidate replacements (related words).
        threhold: minimum log-probability gap (best candidate vs. original)
            required before a replacement is suggested.

    Returns:
        (judge, suggestion): ``(0, best_word)`` when a replacement is
        suggested, ``(1, None)`` when the original should stay, including
        the case where no candidate is usable.
    """
    original_prob = prob[tokenizer.vocab[original]]
    top_prob = 0
    best_word = None
    for word in list_word:
        try:
            word_id = tokenizer.vocab[word]
        except KeyError:
            # Candidate is not a single vocabulary entry; ignore it.
            continue
        prob_word = prob[word_id]
        if prob_word > top_prob:
            top_prob = prob_word
            best_word = word
    if best_word is None:
        # Empty list or no candidate in the vocabulary: math.log(0) would
        # raise ValueError, so report "keep the original" instead.
        return 1, None
    if original_prob <= 0:
        # Original probability underflowed to zero: the gap is unbounded.
        return 0, best_word
    gap = math.log(top_prob) - math.log(original_prob)
    if gap > threhold:
        return 0, best_word
    return 1, None


def judge_CC_and_suggestion(prob, original_CC):
    """Pre-check for conjunctions (threshold 2)."""
    list_CC = ["but", "yet", "still", "however", "although", "for", "so", "thus",
               "and", "or", "too", "again", "another", "either", "or", "neither",
               "nor", "when", "while", "as", "whenever", "since", "until", "till"]
    return judge_and_suggestion(prob, original_CC, list_CC, 2)


def judge_V_and_suggestion(prob, original_V):
    """Pre-check for verbs: candidates are the inflections from lexeme()."""
    list_V = lexeme(original_V)
    return judge_and_suggestion(prob, original_V, list_V, 2)


def judge_IN_and_suggestion(prob, original_IN):
    """Pre-check for prepositions (threshold 1)."""
    return judge_and_suggestion(prob, original_IN, list(_PREPOSITIONS), 1)


def judge_DT_and_suggestion(prob, original_DT):
    """Pre-check for determiners (threshold 1).

    Determiners outside the known groups have no candidates, so the original
    is kept: ``(1, None)``.
    """
    list_word = _confusable_determiners(original_DT)
    if not list_word:
        return 1, None
    return judge_and_suggestion(prob, original_DT, list_word, 1)


def judge_MD_and_suggestion(prob, original_MD):
    """Pre-check for modal verbs (threshold 1).

    Only 'dare'/'dared' are actually judged; every other modal yields
    ``(1, None)``.
    """
    if original_MD in ('can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would'):
        # NOTE(review): the original built can/could, may/might, shall/should
        # and will/would candidate pairs but then discarded the judgement for
        # exactly these words (`if original_MD not in [...]`).  That condition
        # looks inverted; the observable behaviour is kept here -- confirm
        # the intent before enabling suggestions for these modals.
        return 1, None
    if original_MD in ('dare', 'dared'):
        list_MD = ['dare', 'dared']
    else:
        list_MD = []
    return judge_and_suggestion(prob, original_MD, list_MD, 1)


def judge_N_and_suggestion(prob, original_N):
    """Pre-check for nouns (threshold 0.5).

    Candidates are the singular and plural forms plus whatever
    N_to_anything() returns for the singular form.
    """
    word_tag = nltk.pos_tag([original_N])
    if word_tag[0][1] == "NN":
        N_ = original_N
        N_s = pluralize(original_N)
    else:
        N_ = singularize(original_N)
        N_s = original_N
    list_N = [N_, N_s]
    list_N.extend(N_to_anything(N_))
    return judge_and_suggestion(prob, original_N, list_N, 0.5)


nlp = spacy.load('en')

# Maps token position -> suggested replacement; rebuilt by show_abnormals().
suggestions = {}


def show_abnormals(tokens, probs, show_suggestions=False):
    """Print *tokens* colour-coded by how unlikely the model finds them.

    For every token except the first and last ([CLS]/[SEP]) the gap between
    the log-probability of the model's top choice and that of the actual
    token is computed.  Depending on the POS tag and the gap, one of the
    analyse_* / judge_* helpers is asked for a suggestion.  Accepted
    suggestions are stored in the module-level ``suggestions`` dict (reset on
    every call) and, if *show_suggestions* is true, printed after the token.
    Finally the average gap is printed.
    """
    global suggestions
    suggestions = {}

    def gap2color(gap):
        if gap <= 5:
            return 'yellow_1'
        elif gap <= 10:
            return 'orange_1'
        else:
            return 'red_1'

    def print_token(token, suggestion, gap):
        if gap == 0:
            print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')
        else:
            print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='')
            # gap > 5 only happens when a suggestion was accepted below, so
            # `suggestion` is a string here.
            if show_suggestions and gap > 5:
                print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')
            else:
                print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='')

    avg_gap = 0.
    tokens_tag = nltk.pos_tag(tokens)
    for i in range(1, len(tokens) - 1):  # skip first [CLS] and last [SEP]
        ind_ = tokenizer.vocab[tokens[i]]
        prob_ = probs[i][ind_].item()
        top_prob = probs[i].max().item()
        top_ind = probs[i].argmax().item()
        top_word = tokenizer.ids_to_tokens[top_ind]
        # Log-probability gap between the model's top choice and the token.
        gap = math.log(top_prob) - math.log(prob_)
        print()
        print("*******************************************************************************************************************")
        print(i)
        print(gap)
        avg_gap += gap
        suggestion = None
        tag = tokens_tag[i][1]
        print(tag)
        if 'VB' in tag:
            if gap > 3 and top_word in _PREPOSITIONS:
                # The model prefers a preposition here, so something may be
                # missing around the verb.
                suggestion = analyse_V(i)
            elif gap > 7.5:
                suggestion = analyse_V(i)
            elif gap > 3:
                # Also covers gap == 7.5, which the original skipped.
                judge, suggestion = judge_V_and_suggestion(probs[i], tokens[i])
                gap = 6 if judge == 0 else 3
        elif "DT" == tag and gap > 3:
            suggestion = analyse_DT(i)
        elif "JJ" in tag:
            if gap > 6:
                suggestion = analyse_adj(i)
            else:
                gap = 3
        elif "RB" in tag and gap > 5:
            suggestion = analyse_adv(i)
        elif "PRP" in tag and gap > 5:
            suggestion = analyse_pronoun(i)
        elif "NN" in tag:
            if gap > 4 and tokens[i][:2] == "##" and (i - 1) not in suggestions:
                # Word-piece suffix with a large gap: suggest removing it
                # ("去掉" = "remove").
                # NOTE(review): the original comment said this applies when
                # the PREVIOUS position was suggested for change, but the code
                # requires that it was NOT -- confirm which one is intended.
                suggestion = '去掉' + ' ' + tokens[i]
            elif gap > 7.5:
                suggestion = analyse_N(i)
            elif gap > 2:
                # Also covers gap == 7.5, which the original skipped.
                judge, suggestion = judge_N_and_suggestion(probs[i], tokens[i])
                gap = 6 if judge == 0 else 3
        elif "CC" in tag and gap > 2:
            judge, suggestion = judge_CC_and_suggestion(probs[i], tokens[i])
            if judge == 1:
                gap = 3
        elif ("IN" == tag or 'TO' == tag) and gap > 2:
            suggestion = analyse_IN(i)
        elif 'MD' in tag and gap > 5:
            print("检查点1*****************************************************")
            judge, suggestion = judge_MD_and_suggestion(probs[i], tokens[i])
            if judge == 1:
                gap = 3
        elif "CD" in tag:
            gap = 0
        elif "WDT" == tag and gap > 2:  # who, which, that, ...
            suggestion = top_word
        elif gap > 5:
            suggestion = top_word

        if suggestion is not None and suggestion != tokens[i]:
            suggestions.update({i: suggestion})
            gap = max(gap, 6)
        else:
            gap = min(gap, 3)
        print_token(tokens[i], suggestion, gap)

    inner_count = len(tokens) - 2
    if inner_count > 0:  # avoid ZeroDivisionError when only [CLS]/[SEP] exist
        avg_gap /= inner_count
    print()
    print('平均gap:' + str(avg_gap))
2157,\n", + " 2026,\n", + " 3008,\n", + " 2024,\n", + " 2127,\n", + " 1045,\n", + " 3133,\n", + " 2152,\n", + " 2082,\n", + " 1012,\n", + " 102],\n", + " [101,\n", + " 1996,\n", + " 2399,\n", + " 2026,\n", + " 3008,\n", + " 4036,\n", + " 2033,\n", + " 2073,\n", + " 1045,\n", + " 2001,\n", + " 1037,\n", + " 2775,\n", + " 2357,\n", + " 2041,\n", + " 2000,\n", + " 2022,\n", + " 2200,\n", + " 6179,\n", + " 2101,\n", + " 1999,\n", + " 2026,\n", + " 2166,\n", + " 1012,\n", + " 102],\n", + " [101, 1012, 102]],\n", + " [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0]],\n", + " [[0, 0],\n", + " [0, 1],\n", + " [0, 2],\n", + " [0, 3],\n", + " [0, 4],\n", + " [0, 5],\n", + " [0, 6],\n", + " [0, 7],\n", + " [0, 8],\n", + " [0, 9],\n", + " [0, 10],\n", + " [0, 11],\n", + " [0, 12],\n", + " [0, 13],\n", + " [0, 14],\n", + " [0, 15],\n", + " [1, 1],\n", + " [1, 2],\n", + " [1, 3],\n", + " [1, 4],\n", + " [1, 5],\n", + " [1, 6],\n", + " [1, 7],\n", + " [1, 8],\n", + " [1, 9],\n", + " [1, 10],\n", + " [1, 11],\n", + " [1, 12],\n", + " [1, 13],\n", + " [1, 14],\n", + " [1, 15],\n", + " [1, 16],\n", + " [2, 1],\n", + " [2, 2],\n", + " [2, 3],\n", + " [2, 4],\n", + " [2, 5],\n", + " [2, 6],\n", + " [2, 7],\n", + " [2, 8],\n", + " [2, 9],\n", + " [2, 10],\n", + " [2, 11],\n", + " [2, 12],\n", + " [2, 
13],\n", + " [2, 14],\n", + " [2, 15],\n", + " [2, 16],\n", + " [2, 17],\n", + " [2, 18],\n", + " [2, 19],\n", + " [2, 20],\n", + " [2, 21],\n", + " [2, 22],\n", + " [2, 23],\n", + " [2, 24],\n", + " [2, 25],\n", + " [2, 26],\n", + " [2, 27],\n", + " [2, 28],\n", + " [2, 29],\n", + " [2, 30],\n", + " [3, 1],\n", + " [3, 2],\n", + " [3, 3],\n", + " [3, 4],\n", + " [3, 5],\n", + " [3, 6],\n", + " [3, 7],\n", + " [3, 8],\n", + " [3, 9],\n", + " [3, 10],\n", + " [3, 11],\n", + " [3, 12],\n", + " [3, 13],\n", + " [3, 14],\n", + " [4, 1],\n", + " [4, 2],\n", + " [4, 3],\n", + " [4, 4],\n", + " [4, 5],\n", + " [4, 6],\n", + " [4, 7],\n", + " [4, 8],\n", + " [4, 9],\n", + " [4, 10],\n", + " [4, 11],\n", + " [5, 1],\n", + " [5, 2],\n", + " [5, 3],\n", + " [5, 4],\n", + " [5, 5],\n", + " [5, 6],\n", + " [5, 7],\n", + " [5, 8],\n", + " [5, 9],\n", + " [5, 10],\n", + " [5, 11],\n", + " [5, 12],\n", + " [5, 13],\n", + " [5, 14],\n", + " [5, 15],\n", + " [5, 16],\n", + " [6, 1],\n", + " [6, 2],\n", + " [6, 3],\n", + " [6, 4],\n", + " [6, 5],\n", + " [6, 6],\n", + " [6, 7],\n", + " [6, 8],\n", + " [6, 9],\n", + " [6, 10],\n", + " [6, 11],\n", + " [6, 12],\n", + " [6, 13],\n", + " [6, 14],\n", + " [6, 15],\n", + " [6, 16],\n", + " [6, 17],\n", + " [6, 18],\n", + " [6, 19],\n", + " [6, 20],\n", + " [6, 21],\n", + " [6, 22]],\n", + " [\"When I was little, Friday's night was our family game night.\",\n", + " ' After supper, we would play card games of all sort in the sitting room.',\n", + " ' As the kid, I loved to watch cartoons,but no matter how many times I asked to watching them, my parents would not to let me.',\n", + " ' They would say to us that playing card games would help my brain.',\n", + " ' Still I unwilling to play the games for them sometimes.',\n", + " \" I didn't realize how right my parents are until I entered high school.\",\n", + " ' The games my parents taught me where I was a child turned out to be very useful later in my life.',\n", + " '.'],\n", + " [101,\n", 
+ " 2043,\n", + " 1045,\n", + " 2001,\n", + " 2210,\n", + " 1010,\n", + " 5958,\n", + " 1005,\n", + " 1055,\n", + " 2305,\n", + " 2001,\n", + " 2256,\n", + " 2155,\n", + " 2208,\n", + " 2305,\n", + " 1012,\n", + " 2044,\n", + " 15264,\n", + " 1010,\n", + " 2057,\n", + " 2052,\n", + " 2377,\n", + " 4003,\n", + " 2399,\n", + " 1997,\n", + " 2035,\n", + " 4066,\n", + " 1999,\n", + " 1996,\n", + " 3564,\n", + " 2282,\n", + " 1012,\n", + " 2004,\n", + " 1996,\n", + " 4845,\n", + " 1010,\n", + " 1045,\n", + " 3866,\n", + " 2000,\n", + " 3422,\n", + " 13941,\n", + " 1989,\n", + " 2021,\n", + " 2053,\n", + " 3043,\n", + " 2129,\n", + " 2116,\n", + " 2335,\n", + " 1045,\n", + " 2356,\n", + " 2000,\n", + " 3666,\n", + " 2068,\n", + " 1989,\n", + " 2026,\n", + " 3008,\n", + " 2052,\n", + " 2025,\n", + " 2000,\n", + " 2292,\n", + " 2033,\n", + " 1012,\n", + " 2027,\n", + " 2052,\n", + " 2360,\n", + " 2000,\n", + " 2149,\n", + " 2008,\n", + " 2652,\n", + " 4003,\n", + " 2399,\n", + " 2052,\n", + " 2393,\n", + " 2026,\n", + " 4167,\n", + " 1012,\n", + " 2145,\n", + " 1045,\n", + " 15175,\n", + " 2000,\n", + " 2377,\n", + " 1996,\n", + " 2399,\n", + " 2005,\n", + " 2068,\n", + " 2823,\n", + " 1012,\n", + " 1045,\n", + " 2134,\n", + " 1005,\n", + " 1056,\n", + " 5382,\n", + " 2129,\n", + " 2157,\n", + " 2026,\n", + " 3008,\n", + " 2024,\n", + " 2127,\n", + " 1045,\n", + " 3133,\n", + " 2152,\n", + " 2082,\n", + " 1012,\n", + " 1996,\n", + " 2399,\n", + " 2026,\n", + " 3008,\n", + " 4036,\n", + " 2033,\n", + " 2073,\n", + " 1045,\n", + " 2001,\n", + " 1037,\n", + " 2775,\n", + " 2357,\n", + " 2041,\n", + " 2000,\n", + " 2022,\n", + " 2200,\n", + " 6179,\n", + " 2101,\n", + " 1999,\n", + " 2026,\n", + " 2166,\n", + " 1012,\n", + " 102],\n", + " [0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 
def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=20):
    """Run the masked language model over ``text`` and print its predictions.

    When the input carries no explicit ``[MASK]`` token (or ``masked_tokens``
    is given), the single input feature is expanded by
    ``copy_and_mask_feature`` into several masked copies, the model scores
    every copy, and the per-position vocabulary distributions are gathered
    back into one ``[1, len(tokens), vocab_size]`` tensor before being shown.

    Args:
        text: Indexable input; ``text[0]`` is handed to ``process_text`` and
            the whole object to ``convert_text_to_examples``
            (presumably a one-element list holding the raw passage -- confirm
            against callers).
        masked_tokens: Forwarded unchanged to ``copy_and_mask_feature``.
        show_suggestions: Forwarded to ``show_abnormals``.
        show_firstk_probs: Forwarded to ``show_lm_probs`` as ``firstk``.

    Returns:
        None. Results are printed by ``show_lm_probs`` / ``show_abnormals``.

    Side effects:
        Rebinds the module-level names ``input_ids_sen``,
        ``input_type_ids_sen``, ``in_sentence``, ``sentences``,
        ``entire_ids`` and ``entire_type_ids`` to the result of
        ``process_text(text[0])``. They do not exist until the first call.
    """
    # Stride handed to copy_and_mask_feature and reused below when gathering
    # the per-position probabilities back together.
    step = 15

    global input_ids_sen, input_type_ids_sen, in_sentence, sentences, entire_ids, entire_type_ids
    (input_ids_sen, input_type_ids_sen, in_sentence,
     sentences, entire_ids, entire_type_ids) = process_text(text[0])

    examples = convert_text_to_examples(text)
    features = convert_examples_to_features(examples, tokenizer, print_info=False)

    given_mask = "[MASK]" in features[0].tokens
    # Expand into masked copies unless the caller already placed a [MASK]
    # and did not ask for specific tokens to be masked.
    mask_copies = not given_mask or masked_tokens is not None

    batches = None
    if mask_copies:
        assert len(features) == 1
        features, batches = copy_and_mask_feature(features[0], step, masked_tokens=masked_tokens)

    # Stack the per-feature id lists into [n_features, sequence_len] tensors;
    # n_features is the number of masked copies of the passage.
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    # Segment ids; per the original author's note every row is identical.
    input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)
    input_ids = input_ids.to(device)
    input_type_ids = input_type_ids.to(device)

    # Inference only: skip autograd bookkeeping so no graph is retained for
    # the (potentially very large) [batch, seq, vocab] activations.
    with torch.no_grad():
        mlm_logits, _ = model(input_ids, input_type_ids)
        # Normalise over the last (vocabulary) dimension so each position sums to 1.
        mlm_probs = F.softmax(mlm_logits, dim=-1)

    # Tokens are only used for display; the [MASK] substitutions live in
    # input_ids, so every feature carries the same token list.
    tokens = features[0].tokens

    if mask_copies:
        # Dimensions are batch_size, sequence_length, vocab_size.
        bsz, _, vocab_size = mlm_probs.size()
        assert bsz == len(batches)

        # Zero-initialised so positions not written by the loop below hold
        # defined values rather than whatever was in the allocation.
        reduced_mlm_probs = torch.zeros(1, len(tokens), vocab_size)
        for i in batches:
            # Copy i contributes positions i, i + step, i + 2 * step, ...
            # (presumably exactly the positions masked in that copy -- confirm
            # against copy_and_mask_feature).
            for pos in range(i, len(tokens), step):
                reduced_mlm_probs[0, pos] = mlm_probs[i, pos]
        # Keep only one vocabulary distribution per token position.
        mlm_probs = reduced_mlm_probs

    # show_lm_probs receives a 2-D [seq, vocab] tensor.
    show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs)
    if not given_mask:
        show_abnormals(tokens, mlm_probs[0], show_suggestions=show_suggestions)
"stream", + "text": [ + "03/21/2019 18:19:51 - INFO - examples.extract_features - tokens: [CLS] when i was little , friday ' s night was our family game night . after supper , we would play card games of all sort in the sitting room . as the kid , i loved to watch cartoons , but no matter how many times i asked to watching them , my parents would not to let me . they would say to us that playing card games would help my brain . still i unwilling to play the games for them sometimes . i didn ' t realize how right my parents are until i entered high school . the games my parents taught me where i was a child turned out to be very useful later in my life . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 | [CLS] \t 2 | . 1 | the 1 | ) 1 | \" 1 | , \n", + " 97 | when \t* 97 | when 2 | since 1 | until 0 | while 0 | before \n", + " 99 | i \t* 99 | i 0 | she 0 | he 0 | we 0 | me \n", + " 100 | was \t*100 | was 0 | were 0 | got 0 | turned 0 | is \n", + " 11 | little \t 19 | younger * 11 | little 8 | eight 7 | young 7 | twelve \n", + " 51 | , \t* 51 | , 24 | . 4 | and 1 | of 1 | the \n", + " 0 | friday \t 33 | valentine 16 | mother 7 | children 7 | father 5 | grandma \n", + " 100 | ' \t*100 | ' 0 | ` 0 | ′ 0 | \" 0 | * \n", + " 100 | s \t*100 | s 0 | til 0 | n 0 | d 0 | round \n", + " 39 | night \t* 39 | night 16 | dinner 6 | eve 5 | day 5 | supper \n", + " 90 | was \t* 90 | was 8 | became 1 | is 0 | were 0 | , \n", + " 4 | our \t 79 | a 13 | the * 4 | our 1 | my 0 | their \n", + " 1 | family \t 59 | favorite 18 | first 3 | favourite 2 | only 1 | last \n", + " 3 | game \t 12 | dinner 7 | christmas 6 | fun 5 | day * 3 | game \n", + " 81 | night \t* 81 | night 13 | day 1 | dinner 1 | date 0 | nights \n", + " 97 | . \t* 97 | . 2 | and 0 | ; 0 | , 0 | ! 
\n", + " 80 | after \t* 80 | after 6 | during 4 | at 4 | over 3 | before \n", + " 1 | supper \t 68 | school 10 | dinner 9 | that 2 | midnight 1 | breakfast \n", + " 100 | , \t*100 | , 0 | ##time 0 | together 0 | time 0 | dinner \n", + " 98 | we \t* 98 | we 1 | i 0 | they 0 | everyone 0 | people \n", + " 64 | would \t* 64 | would 21 | could 2 | will 2 | can 1 | did \n", + " 96 | play \t* 96 | play 2 | have 1 | watch 0 | enjoy 0 | played \n", + " 97 | card \t* 97 | card 1 | board 1 | cards 0 | video 0 | computer \n", + " 100 | games \t*100 | games 0 | game 0 | ##games 0 | matches 0 | sports \n", + " 99 | of \t* 99 | of 0 | in 0 | with 0 | and 0 | , \n", + " 0 | all \t 85 | some 6 | any 3 | a 3 | every 1 | this \n", + " 2 | sort \t 41 | kinds 34 | types 5 | sorts 4 | sizes 3 | kind \n", + " 98 | in \t* 98 | in 0 | around 0 | inside 0 | at 0 | from \n", + " 73 | the \t* 73 | the 23 | our 1 | my 1 | a 1 | their \n", + " 0 | sitting \t 56 | family 17 | dining 14 | living 1 | back 1 | same \n", + " 99 | room \t* 99 | room 0 | area 0 | rooms 0 | hall 0 | areas \n", + " 99 | . \t* 99 | . 1 | and 0 | ; 0 | , 0 | ... \n", + " 48 | as \t* 48 | as 29 | like 8 | being 3 | unlike 3 | for \n", + " 0 | the \t 100 | a 0 | another 0 | an * 0 | the 0 | one \n", + " 8 | kid \t 43 | child 22 | youngest * 8 | kid 4 | baby 3 | oldest \n", + " 63 | , \t* 63 | , 5 | i 2 | . 1 | myself 1 | and \n", + " 99 | i \t* 99 | i 0 | we 0 | he 0 | she 0 | me \n", + " 15 | loved \t 36 | wanted 21 | used * 15 | loved 11 | liked 4 | tried \n", + " 100 | to \t*100 | to 0 | and 0 | playing 0 | watching 0 | going \n", + " 99 | watch \t* 99 | watch 1 | see 0 | play 0 | watching 0 | watched \n", + " 0 | cartoons \t 52 | them 41 | games 1 | movies 1 | cards 1 | it \n", + " 0 | , \t 81 | , 19 | . 0 | ; 0 | - 0 | ... 
\n", + " 44 | but \t 47 | and * 44 | but 6 | so 1 | yet 1 | because \n", + " 100 | no \t*100 | no 0 | little 0 | the 0 | zero 0 | not \n", + " 100 | matter \t*100 | matter 0 | to 0 | telling 0 | idea 0 | , \n", + " 100 | how \t*100 | how 0 | what 0 | however 0 | the 0 | where \n", + " 100 | many \t*100 | many 0 | often 0 | few 0 | several 0 | numerous \n", + " 85 | times \t* 85 | times 3 | questions 1 | minutes 1 | hours 1 | people \n", + " 82 | i \t* 82 | i 10 | we 1 | was 1 | being 1 | he \n", + " 0 | asked \t 37 | took 19 | went 13 | admitted 6 | got 4 | confessed \n", + " 5 | to \t 23 | for 13 | about * 5 | to 4 | myself 3 | me \n", + " 0 | watching \t 64 | play 30 | watch 3 | see 1 | join 0 | read \n", + " 57 | them \t* 57 | them 23 | cartoons 5 | it 2 | movies 2 | games \n", + " 0 | , \t 100 | , 0 | . 0 | ... 0 | again 0 | even \n", + " 99 | my \t* 99 | my 0 | the 0 | her 0 | his 0 | our \n", + " 98 | parents \t* 98 | parents 0 | family 0 | father 0 | mother 0 | grandparents\n", + " 0 | would \t 47 | decided 18 | chose 8 | tried 4 | seemed 4 | knew \n", + " 0 | not \t 70 | refuse 10 | have 5 | agree 2 | want 1 | promise \n", + " 0 | to \t 45 | always 17 | have 11 | really 9 | even 3 | ever \n", + " 28 | let \t* 28 | let 17 | believe 11 | tell 11 | bother 5 | stop \n", + " 91 | me \t* 91 | me 6 | go 1 | up 1 | on 0 | it \n", + " 97 | . \t* 97 | . 
1 | and 1 | ; 0 | because 0 | , \n", + " 94 | they \t* 94 | they 1 | he 1 | she 1 | dad 1 | i \n", + " 97 | would \t* 97 | would 1 | did 1 | always 1 | could 0 | might \n", + " 27 | say \t* 27 | say 21 | prove 16 | explain 4 | swear 3 | lie \n", + " 65 | to \t* 65 | to 6 | about 2 | for 2 | that 2 | in \n", + " 0 | us \t 99 | me 0 | themselves 0 | myself * 0 | us 0 | him \n", + " 94 | that \t* 94 | that 3 | how 1 | if 1 | , 0 | maybe \n", + " 89 | playing \t* 89 | playing 4 | the 4 | watching 0 | doing 0 | their \n", + " 46 | card \t* 46 | card 40 | the 4 | these 3 | those 1 | cards \n", + " 99 | games \t* 99 | games 0 | game 0 | together 0 | tricks 0 | again \n", + " 68 | would \t* 68 | would 14 | could 4 | might 4 | will 4 | did \n", + " 5 | help \t 23 | change 6 | use * 5 | help 3 | drain 3 | control \n", + " 61 | my \t* 61 | my 22 | the 9 | our 3 | your 3 | their \n", + " 1 | brain \t 15 | life 5 | family 4 | dad 3 | future 3 | parents \n", + " 57 | . \t* 57 | . 16 | and 14 | , 9 | but 1 | ; \n", + " 0 | still \t 35 | am 26 | was 8 | but 7 | is 5 | and \n", + " 6 | i \t 60 | , * 6 | i 3 | . 3 | too 3 | ... \n", + " 0 | unwilling \t 8 | want 8 | used 8 | have 8 | wanted 7 | had \n", + " 48 | to \t 50 | ##ly * 48 | to 0 | always 0 | t 0 | ##tly \n", + " 28 | play \t* 28 | play 6 | do 5 | make 1 | stop 1 | keep \n", + " 5 | the \t 82 | card * 5 | the 1 | these 1 | cards 1 | those \n", + " 59 | games \t* 59 | games 37 | game 1 | cards 0 | piano 0 | kids \n", + " 1 | for \t 92 | with * 1 | for 1 | in 1 | against 1 | without \n", + " 22 | them \t 51 | myself * 22 | them 9 | fun 1 | hours 1 | real \n", + " 2 | sometimes \t 16 | anyway 12 | anymore 11 | too 10 | all 4 | though \n", + " 96 | . \t* 96 | . 
1 | because 1 | and 1 | ; 0 | , \n", + " 99 | i \t* 99 | i 0 | we 0 | they 0 | you 0 | people \n", + " 99 | didn \t* 99 | didn 0 | wouldn 0 | don 0 | couldn 0 | did \n", + " 100 | ' \t*100 | ' 0 | ` 0 | \" 0 | , 0 | ′ \n", + " 100 | t \t*100 | t 0 | m 0 | s 0 | d 0 | no \n", + " 45 | realize \t 46 | know * 45 | realize 3 | understand 3 | realise 2 | see \n", + " 100 | how \t*100 | how 0 | what 0 | the 0 | it 0 | however \n", + " 0 | right \t 6 | strict 5 | powerful 4 | wonderful 4 | smart 4 | helpful \n", + " 97 | my \t* 97 | my 1 | our 0 | the 0 | your 0 | their \n", + " 29 | parents \t* 29 | parents 6 | thoughts 4 | words 2 | people 2 | kids \n", + " 1 | are \t 97 | were * 1 | are 0 | thought 0 | felt 0 | was \n", + " 87 | until \t* 87 | until 9 | when 2 | before 1 | till 0 | once \n", + " 100 | i \t*100 | i 0 | we 0 | they 0 | he 0 | me \n", + " 13 | entered \t 54 | graduated * 13 | entered 9 | finished 7 | started 6 | left \n", + " 51 | high \t* 51 | high 20 | elementary 15 | middle 8 | grade 1 | primary \n", + " 100 | school \t*100 | school 0 | schools 0 | society 0 | college 0 | class \n", + " 81 | . \t* 81 | . 
14 | and 2 | but 1 | , 1 | ; \n", + " 92 | the \t* 92 | the 5 | card 1 | those 0 | playing 0 | these \n", + " 43 | games \t* 43 | games 30 | game 6 | lessons 4 | rules 1 | math \n", + " 100 | my \t*100 | my 0 | our 0 | his 0 | me 0 | that \n", + " 53 | parents \t* 53 | parents 15 | father 13 | mother 5 | dad 4 | grandparents\n", + " 56 | taught \t* 56 | taught 20 | showed 12 | played 8 | gave 1 | told \n", + " 100 | me \t*100 | me 0 | us 0 | i 0 | him 0 | my \n", + " 0 | where \t 96 | when 2 | since 2 | while 0 | as 0 | until \n", + " 99 | i \t* 99 | i 0 | me 0 | he 0 | she 0 | my \n", + " 99 | was \t* 99 | was 0 | were 0 | as 0 | became 0 | had \n", + " 100 | a \t*100 | a 0 | the 0 | and 0 | one 0 | still \n", + " 22 | child \t 51 | kid * 22 | child 7 | boy 4 | teenager 4 | freshman \n", + " 97 | turned \t* 97 | turned 1 | turn 1 | came 0 | grew 0 | turning \n", + " 100 | out \t*100 | out 0 | into 0 | on 0 | up 0 | proving \n", + " 100 | to \t*100 | to 0 | into 0 | and 0 | not 0 | would \n", + " 94 | be \t* 94 | be 3 | become 3 | prove 0 | get 0 | seem \n", + " 69 | very \t* 69 | very 6 | extremely 5 | quite 3 | more 3 | really \n", + " 7 | useful \t 19 | important 14 | different 9 | helpful * 7 | useful 6 | influential \n", + " 6 | later \t 46 | things 17 | early * 6 | later 3 | lessons 3 | times \n", + " 100 | in \t*100 | in 0 | on 0 | during 0 | into 0 | than \n", + " 100 | my \t*100 | my 0 | our 0 | his 0 | their 0 | the \n", + " 99 | life \t* 99 | life 1 | career 0 | childhood 0 | education 0 | lives \n", + " 100 | . \t*100 | . 0 | ; 0 | ! 0 | ? 0 | ... 
\n", + " 0 | [SEP] \t 25 | \" 3 | for 3 | now 3 | and 2 | so \n", + "\n", + "*******************************************************************************************************************\n", + "1\n", + "0.0\n", + "WRB\n", + "\u001b[38;5;15m\u001b[48;5;0mwhen \u001b[0m\n", + "*******************************************************************************************************************\n", + "2\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "3\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "4\n", + "0.4996413875309904\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mlittle\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "5\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "6\n", + "5.037531860577574\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mfriday\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "7\n", + "0.0\n", + "POS\n", + "\u001b[38;5;15m\u001b[48;5;0m' \u001b[0m\n", + "*******************************************************************************************************************\n", + "8\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0ms \u001b[0m\n", + "*******************************************************************************************************************\n", + "9\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mnight 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "10\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "11\n", + "2.9288282257051295\n", + "PRP$\n", + "\u001b[38;5;226m\u001b[48;5;0mour\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "12\n", + "3.944041330267972\n", + "NN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[38;5;226m\u001b[48;5;0mfamily\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "13\n", + "1.2859363936756965\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mgame\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "14\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mnight \u001b[0m\n", + "*******************************************************************************************************************\n", + "15\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "16\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mafter \u001b[0m\n", + "*******************************************************************************************************************\n", + "17\n", + "3.864973993379616\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0msupper\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "18\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "19\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mwe \u001b[0m\n", + "*******************************************************************************************************************\n", + "20\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "21\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mplay \u001b[0m\n", + "*******************************************************************************************************************\n", + "22\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mcard\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "23\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "24\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mof \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "25\n", + "6.181150402503261\n", + "DT\n", + "\u001b[38;5;226m\u001b[48;5;0mall\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "26\n", + "3.2442493513478983\n", + "NN\n", + "\u001b[38;5;214m\u001b[48;5;0msort\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/sorts \u001b[0m\n", + "*******************************************************************************************************************\n", + "27\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0min \u001b[0m\n", + "*******************************************************************************************************************\n", + "28\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\n", + "*******************************************************************************************************************\n", + "29\n", + "5.1759204333264215\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0msitting\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "30\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mroom \u001b[0m\n", + "*******************************************************************************************************************\n", + "31\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "32\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mas \u001b[0m\n", + "*******************************************************************************************************************\n", + "33\n", + "11.548374205660924\n", + "DT\n", + "\u001b[38;5;196m\u001b[48;5;0mthe\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/a \u001b[0m\n", + "*******************************************************************************************************************\n", + "34\n", + "1.7249087614151182\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mkid\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "35\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "36\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "37\n", + "0.9132128054919955\n", + "VBD\n", + "\u001b[38;5;226m\u001b[48;5;0mloved\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "38\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "39\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mwatch \u001b[0m\n", + "*******************************************************************************************************************\n", + "40\n", + "6.495473375871382\n", + "NNS\n", + 
"\u001b[38;5;226m\u001b[48;5;0mcartoons\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "41\n", + "15.452660592340097\n", + "VB\n", + "\u001b[38;5;196m\u001b[48;5;0m,\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/, \u001b[0m\n", + "*******************************************************************************************************************\n", + "42\n", + "0.053008093757376584\n", + "CC\n", + "\u001b[38;5;226m\u001b[48;5;0mbut\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "43\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mno \u001b[0m\n", + "*******************************************************************************************************************\n", + "44\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mmatter \u001b[0m\n", + "*******************************************************************************************************************\n", + "45\n", + "0.0\n", + "WRB\n", + "\u001b[38;5;15m\u001b[48;5;0mhow \u001b[0m\n", + "*******************************************************************************************************************\n", + "46\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mmany\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "47\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mtimes \u001b[0m\n", + "*******************************************************************************************************************\n", + "48\n", + "0.0\n", + "VBP\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "49\n", + "10.646676048338493\n", + "VBN\n", + "\u001b[38;5;196m\u001b[48;5;0masked\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/was used \u001b[0m\n", + "*******************************************************************************************************************\n", + "50\n", + "1.5529499099577042\n", + "TO\n", + "\u001b[38;5;226m\u001b[48;5;0mto\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "51\n", + "8.95932484381705\n", + "VBG\n", + "\u001b[38;5;214m\u001b[48;5;0mwatching\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/watch \u001b[0m\n", + "*******************************************************************************************************************\n", + "52\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mthem \u001b[0m\n", + "*******************************************************************************************************************\n", + "53\n", + "18.383069999315744\n", + "VB\n", + "\u001b[38;5;196m\u001b[48;5;0m,\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/, \u001b[0m\n", + "*******************************************************************************************************************\n", + "54\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "55\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "56\n", + "5.4762173007041035\n", + "MD\n", + "检查点1*****************************************************\n", + "\u001b[38;5;226m\u001b[48;5;0mwould\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "57\n", + "5.3354081649787535\n", + "RB\n", + "\u001b[38;5;214m\u001b[48;5;0mnot\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/refuse \u001b[0m\n", + "*******************************************************************************************************************\n", + "58\n", + "5.981459151215268\n", + "TO\n", + "\u001b[38;5;214m\u001b[48;5;0mto\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/去掉 to \u001b[0m\n", + "*******************************************************************************************************************\n", + "59\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mlet \u001b[0m\n", + "*******************************************************************************************************************\n", + "60\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mme \u001b[0m\n", + "*******************************************************************************************************************\n", + "61\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "62\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mthey \u001b[0m\n", + "*******************************************************************************************************************\n", + "63\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "64\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0msay \u001b[0m\n", + "*******************************************************************************************************************\n", + "65\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "66\n", + "6.581833896065917\n", + "PRP\n", + "\u001b[38;5;214m\u001b[48;5;0mus\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/me \u001b[0m\n", + "*******************************************************************************************************************\n", + "67\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mthat \u001b[0m\n", + "*******************************************************************************************************************\n", + "68\n", + "0.0\n", + "VBG\n", + "\u001b[38;5;15m\u001b[48;5;0mplaying \u001b[0m\n", + "*******************************************************************************************************************\n", + "69\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mcard \u001b[0m\n", + "*******************************************************************************************************************\n", + "70\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "71\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "72\n", + "1.4588194328350998\n", + "VB\n", + "\u001b[38;5;226m\u001b[48;5;0mhelp\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "73\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "74\n", + "3.173226871228209\n", + "NN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[38;5;226m\u001b[48;5;0mbrain\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "75\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "76\n", + "5.90809919263306\n", + "RB\n", + "\u001b[38;5;214m\u001b[48;5;0mstill\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/still , \u001b[0m\n", + "*******************************************************************************************************************\n", + "77\n", + "2.2313680234481628\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mi\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "78\n", + "7.241924210620825\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0munwilling\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "79\n", + "0.033503519227476186\n", + "TO\n", + "\u001b[38;5;226m\u001b[48;5;0mto\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "80\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mplay \u001b[0m\n", + "*******************************************************************************************************************\n", + "81\n", + "2.7297515213736863\n", + "DT\n", + "\u001b[38;5;226m\u001b[48;5;0mthe\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "82\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "83\n", + "4.231741889869705\n", + "IN\n", + "\u001b[38;5;214m\u001b[48;5;0mfor\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/with \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "84\n", + "0.8675317652760016\n", + "PRP\n", + "\u001b[38;5;226m\u001b[48;5;0mthem\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "85\n", + "2.1558967133083646\n", + "RB\n", + "\u001b[38;5;226m\u001b[48;5;0msometimes\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "86\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "*******************************************************************************************************************\n", + "87\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mi\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "88\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mdidn \u001b[0m\n", + "*******************************************************************************************************************\n", + "89\n", + "0.0\n", + "POS\n", + "\u001b[38;5;15m\u001b[48;5;0m' \u001b[0m\n", + "*******************************************************************************************************************\n", + "90\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mt \u001b[0m\n", + "*******************************************************************************************************************\n", + "91\n", + "0.011093191090367771\n", + "VB\n", + "\u001b[38;5;226m\u001b[48;5;0mrealize\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "92\n", + "0.0\n", + "WRB\n", + 
"\u001b[38;5;15m\u001b[48;5;0mhow \u001b[0m\n", + "*******************************************************************************************************************\n", + "93\n", + "3.6692828920487384\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mright\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "94\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "95\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "96\n", + "4.758635578869137\n", + "VBP\n", + "\u001b[38;5;214m\u001b[48;5;0mare\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/were \u001b[0m\n", + "*******************************************************************************************************************\n", + "97\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0muntil \u001b[0m\n", + "*******************************************************************************************************************\n", + "98\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "99\n", + "1.4610567542265707\n", + "VBD\n", + "\u001b[38;5;226m\u001b[48;5;0mentered\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "100\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mhigh\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "101\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mschool \u001b[0m\n", + "*******************************************************************************************************************\n", + "102\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "*******************************************************************************************************************\n", + "103\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\n", + "*******************************************************************************************************************\n", + "104\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "105\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "106\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "107\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mtaught \u001b[0m\n", + "*******************************************************************************************************************\n", + "108\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mme \u001b[0m\n", + "*******************************************************************************************************************\n", + "109\n", + "9.636217093727145\n", + "WRB\n", + "\u001b[38;5;214m\u001b[48;5;0mwhere\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/when \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "110\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "111\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "112\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0ma \u001b[0m\n", + "*******************************************************************************************************************\n", + "113\n", + "0.8537064036270944\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mchild\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "114\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mturned \u001b[0m\n", + "*******************************************************************************************************************\n", + "115\n", + "0.0\n", + "RP\n", + "\u001b[38;5;15m\u001b[48;5;0mout \u001b[0m\n", + "*******************************************************************************************************************\n", + "116\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "117\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mbe \u001b[0m\n", + "*******************************************************************************************************************\n", + "118\n", + "0.0\n", + "RB\n", + "\u001b[38;5;15m\u001b[48;5;0mvery \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "119\n", + "0.9869002719874604\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0museful\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "120\n", + "1.9653529850905183\n", + "RB\n", + "\u001b[38;5;226m\u001b[48;5;0mlater\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "121\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0min \u001b[0m\n", + "*******************************************************************************************************************\n", + "122\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "123\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mlife \u001b[0m\n", + "*******************************************************************************************************************\n", + "124\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "平均gap:1.4890399906268712\n", + "time cost 8.737523794174194 s\n" + ] + } + ], + "source": [ + "import time\n", + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "# text = [\"Last week I went to the theater. There are many person . Luckily , I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 
'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "\n", + "#text = [\"Last week I went to the theater. I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "# text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "#text = [\"The journey was long and tired. We left London at five o'clock in the evening and spend eight hours in the train. We had been travelled for 3 hours after someone appeared selling food and drinks. It was darkness all the time we were crossing Wales, but we could see nothing through the windows. When we finally arrived Holyhead nearly , everyone was slept. As soon as the train stopped, everybody come to life, grabbing their suitcases and rushing onto the platform.\"]\n", + "text = [\"When I was little, Friday's night was our family game night. 
After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons,but no matter how many times I asked to watching them, my parents would not to let me. They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. I didn't realize how right my parents are until I entered high school. The games my parents taught me where I was a child turned out to be very useful later in my life.\"]\n", + "#text = [\"Mr. and Mrs.Zhang all work in our school. They live far from the school, and it takes them about a hour and a half to go to work every day. In their spare time, they are interesting in planting vegetables in their garden, that is on the rooftop of their house. They often get up earlier and water the vegetables together. They have also bought in some gardening tools.beside, they often get some useful informations from the internet. When summer came, they will invite their students pick the vegetables!\"]\n", + "#text = ['The question is more easy than that.']\n", + "#text = [\"Last week I go to the zoo. I had a very good seat. The play was very interesting.\"]\n", + "#text =[\"Last week I went to the theater. I had very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. 
I got very angry.\"]#因为外面有中括号,所以是二维的\n", + "time_start=time.time()\n", + "analyze_text(text, show_firstk_probs=200)\n", + "time_end=time.time()\n", + "print('time cost',time_end-time_start,'s')" + ] + }, + { + "cell_type": "code", + "execution_count": 438, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**********************************display_suggestions********************************************************\n", + "| suggestion : position in text\n", + "---------------------------------------------------------------------------------------\n", + "| 去掉前面 more 原位置改成 easier : 5\n", + "*************************************************************************************************************\n", + "['去掉前面', 'more', '原位置改成', 'easier']\n", + " the question is easier than that .\n" + ] + } + ], + "source": [ + "#print(suggestions)\n", + "def display_suggestion():\n", + " print(\"**********************************display_suggestions********************************************************\")\n", + " print(\"| {:50} : {}\".format(\"suggestion\",\"position in text\"))\n", + " print(\"---------------------------------------------------------------------------------------\")\n", + " for key in suggestions:\n", + " print(\"| {:<50} : {}\".format(suggestions[key] ,key))\n", + " print(\"*************************************************************************************************************\")\n", + "display_suggestion()\n", + "\n", + "def modify_text(index):\n", + " #entire_ids,entire_type_ids\n", + " entire_ids_copy = copy.deepcopy(entire_ids)\n", + " new_text = \"\"\n", + " suggestion = suggestions[index]\n", + " if suggestion[0:2] == '##':\n", + " suggestion = tokenizer.ids_to_tokens[entire_ids_copy[index - 1]] + suggestion[2:]\n", + " del entire_ids_copy[index]\n", + " index = index - 1\n", + " #print(suggestion)\n", + " suggestion_tokens = suggestion.split(\" \")\n", + " print(suggestion_tokens)\n", + " if '去掉前面' == 
suggestion_tokens[0]:\n", + " del entire_ids_copy[index - 1]\n", + " del suggestion_tokens[0]\n", + " del suggestion_tokens[0]\n", + " index = index - 1\n", + " elif '去掉后面' == suggestion_tokens[0]:\n", + " del entire_ids_copy[index + 1]\n", + " del suggestion_tokens[0]\n", + " del suggestion_tokens[0]\n", + " elif '去掉' == suggestion_tokens[0]:\n", + " del entire_ids_copy[index]\n", + " del suggestion_tokens[0]\n", + " del suggestion_tokens[0]\n", + " if '原位置改成' in suggestion_tokens:\n", + " del suggestion_tokens[0]\n", + " \n", + " len_suggest = len(suggestion_tokens)\n", + " if len_suggest == 1:\n", + " entire_ids_copy[index] = tokenizer.vocab[suggestion_tokens[0]]\n", + " elif len_suggest == 2:\n", + " entire_ids_copy.insert(index,tokenizer.vocab[suggestion_tokens[0]])\n", + " entire_ids_copy[index + 1] = tokenizer.vocab[suggestion_tokens[1]]\n", + " \n", + " for i in range(1,len(entire_ids_copy)-1):\n", + " word = tokenizer.ids_to_tokens[entire_ids_copy[i]]\n", + " if word[0:2] == \"##\":\n", + " new_text = new_text + word[2:]\n", + " else:\n", + " new_text = new_text + ' ' + tokenizer.ids_to_tokens[entire_ids_copy[i]]\n", + " return new_text\n", + "\n", + "print(modify_text(5))" + ] + }, + { + "cell_type": "code", + "execution_count": 283, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/20/2019 15:48:16 - INFO - examples.extract_features - tokens: [CLS] when i was little , friday ' s night was our family game night . after supper , we would play card games of all sort in the sitting room . as the kid , i loved to watch cartoons , but no matter how many times i asked to watching them , my parents would not to let me . they would say to us that playing card games would help my brain . still i unwilling to play the games for them sometimes . i didn ' t realize how right my parents are until i entered high school . 
the games my parents taught me where i was a child turned out to be very useful later in my life . [SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['[CLS]', 'when', 'i', 'was', 'little', ',', 'friday', \"'\", 's', 'night', 'was', 'our', 'family', 'game', 'night', '.', 'after', 'supper', ',', 'we', 'would', 'play', 'card', 'games', 'of', 'all', 'sort', 'in', 'the', 'sitting', 'room', '.', 'as', 'the', 'kid', ',', 'i', 'loved', 'to', 'watch', 'cartoons', ',', 'but', 'no', 'matter', 'how', 'many', 'times', 'i', 'asked', 'to', 'watching', 'them', ',', 'my', 'parents', 'would', 'not', 'to', 'let', 'me', '.', 'they', 'would', 'say', 'to', 'us', 'that', 'playing', 'card', 'games', 'would', 'help', 'my', 'brain', '.', 'still', 'i', 'unwilling', 'to', 'play', 'the', 'games', 'for', 'them', 'sometimes', '.', 'i', 'didn', \"'\", 't', 'realize', 'how', 'right', 'my', 'parents', 'are', 'until', 'i', 'entered', 'high', 'school', '.', 'the', 'games', 'my', 'parents', 'taught', 'me', 'where', 'i', 'was', 'a', 'child', 'turned', 'out', 'to', 'be', 'very', 'useful', 'later', 'in', 'my', 'life', '.', '[SEP]']\n", + "********************************************************************\n", + " 0 | [CLS] \t 2 | . 1 | the 1 | ) 1 | \" 1 | , \n", + " 97 | when \t* 97 | when 2 | since 1 | until 0 | while 0 | before \n", + " 99 | i \t* 99 | i 0 | she 0 | he 0 | we 0 | me \n", + " 100 | was \t*100 | was 0 | were 0 | got 0 | turned 0 | is \n", + " 11 | little \t 19 | younger * 11 | little 8 | eight 7 | young 7 | twelve \n", + " 51 | , \t* 51 | , 24 | . 
4 | and 1 | of 1 | the \n", + " 0 | friday \t 33 | valentine 16 | mother 7 | children 7 | father 5 | grandma \n", + " 100 | ' \t*100 | ' 0 | ` 0 | ′ 0 | \" 0 | * \n", + " 100 | s \t*100 | s 0 | til 0 | n 0 | d 0 | round \n", + " 39 | night \t* 39 | night 16 | dinner 6 | eve 5 | day 5 | supper \n", + " 90 | was \t* 90 | was 8 | became 1 | is 0 | were 0 | , \n", + " 4 | our \t 79 | a 13 | the * 4 | our 1 | my 0 | their \n", + " 1 | family \t 59 | favorite 18 | first 3 | favourite 2 | only 1 | last \n", + " 3 | game \t 12 | dinner 7 | christmas 6 | fun 5 | day * 3 | game \n", + " 81 | night \t* 81 | night 13 | day 1 | dinner 1 | date 0 | nights \n", + " 97 | . \t* 97 | . 2 | and 0 | ; 0 | , 0 | ! \n", + " 80 | after \t* 80 | after 6 | during 4 | at 4 | over 3 | before \n", + " 1 | supper \t 68 | school 10 | dinner 9 | that 2 | midnight 1 | breakfast \n", + " 100 | , \t*100 | , 0 | ##time 0 | together 0 | time 0 | dinner \n", + " 98 | we \t* 98 | we 1 | i 0 | they 0 | everyone 0 | people \n", + " 64 | would \t* 64 | would 21 | could 2 | will 2 | can 1 | did \n", + " 96 | play \t* 96 | play 2 | have 1 | watch 0 | enjoy 0 | played \n", + " 97 | card \t* 97 | card 1 | board 1 | cards 0 | video 0 | computer \n", + " 100 | games \t*100 | games 0 | game 0 | ##games 0 | matches 0 | sports \n", + " 99 | of \t* 99 | of 0 | in 0 | with 0 | and 0 | , \n", + " 0 | all \t 85 | some 6 | any 3 | a 3 | every 1 | this \n", + " 2 | sort \t 41 | kinds 34 | types 5 | sorts 4 | sizes 3 | kind \n", + " 98 | in \t* 98 | in 0 | around 0 | inside 0 | at 0 | from \n", + " 73 | the \t* 73 | the 23 | our 1 | my 1 | a 1 | their \n", + " 0 | sitting \t 56 | family 17 | dining 14 | living 1 | back 1 | same \n", + " 99 | room \t* 99 | room 0 | area 0 | rooms 0 | hall 0 | areas \n", + " 99 | . \t* 99 | . 1 | and 0 | ; 0 | , 0 | ... 
\n", + " 48 | as \t* 48 | as 29 | like 8 | being 3 | unlike 3 | for \n", + " 0 | the \t 100 | a 0 | another 0 | an * 0 | the 0 | one \n", + " 8 | kid \t 43 | child 22 | youngest * 8 | kid 4 | baby 3 | oldest \n", + " 63 | , \t* 63 | , 5 | i 2 | . 1 | myself 1 | and \n", + " 99 | i \t* 99 | i 0 | we 0 | he 0 | she 0 | me \n", + " 15 | loved \t 36 | wanted 21 | used * 15 | loved 11 | liked 4 | tried \n", + " 100 | to \t*100 | to 0 | and 0 | playing 0 | watching 0 | going \n", + " 99 | watch \t* 99 | watch 1 | see 0 | play 0 | watching 0 | watched \n", + " 0 | cartoons \t 52 | them 41 | games 1 | movies 1 | cards 1 | it \n", + " 0 | , \t 81 | , 19 | . 0 | ; 0 | - 0 | ... \n", + " 44 | but \t 47 | and * 44 | but 6 | so 1 | yet 1 | because \n", + " 100 | no \t*100 | no 0 | little 0 | the 0 | zero 0 | not \n", + " 100 | matter \t*100 | matter 0 | to 0 | telling 0 | idea 0 | , \n", + " 100 | how \t*100 | how 0 | what 0 | however 0 | the 0 | where \n", + " 100 | many \t*100 | many 0 | often 0 | few 0 | several 0 | numerous \n", + " 85 | times \t* 85 | times 3 | questions 1 | minutes 1 | hours 1 | people \n", + " 82 | i \t* 82 | i 10 | we 1 | was 1 | being 1 | he \n", + " 0 | asked \t 37 | took 19 | went 13 | admitted 6 | got 4 | confessed \n", + " 5 | to \t 23 | for 13 | about * 5 | to 4 | myself 3 | me \n", + " 0 | watching \t 64 | play 30 | watch 3 | see 1 | join 0 | read \n", + " 57 | them \t* 57 | them 23 | cartoons 5 | it 2 | movies 2 | games \n", + " 0 | , \t 100 | , 0 | . 0 | ... 
0 | again 0 | even \n", + " 99 | my \t* 99 | my 0 | the 0 | her 0 | his 0 | our \n", + " 98 | parents \t* 98 | parents 0 | family 0 | father 0 | mother 0 | grandparents\n", + " 0 | would \t 47 | decided 18 | chose 8 | tried 4 | seemed 4 | knew \n", + " 0 | not \t 70 | refuse 10 | have 5 | agree 2 | want 1 | promise \n", + " 0 | to \t 45 | always 17 | have 11 | really 9 | even 3 | ever \n", + " 28 | let \t* 28 | let 17 | believe 11 | tell 11 | bother 5 | stop \n", + " 91 | me \t* 91 | me 6 | go 1 | up 1 | on 0 | it \n", + " 97 | . \t* 97 | . 1 | and 1 | ; 0 | because 0 | , \n", + " 94 | they \t* 94 | they 1 | he 1 | she 1 | dad 1 | i \n", + " 97 | would \t* 97 | would 1 | did 1 | always 1 | could 0 | might \n", + " 27 | say \t* 27 | say 21 | prove 16 | explain 4 | swear 3 | lie \n", + " 65 | to \t* 65 | to 6 | about 2 | for 2 | that 2 | in \n", + " 0 | us \t 99 | me 0 | themselves 0 | myself * 0 | us 0 | him \n", + " 94 | that \t* 94 | that 3 | how 1 | if 1 | , 0 | maybe \n", + " 89 | playing \t* 89 | playing 4 | the 4 | watching 0 | doing 0 | their \n", + " 46 | card \t* 46 | card 40 | the 4 | these 3 | those 1 | cards \n", + " 99 | games \t* 99 | games 0 | game 0 | together 0 | tricks 0 | again \n", + " 68 | would \t* 68 | would 14 | could 4 | might 4 | will 4 | did \n", + " 5 | help \t 23 | change 6 | use * 5 | help 3 | drain 3 | control \n", + " 61 | my \t* 61 | my 22 | the 9 | our 3 | your 3 | their \n", + " 1 | brain \t 15 | life 5 | family 4 | dad 3 | future 3 | parents \n", + " 57 | . \t* 57 | . 16 | and 14 | , 9 | but 1 | ; \n", + " 0 | still \t 35 | am 26 | was 8 | but 7 | is 5 | and \n", + " 6 | i \t 60 | , * 6 | i 3 | . 3 | too 3 | ... 
\n", + " 0 | unwilling \t 8 | want 8 | used 8 | have 8 | wanted 7 | had \n", + " 48 | to \t 50 | ##ly * 48 | to 0 | always 0 | t 0 | ##tly \n", + " 28 | play \t* 28 | play 6 | do 5 | make 1 | stop 1 | keep \n", + " 5 | the \t 82 | card * 5 | the 1 | these 1 | cards 1 | those \n", + " 59 | games \t* 59 | games 37 | game 1 | cards 0 | piano 0 | kids \n", + " 1 | for \t 92 | with * 1 | for 1 | in 1 | against 1 | without \n", + " 22 | them \t 51 | myself * 22 | them 9 | fun 1 | hours 1 | real \n", + " 2 | sometimes \t 16 | anyway 12 | anymore 11 | too 10 | all 4 | though \n", + " 96 | . \t* 96 | . 1 | because 1 | and 1 | ; 0 | , \n", + " 99 | i \t* 99 | i 0 | we 0 | they 0 | you 0 | people \n", + " 99 | didn \t* 99 | didn 0 | wouldn 0 | don 0 | couldn 0 | did \n", + " 100 | ' \t*100 | ' 0 | ` 0 | \" 0 | , 0 | ′ \n", + " 100 | t \t*100 | t 0 | m 0 | s 0 | d 0 | no \n", + " 45 | realize \t 46 | know * 45 | realize 3 | understand 3 | realise 2 | see \n", + " 100 | how \t*100 | how 0 | what 0 | the 0 | it 0 | however \n", + " 0 | right \t 6 | strict 5 | powerful 4 | wonderful 4 | smart 4 | helpful \n", + " 97 | my \t* 97 | my 1 | our 0 | the 0 | your 0 | their \n", + " 29 | parents \t* 29 | parents 6 | thoughts 4 | words 2 | people 2 | kids \n", + " 1 | are \t 97 | were * 1 | are 0 | thought 0 | felt 0 | was \n", + " 87 | until \t* 87 | until 9 | when 2 | before 1 | till 0 | once \n", + " 100 | i \t*100 | i 0 | we 0 | they 0 | he 0 | me \n", + " 13 | entered \t 54 | graduated * 13 | entered 9 | finished 7 | started 6 | left \n", + " 51 | high \t* 51 | high 20 | elementary 15 | middle 8 | grade 1 | primary \n", + " 100 | school \t*100 | school 0 | schools 0 | society 0 | college 0 | class \n", + " 81 | . \t* 81 | . 
14 | and 2 | but 1 | , 1 | ; \n", + " 92 | the \t* 92 | the 5 | card 1 | those 0 | playing 0 | these \n", + " 43 | games \t* 43 | games 30 | game 6 | lessons 4 | rules 1 | math \n", + " 100 | my \t*100 | my 0 | our 0 | his 0 | me 0 | that \n", + " 53 | parents \t* 53 | parents 15 | father 13 | mother 5 | dad 4 | grandparents\n", + " 56 | taught \t* 56 | taught 20 | showed 12 | played 8 | gave 1 | told \n", + " 100 | me \t*100 | me 0 | us 0 | i 0 | him 0 | my \n", + " 0 | where \t 96 | when 2 | since 2 | while 0 | as 0 | until \n", + " 99 | i \t* 99 | i 0 | me 0 | he 0 | she 0 | my \n", + " 99 | was \t* 99 | was 0 | were 0 | as 0 | became 0 | had \n", + " 100 | a \t*100 | a 0 | the 0 | and 0 | one 0 | still \n", + " 22 | child \t 51 | kid * 22 | child 7 | boy 4 | teenager 4 | freshman \n", + " 97 | turned \t* 97 | turned 1 | turn 1 | came 0 | grew 0 | turning \n", + " 100 | out \t*100 | out 0 | into 0 | on 0 | up 0 | proving \n", + " 100 | to \t*100 | to 0 | into 0 | and 0 | not 0 | would \n", + " 94 | be \t* 94 | be 3 | become 3 | prove 0 | get 0 | seem \n", + " 69 | very \t* 69 | very 6 | extremely 5 | quite 3 | more 3 | really \n", + " 7 | useful \t 19 | important 14 | different 9 | helpful * 7 | useful 6 | influential \n", + " 6 | later \t 46 | things 17 | early * 6 | later 3 | lessons 3 | times \n", + " 100 | in \t*100 | in 0 | on 0 | during 0 | into 0 | than \n", + " 100 | my \t*100 | my 0 | our 0 | his 0 | their 0 | the \n", + " 99 | life \t* 99 | life 1 | career 0 | childhood 0 | education 0 | lives \n", + " 100 | . \t*100 | . 0 | ; 0 | ! 0 | ? 0 | ... 
\n", + " 0 | [SEP] \t 25 | \" 3 | for 3 | now 3 | and 2 | so \n", + "\n", + "*******************************************************************************************************************\n", + "1\n", + "0.0\n", + "WRB\n", + "\u001b[38;5;15m\u001b[48;5;0mwhen \u001b[0m\n", + "*******************************************************************************************************************\n", + "2\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "3\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "4\n", + "0.4996413875309904\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mlittle\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "5\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "6\n", + "5.037531860577574\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mfriday\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "7\n", + "0.0\n", + "POS\n", + "\u001b[38;5;15m\u001b[48;5;0m' \u001b[0m\n", + "*******************************************************************************************************************\n", + "8\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0ms \u001b[0m\n", + "*******************************************************************************************************************\n", + "9\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mnight 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "10\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "11\n", + "2.9288282257051295\n", + "PRP$\n", + "\u001b[38;5;226m\u001b[48;5;0mour\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "12\n", + "3.944041330267972\n", + "NN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[38;5;226m\u001b[48;5;0mfamily\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "13\n", + "1.2859363936756965\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mgame\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "14\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mnight \u001b[0m\n", + "*******************************************************************************************************************\n", + "15\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "16\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mafter \u001b[0m\n", + "*******************************************************************************************************************\n", + "17\n", + "3.864973993379616\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0msupper\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "18\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "19\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mwe \u001b[0m\n", + "*******************************************************************************************************************\n", + "20\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "21\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mplay \u001b[0m\n", + "*******************************************************************************************************************\n", + "22\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mcard\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "23\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "24\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mof \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "25\n", + "6.181150402503261\n", + "DT\n", + "\u001b[38;5;226m\u001b[48;5;0mall\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "26\n", + "3.2442493513478983\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0msort\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "27\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0min \u001b[0m\n", + "*******************************************************************************************************************\n", + "28\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\n", + "*******************************************************************************************************************\n", + "29\n", + "5.1759204333264215\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0msitting\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "30\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mroom \u001b[0m\n", + "*******************************************************************************************************************\n", + "31\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "32\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mas \u001b[0m\n", + "*******************************************************************************************************************\n", + "33\n", + "11.548374205660924\n", + "DT\n", + "\u001b[38;5;196m\u001b[48;5;0mthe\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/a \u001b[0m\n", + "*******************************************************************************************************************\n", + "34\n", + "1.7249087614151182\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mkid\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "35\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + "*******************************************************************************************************************\n", + "36\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "37\n", + "0.9132128054919955\n", + "VBD\n", + "\u001b[38;5;226m\u001b[48;5;0mloved\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "38\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "39\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mwatch \u001b[0m\n", + "*******************************************************************************************************************\n", + "40\n", + "6.495473375871382\n", + "NNS\n", + 
"\u001b[38;5;226m\u001b[48;5;0mcartoons\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "41\n", + "15.452660592340097\n", + "VB\n", + "\u001b[38;5;196m\u001b[48;5;0m,\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/, \u001b[0m\n", + "*******************************************************************************************************************\n", + "42\n", + "0.053008093757376584\n", + "CC\n", + "\u001b[38;5;226m\u001b[48;5;0mbut\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "43\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mno \u001b[0m\n", + "*******************************************************************************************************************\n", + "44\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mmatter \u001b[0m\n", + "*******************************************************************************************************************\n", + "45\n", + "0.0\n", + "WRB\n", + "\u001b[38;5;15m\u001b[48;5;0mhow \u001b[0m\n", + "*******************************************************************************************************************\n", + "46\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mmany\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "47\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mtimes \u001b[0m\n", + "*******************************************************************************************************************\n", + "48\n", + "0.0\n", + "VBP\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "49\n", + "10.646676048338493\n", + "VBN\n", + "\u001b[38;5;196m\u001b[48;5;0masked\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/was used \u001b[0m\n", + "*******************************************************************************************************************\n", + "50\n", + "1.5529499099577042\n", + "TO\n", + "\u001b[38;5;226m\u001b[48;5;0mto\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "51\n", + "8.95932484381705\n", + "VBG\n", + "\u001b[38;5;214m\u001b[48;5;0mwatching\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/watch \u001b[0m\n", + "*******************************************************************************************************************\n", + "52\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mthem \u001b[0m\n", + "*******************************************************************************************************************\n", + "53\n", + "18.383069999315744\n", + "VB\n", + "\u001b[38;5;196m\u001b[48;5;0m,\u001b[0m\u001b[38;5;2m\u001b[48;5;0m/, \u001b[0m\n", + "*******************************************************************************************************************\n", + "54\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "55\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "56\n", + "5.4762173007041035\n", + "MD\n", + "检查点1*****************************************************\n", + "\u001b[38;5;226m\u001b[48;5;0mwould\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "57\n", + "5.3354081649787535\n", + "RB\n", + "\u001b[38;5;214m\u001b[48;5;0mnot\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/refuse \u001b[0m\n", + "*******************************************************************************************************************\n", + "58\n", + "5.981459151215268\n", + "TO\n", + "\u001b[38;5;214m\u001b[48;5;0mto\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/去掉 to \u001b[0m\n", + "*******************************************************************************************************************\n", + "59\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mlet \u001b[0m\n", + "*******************************************************************************************************************\n", + "60\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mme \u001b[0m\n", + "*******************************************************************************************************************\n", + "61\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "62\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mthey \u001b[0m\n", + "*******************************************************************************************************************\n", + "63\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "64\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0msay \u001b[0m\n", + "*******************************************************************************************************************\n", + "65\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "66\n", + "6.581833896065917\n", + "PRP\n", + "\u001b[38;5;214m\u001b[48;5;0mus\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/me \u001b[0m\n", + "*******************************************************************************************************************\n", + "67\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mthat \u001b[0m\n", + "*******************************************************************************************************************\n", + "68\n", + "0.0\n", + "VBG\n", + "\u001b[38;5;15m\u001b[48;5;0mplaying \u001b[0m\n", + "*******************************************************************************************************************\n", + "69\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mcard \u001b[0m\n", + "*******************************************************************************************************************\n", + "70\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "71\n", + "0.0\n", + "MD\n", + "\u001b[38;5;15m\u001b[48;5;0mwould \u001b[0m\n", + "*******************************************************************************************************************\n", + "72\n", + "1.4588194328350998\n", + "VB\n", + "\u001b[38;5;226m\u001b[48;5;0mhelp\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "73\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "74\n", + "3.173226871228209\n", + "NN\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[38;5;226m\u001b[48;5;0mbrain\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "75\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "*******************************************************************************************************************\n", + "76\n", + "5.90809919263306\n", + "RB\n", + "\u001b[38;5;214m\u001b[48;5;0mstill\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/still , \u001b[0m\n", + "*******************************************************************************************************************\n", + "77\n", + "2.2313680234481628\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mi\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "78\n", + "7.241924210620825\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0munwilling\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "79\n", + "0.033503519227476186\n", + "TO\n", + "\u001b[38;5;226m\u001b[48;5;0mto\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "80\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mplay \u001b[0m\n", + "*******************************************************************************************************************\n", + "81\n", + "2.7297515213736863\n", + "DT\n", + "\u001b[38;5;226m\u001b[48;5;0mthe\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "82\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "83\n", + "4.231741889869705\n", + "IN\n", + "\u001b[38;5;214m\u001b[48;5;0mfor\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/with \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "84\n", + "0.8675317652760016\n", + "PRP\n", + "\u001b[38;5;226m\u001b[48;5;0mthem\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "85\n", + "2.1558967133083646\n", + "RB\n", + "\u001b[38;5;226m\u001b[48;5;0msometimes\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "86\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "*******************************************************************************************************************\n", + "87\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mi\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "88\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mdidn \u001b[0m\n", + "*******************************************************************************************************************\n", + "89\n", + "0.0\n", + "POS\n", + "\u001b[38;5;15m\u001b[48;5;0m' \u001b[0m\n", + "*******************************************************************************************************************\n", + "90\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mt \u001b[0m\n", + "*******************************************************************************************************************\n", + "91\n", + "0.011093191090367771\n", + "VB\n", + "\u001b[38;5;226m\u001b[48;5;0mrealize\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "92\n", + "0.0\n", + "WRB\n", + 
"\u001b[38;5;15m\u001b[48;5;0mhow \u001b[0m\n", + "*******************************************************************************************************************\n", + "93\n", + "3.6692828920487384\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mright\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "94\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "95\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "96\n", + "4.758635578869137\n", + "VBP\n", + "\u001b[38;5;214m\u001b[48;5;0mare\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/were \u001b[0m\n", + "*******************************************************************************************************************\n", + "97\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0muntil \u001b[0m\n", + "*******************************************************************************************************************\n", + "98\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "99\n", + "1.4610567542265707\n", + "VBD\n", + "\u001b[38;5;226m\u001b[48;5;0mentered\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "100\n", + "0.0\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0mhigh\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "101\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mschool \u001b[0m\n", + "*******************************************************************************************************************\n", + "102\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. \u001b[0m\n", + "*******************************************************************************************************************\n", + "103\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0mthe \u001b[0m\n", + "*******************************************************************************************************************\n", + "104\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mgames \u001b[0m\n", + "*******************************************************************************************************************\n", + "105\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "106\n", + "0.0\n", + "NNS\n", + "\u001b[38;5;15m\u001b[48;5;0mparents \u001b[0m\n", + "*******************************************************************************************************************\n", + "107\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0mtaught \u001b[0m\n", + "*******************************************************************************************************************\n", + "108\n", + "0.0\n", + "PRP\n", + "\u001b[38;5;15m\u001b[48;5;0mme \u001b[0m\n", + "*******************************************************************************************************************\n", + "109\n", + "9.636217093727145\n", + "WRB\n", + "\u001b[38;5;214m\u001b[48;5;0mwhere\u001b[0m\u001b[38;5;6m\u001b[48;5;0m/when \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "110\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "*******************************************************************************************************************\n", + "111\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "*******************************************************************************************************************\n", + "112\n", + "0.0\n", + "DT\n", + "\u001b[38;5;15m\u001b[48;5;0ma \u001b[0m\n", + "*******************************************************************************************************************\n", + "113\n", + "0.8537064036270944\n", + "NN\n", + "\u001b[38;5;226m\u001b[48;5;0mchild\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "114\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mturned \u001b[0m\n", + "*******************************************************************************************************************\n", + "115\n", + "0.0\n", + "RP\n", + "\u001b[38;5;15m\u001b[48;5;0mout \u001b[0m\n", + "*******************************************************************************************************************\n", + "116\n", + "0.0\n", + "TO\n", + "\u001b[38;5;15m\u001b[48;5;0mto \u001b[0m\n", + "*******************************************************************************************************************\n", + "117\n", + "0.0\n", + "VB\n", + "\u001b[38;5;15m\u001b[48;5;0mbe \u001b[0m\n", + "*******************************************************************************************************************\n", + "118\n", + "0.0\n", + "RB\n", + "\u001b[38;5;15m\u001b[48;5;0mvery \u001b[0m\n", + 
"*******************************************************************************************************************\n", + "119\n", + "0.9869002719874604\n", + "JJ\n", + "\u001b[38;5;226m\u001b[48;5;0museful\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "120\n", + "1.9653529850905183\n", + "RB\n", + "\u001b[38;5;226m\u001b[48;5;0mlater\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "*******************************************************************************************************************\n", + "121\n", + "0.0\n", + "IN\n", + "\u001b[38;5;15m\u001b[48;5;0min \u001b[0m\n", + "*******************************************************************************************************************\n", + "122\n", + "0.0\n", + "PRP$\n", + "\u001b[38;5;15m\u001b[48;5;0mmy \u001b[0m\n", + "*******************************************************************************************************************\n", + "123\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mlife \u001b[0m\n", + "*******************************************************************************************************************\n", + "124\n", + "0.0\n", + ".\n", + "\u001b[38;5;15m\u001b[48;5;0m. 
\u001b[0m\n", + "平均gap:1.4890399906268712\n", + "**********************************display_suggestions********************************************************\n", + "| suggestion : position in text\n", + "---------------------------------------------------------------------------------------\n", + "| a : 33\n", + "| , : 41\n", + "| was used : 49\n", + "| watch : 51\n", + "| , : 53\n", + "| refuse : 57\n", + "| 去掉 to : 58\n", + "| me : 66\n", + "| still , : 76\n", + "| with : 83\n", + "| were : 96\n", + "| when : 109\n", + "*************************************************************************************************************\n", + "建议的数量是 12\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 728\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 729\u001b[0;31m \u001b[0mident\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstdin_socket\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 730\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/jupyter_client/session.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, socket, mode, content, 
copy)\u001b[0m\n\u001b[1;32m 802\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 803\u001b[0;31m \u001b[0mmsg_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msocket\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_multipart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 804\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mzmq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mZMQError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/zmq/sugar/socket.py\u001b[0m in \u001b[0;36mrecv_multipart\u001b[0;34m(self, flags, copy, track)\u001b[0m\n\u001b[1;32m 466\u001b[0m \"\"\"\n\u001b[0;32m--> 467\u001b[0;31m \u001b[0mparts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrack\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrack\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 468\u001b[0m \u001b[0;31m# have first part already, only loop while more to receive\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket.Socket.recv\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mzmq/backend/cython/socket.pyx\u001b[0m in \u001b[0;36mzmq.backend.cython.socket._recv_copy\u001b[0;34m()\u001b[0m\n", + 
"\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/zmq/backend/cython/checkrc.pxd\u001b[0m in \u001b[0;36mzmq.backend.cython.checkrc._check_rc\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0manalyze_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_firstk_probs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0manalyse_and_modify_and_review\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36manalyse_and_modify_and_review\u001b[0;34m()\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Please input the position you want to modify:\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodify_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36mraw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 702\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_ident\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 703\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parent_header\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 704\u001b[0;31m \u001b[0mpassword\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 705\u001b[0m )\n\u001b[1;32m 706\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py\u001b[0m in \u001b[0;36m_input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 732\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;31m# re-raise KeyboardInterrupt, to truncate traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 734\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 736\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "import os\n", + "#text = [\"Last week I went to the theater. There are many person . Luckily I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. 
They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "#text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "#text = [\"me love yours.\"]\n", + "#text = [\"Mr. and Mrs.Zhang all work in our school. They live far from the school, and it takes them about a hour and a half to go to work every day. In their spare time, they are interesting in planting vegetables in their garden, that is on the rooftop of their house. They often get up earlier and water the vegetables together. They have also bought in some gardening tools.beside, they often get some useful informations from the internet. When summer came, they will invite their students pick the vegetables!\"]\n", + "text = [\"When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons,but no matter how many times I asked to watching them, my parents would not to let me. They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. I didn't realize how right my parents are until I entered high school. 
The games my parents taught me where I was a child turned out to be very useful later in my life.\"]\n", + "def analyse_and_modify_and_review():\n", + " global text\n", + " analyze_text(text, show_firstk_probs=200)\n", + " while len(suggestions)>0:\n", + " display_suggestion()\n", + " print('建议的数量是',len(suggestions))\n", + " if len(suggestions) == 0:\n", + " break\n", + " else:\n", + " index = input(\"Please input the position you want to modify:\")\n", + " index = int(index)\n", + " text[0] = modify_text(index)\n", + " analyze_text(text, show_firstk_probs=200)\n", + " \n", + "analyse_and_modify_and_review()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = [\"The trophy doesn't fit into the brown suitcase because the _ is too large.\"]\n", + "# text = [\"Mary beat John in the match because _ was very strong.\"]\n", + "features = convert_examples_to_features(convert_text_to_examples(text), tokenizer, print_info=False)\n", + "input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)\n", + "input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long).to(device)\n", + "mlm_logits, _ = model(input_ids, input_type_ids)\n", + "mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + "tokens = features[0].tokens\n", + "top_pairs = show_lm_probs(tokens, None, mlm_probs[0], firstk=100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = [\n", + " # same / different\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have the same hair color.\",\n", + " \"Tom has black hair. Mary has black hair. John has yellow hair. _ and Mary have different hair colors.\",\n", + " \"Tom has yellow hair. Mary has black hair. John has black hair. 
Mary and _ have the same hair color.\",\n", + " # because / although\n", + " \"John is taller/shorter than Mary because/although _ is older/younger.\",\n", + " \"The red ball is heavier/lighter than the blue ball because/although the _ ball is bigger/smaller.\",\n", + " \"Charles did a lot better/worse than his good friend Nancy on the test because/although _ had/hadn't studied so hard.\",\n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thought that he would arrive earlier than Susan, but/and indeed _ was the first to arrive.\",\n", + " # reverse\n", + " \"John came then Mary came. They left in reverse order. _ left then _ left.\",\n", + " \"John came after Mary. They left in reverse order. _ left after _ .\",\n", + " \"John came first, then came Mary. They left in reverse order: _ left first, then left _ .\",\n", + " # compare\n", + " \"Though John is tall, Tom is taller than John. So John is _ than Tom.\",\n", + " \"Tom is taller than John. So _ is shorter than _.\",\n", + " # WSC-style: before /after\n", + " \"Mary came before/after John. _ was late/early .\",\n", + " # yes / no\n", + " \"Was Tom taller than Susan? Yes, _ was taller.\",\n", + " # right / wrong, epistemic modality\n", + " \"John said the rain was about to stop. Mary said the rain would continue. Later the rain stopped. _ was wrong.\",\n", + " \n", + " \"The trophy doesn't fit into the brown suitcase because/although the _ is too small/large.\",\n", + " \"John thanked Mary because _ had given help to _ . \",\n", + " \"John felt vindicated/crushed when his longtime rival Mary revealed that _ was the winner of the competition.\",\n", + " \"John couldn't see the stage with Mary in front of him because _ is so short/tall.\",\n", + " \"Although they ran at about the same speed, John beat Sally because _ had such a bad start.\",\n", + " \"The fish ate the worm. The _ was hungry/tasty.\",\n", + " \n", + " \"John beat Mary. 
_ won the game/e winner.\",\n", + "]\n", + "text" + ] + }, + { + "cell_type": "code", + "execution_count": 1345, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}" + ] + }, + "execution_count": 1345, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config" + ] + }, + { + "cell_type": "code", + "execution_count": 1346, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_switched_label.json') as f:\n", + " examples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "with open('WSC_child_problem.json') as f:\n", + " cexamples = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " for s in ce['sentences']:\n", + " for a in s['answer0'] + s['answer1']:\n", + " a = a.lower()\n", + " if a not in tokenizer.vocab:\n", + " ce\n", + " print(a, 'not in vocab!!!')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for ce in cexamples:\n", + " if len(ce['sentences']) > 0:\n", + " e = examples[ce['index']]\n", + " assert ce['index'] == e['index']\n", + " e['score'] = all([s['score'] for s in ce['sentences']])\n", + " assert len(set([s['adjacent_ref'] for s in ce['sentences']])) == 1, 'adjcent_refs are different!'\n", + " e['adjacent_ref'] = ce['sentences'][0]['adjacent_ref']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + 
"source": [ + "from collections import defaultdict\n", + "\n", + "groups = defaultdict(list)\n", + "for e in examples:\n", + " if 'score' in e:\n", + " index = e['index']\n", + " if index < 252:\n", + " if index % 2 == 1:\n", + " index -= 1\n", + " elif index in [252, 253, 254]:\n", + " index = 252\n", + " else:\n", + " if index % 2 == 0:\n", + " index -= 1\n", + " groups[index].append(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(2, 'fit into:large/small', False),\n", + " (4, 'thank:receive/give', False),\n", + " (6, 'call:successful available', True),\n", + " (8, 'ask:repeat answer', False),\n", + " (10, 'zoom by:fast/slow', False),\n", + " (12, 'vindicated/crushed:be the winner', False),\n", + " (14, 'lift:weak heavy', False),\n", + " (16, 'crash through:[hard]/[soft]', False),\n", + " (18, '[block]:short/tall', False),\n", + " (20, 'down to:top/bottom', False),\n", + " (22, 'beat:good/bad', False),\n", + " (24, 'roll off:anchored level', False),\n", + " (26, 'above/below', False),\n", + " (28, 'better/worse:study hard', False),\n", + " (30, 'after/before:far away', False),\n", + " (32, 'be upset with:buy from not work/sell not work', True),\n", + " (34, '?yell at comfort:upset', False),\n", + " (36, 'above/below:moved first', False),\n", + " (38, 'although/because', False),\n", + " (40, 'bully:punish rescue', False),\n", + " (42, 'pour:empty/full', False),\n", + " (44, 'know:nosy indiscreet', False),\n", + " (46, 'explain:convince/understand', True),\n", + " (48, '?know tell:so/because', True),\n", + " (50, 'beat:younger/older', False),\n", + " (56, 'clog:cleaned removed', True),\n", + " (58, '?immediately follow:short delayed', False),\n", + " (60, '?between:see see around', True),\n", + " (64, 'but/and', False),\n", + " (66, 'clean:put in the trash put in the drawer', False),\n", + " (68, 'because/but', False),\n", + " (70, 'out of:handy lighter', False),\n", + " (72, 
'put:tall high', False),\n", + " (74, 'show:good famous', True),\n", + " (76, 'pay for:generous grateful', False),\n", + " (78, 'but', False),\n", + " (80, 'if', False),\n", + " (82, 'if', False),\n", + " (84, 'fool:get/lose', False),\n", + " (88, 'wait:impatient cautious', False),\n", + " (90, 'give birth:woman baby', True),\n", + " (92, '?stop normal/stop abnormal:strange', False),\n", + " (96, 'eat:hungry tasty', False),\n", + " (98, 'put ... into filled with ... :get in/get out', False),\n", + " (100, 'up:at the bottom/at the top', False),\n", + " (102, 'crash through:removed repaired', False),\n", + " (104, 'stab:taken to the police station taken to the hospital', False),\n", + " (106, 'hear ... humming and whistling:annoyed/annoying', True),\n", + " (108, 'see ... juggling watermelons:impressed/impressive', True),\n", + " (114, 'tell lies: truthful skeptical', True),\n", + " (130, 'but:disappointed', True),\n", + " (132, 'visit:invite come out/invite come in', True),\n", + " (134, 'take classes from:eager known to speak it fluently', False),\n", + " (138, 'cover:out gone', True),\n", + " (144, 'tuck:work sleep', True),\n", + " (150, 'influence:later/earlier', False),\n", + " (152, 'can not cut:thick small', False),\n", + " (154, 'attack:kill guard', False),\n", + " (156, 'attack:bold nervous', False),\n", + " (160, 'change:hard:easy', False),\n", + " (166, 'alive:is/was', False),\n", + " (168, 'infant:twelve years old twelve months old', False),\n", + " (170, 'better equipped and large:defeated/victorious', False),\n", + " (178, 'interview:persistent cooperative', False),\n", + " (186, 'be full of:minority/majority', False),\n", + " (188, 'like over:more/fewer', False),\n", + " (190, 'place on all:not enough/too many', True),\n", + " (192, 'stick:leave have', True),\n", + " (196, 'follow:admire/influence', True),\n", + " (198, 'fit through:wide/narrow', False),\n", + " (200, 'trade:dowdy/great', False),\n", + " (202, 'hire/hire oneself to:take care of', 
True),\n", + " (204, 'promise/order', False),\n", + " (208, 'mother:education place', True),\n", + " (210, 'knock:get an answer/answer', True),\n", + " (212, 'pay:receive/deliver', False),\n", + " (218, '?', False),\n", + " (220, 'say check:move take', False),\n", + " (222, '?', False),\n", + " (224, 'give a life:drive alone walk', False),\n", + " (226, 'pass the plate:full/hungry', False),\n", + " (228, 'pass:turn over turn next', False),\n", + " (232, 'stretch pat', True),\n", + " (234, 'accept share', False),\n", + " (236, 'speak:break silence break concentration', False),\n", + " (240, 'carry:leg ache leg dangle', True),\n", + " (242, 'carry:in arms in bassinet', False),\n", + " (244, 'hold:against chest against will', True),\n", + " (250, 'stop', False),\n", + " (252, 'even though/because/not', False),\n", + " (255, 'give:not hungry/hungry', False),\n", + " (259, 'ask for a favor:refuse/be refused`', False),\n", + " (261, 'cede:less popular/more popular', False),\n", + " (263, 'not pass although:see open/open', True),\n", + " (271, 'suspect regret', True)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def filter_dict(d, keys=['index', 'sentence', 'correct_answer', 'relational_word', 'is_associative', 'score']):\n", + " return {k: d[k] for k in d if k in keys}\n", + "\n", + "# ([[filter_dict(e) for e in eg] for eg in groups.values() if eg[0]['relational_word'] != 'none' and all([e['score'] for e in eg])])# / len([eg for eg in groups.values() if eg[0]['relational_word'] != 'none'])\n", + "[(index, eg[0]['relational_word'], all([e['score'] for e in eg])) for index, eg in groups.items() if eg[0]['relational_word'] != 'none']\n", + "# len([filter_dict(e) for e in examples if 'score' in e and not e['score'] and e['adjacent_ref']])\n", + "# for e in examples:\n", + "# if e['index'] % 2 == 0:\n", + "# print(e['sentence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + 
"scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "179" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(['because' in e['sentence'] for e in examples]) + \\\n", + "sum(['so ' in e['sentence'] for e in examples]) + \\\n", + "sum(['but ' in e['sentence'] for e in examples]) + \\\n", + "sum(['though' in e['sentence'] for e in examples])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "# with open('WSC_switched_label.json', 'w') as f:\n", + "# json.dump(examples, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "vis_attn_topk = 3\n", + "\n", + "def has_chinese_label(labels):\n", + " labels = [label.split('->')[0].strip() for label in labels]\n", + " r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1)\n", + " return 0 < r < 0.5 # r == 0 means empty query labels used in self attention\n", + "\n", + "def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'):\n", + " assert len(query_labels) == attn.size(0)\n", + " assert len(key_labels) == attn.size(1)\n", + "\n", + " ax1.set_xlim([-1, 1])\n", + " ax1.set_xticks([])\n", + " ax2 = ax1.twinx()\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " pos = range(nlabels)\n", + " \n", + " if 'self' in attn_name and col < ncols - 1:\n", + " query_labels = ['' for _ in query_labels]\n", + "\n", + " for ax, labels in [(ax1, key_labels), (ax2, query_labels)]:\n", + " ax.set_yticks(pos)\n", + " if has_chinese_label(labels):\n", + " ax.set_yticklabels(labels, fontproperties=zhfont)\n", + " else:\n", + " ax.set_yticklabels(labels)\n", + " ax.set_ylim([nlabels - 1, 0])\n", + " ax.tick_params(width=0, labelsize='xx-large')\n", + "\n", + " for spine in ax.spines.values():\n", + " spine.set_visible(False)\n", + "\n", + "# mask, attn = 
filter_attn(attn)\n", + " for qi in range(attn.size(0)):\n", + "# if not mask[qi]:\n", + "# continue\n", + "# for ki in range(attn.size(1)):\n", + " for ki in attn[qi].topk(vis_attn_topk)[1]:\n", + " a = attn[qi, ki]\n", + " ax1.plot((-1, 1), (ki, qi), color, alpha=a)\n", + "# print(attn.mean(dim=0).topk(5)[0])\n", + "# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy())\n", + "\n", + "def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None):\n", + " hypo, nheads, labels_dict = result_tuple\n", + " key_labels, query_labels = labels_dict[attn_name]\n", + " if heads is None:\n", + " heads = range(nheads)\n", + " else:\n", + " nheads = len(heads)\n", + " \n", + " stride = 2 if attn_name == 'dec_enc_attns' else 1\n", + " nlabels = max(len(key_labels), len(query_labels))\n", + " rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0))\n", + " \n", + " rows = nheads // ncols * stride\n", + " fig, axes = plt.subplots(rows, ncols)\n", + " \n", + " # for head in range(nheads):\n", + " for head_i, head in enumerate(heads):\n", + " row, col = head_i * stride // ncols, head_i * stride % ncols\n", + " ax1 = axes[row, col]\n", + " attn = hypo[attn_name][layer][head]\n", + " _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col)\n", + " if attn_name == 'dec_enc_attns':\n", + " col = col + 1\n", + " axes[row, col].axis('off') # next subfig acts as blank place holder\n", + " # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20)\n", + " plt.show() \n", + " \n", + "ncols = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'BertSelfAttention' object has no attribute 'attention_probs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + 
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mattn_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'enc_self_attns'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mhypo\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbert\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_hidden_layers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mkey_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mquery_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokens\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mlabels_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mattn_name\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mkey_labels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mquery_labels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mresult_tuple\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mhypo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_attention_heads\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 516\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodules\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 517\u001b[0m raise AttributeError(\"'{}' object has no attribute '{}'\".format(\n\u001b[0;32m--> 518\u001b[0;31m type(self).__name__, name))\n\u001b[0m\u001b[1;32m 519\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'BertSelfAttention' object has no attribute 'attention_probs'" + ] + } + ], + "source": [ + "attn_name = 'enc_self_attns'\n", + "hypo = {attn_name: [model.bert.encoder.layer[i].attention.self.attention_probs[0] for i in range(config.num_hidden_layers)]}\n", + "key_labels = query_labels = tokens\n", + "labels_dict = {attn_name: (key_labels, query_labels)}\n", + "result_tuple = (hypo, config.num_attention_heads, labels_dict)\n", + "plot_layer_attn(result_tuple, attn_name=attn_name, layer=10, heads=None)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/likunlin_final.ipynb b/likunlin_final.ipynb new file mode 100644 index 00000000000000..6bbf623bf3d641 --- /dev/null +++ b/likunlin_final.ipynb @@ -0,0 +1,2542 
@@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten.\n", + "Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead.\n", + "Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable.\n", + "Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable.\n", + "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "import nltk\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "05/27/2019 11:51:05 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased/vocab.txt\n", + "05/27/2019 11:51:05 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased\n", + "05/27/2019 11:51:05 - INFO - pytorch_pretrained_bert.modeling - Model config {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.1,\n", + " \"hidden_size\": 768,\n", + " \"initializer_range\": 0.02,\n", + " 
\"intermediate_size\": 3072,\n", + " \"max_position_embeddings\": 512,\n", + " \"num_attention_heads\": 12,\n", + " \"num_hidden_layers\": 12,\n", + " \"type_vocab_size\": 2,\n", + " \"vocab_size\": 30522\n", + "}\n", + "\n", + "05/27/2019 11:51:08 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n" + ] + } + ], + "source": [ + "class Args:\n", + " def __init__(self):\n", + " pass\n", + " \n", + "args = Args()\n", + "args.no_cuda = True #不用GPU\n", + "\n", + "CONFIG_NAME = 'bert_config.json'\n", + "BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased'\n", + "config_file = os.path.join(BERT_DIR, CONFIG_NAME)\n", + "config = BertConfig.from_json_file(config_file)\n", + "\n", + "try:\n", + " tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))\n", + "except:\n", + " tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "#tokenizer.tokenize = nltk.word_tokenize\n", + "\n", + "model = BertForMaskedLM.from_pretrained(BERT_DIR)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and not args.no_cuda else \"cpu\")\n", + "_ = model.to(device)\n", + "_ = model.eval()\n", + "\n", + "input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = [],[],[],[],[],[]\n", + "suggestions = {} #外部变量,需要传到前端\n", + "original_tokens = [] #外部变量,需要传到前端" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt\r\n" + ] + } + ], + "source": [ + "ls /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BertForPreTraining:\n", + 
"Outputs:\n", + " if `masked_lm_labels` and `next_sentence_label` are not `None`:\n", + " Outputs the total_loss which is the sum of the masked language modeling loss and the next\n", + " sentence classification loss.\n", + " if `masked_lm_labels` or `next_sentence_label` is `None`:\n", + " Outputs a tuple comprising\n", + " - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and\n", + " - the next sentence classification logits of shape [batch_size, 2]." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from_pretrained:\n", + "Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.\n", + "Download and cache the pre-trained model file if needed." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import re\n", + "def convert_text_to_examples(text): \n", + " '''功能:\n", + " 把输入的文本变成一个实例,一个实例中包含text_a,text_b(text_b用于是否为上下句的任务,该任务不使用此功能)\n", + " 输入:\n", + " text:一个列表结构,列表中包含原始文本字符串,由于仅完成mlm任务,所以text列表中仅包含一个字符串,就是待检查的字符串\n", + " 输出:\n", + " example:实例,其中包含:\n", + " unique_id:此任务仅用到0\n", + " text_a:text列表内的字符串\n", + " text_b:此任务下该变量为None\n", + " '''\n", + " examples = []\n", + " unique_id = 0\n", + " if True:\n", + " for line in text:\n", + " line = line.strip()\n", + " text_a = None\n", + " text_b = None\n", + " m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.'\n", + " \n", + " if m is None:\n", + " text_a = line\n", + " else:\n", + " text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine.\n", + " text_b = m.group(2) #匹配的第二句,比如I love you.\n", + " \n", + " examples.append(\n", + " InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))\n", + " unique_id += 1\n", + " return examples\n", + "#print(convert_text_to_examples(['I love you. 
The cat is so cute.'])[0].text_a)\n", + "\n", + "def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False):\n", + " '''功能:\n", + " 把实例变成一个特征列表\n", + " 输入:\n", + " examples:实例,convert_text_to_examples()函数的输出\n", + " tokenizer:BERT的tokenizer,用于将文本进行各种处理,它可以把一个text转变成tokens,把tokens变成每个token在词典中的编号以及逆运算\n", + " append_special_tokens:是否允许在生成的tokens中加入特殊符号,也就是[CLS]、[MASK]和[SEP],默认为True\n", + " replace_mask:不明\n", + " print_info:不明\n", + " 输出:\n", + " features:每一个feature包含:\n", + " unique_id:编号,目前实现的功能features里面仅有一个feature\n", + " tokens=tokens,tokens:是形如['i','love','you','.']的一个列表\n", + " input_ids=input_ids:字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask:一堆1\n", + " input_type_ids=input_type_ids)):对text_a,text_b的区分,用于上下句任务,对于本任务,该参数为一个列表,其中包含token长度个的0\n", + " '''\n", + " features = []\n", + " for (ex_index, example) in enumerate(examples):\n", + " tokens_a = tokenizer.tokenize(example.text_a) #tokenize的作用是把\"i love you.\"变成['i','love','you','.']\n", + " tokens_b = None\n", + " if example.text_b:\n", + " tokens_b = tokenizer.tokenize(example.text_b)\n", + "\n", + " tokens = []\n", + " input_type_ids = [] #segment embedding\n", + " if append_special_tokens: #输入参数中默认为true\n", + " tokens.append(\"[CLS]\")\n", + " input_type_ids.append(0)\n", + " for token in tokens_a:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(0)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(0)\n", + "\n", + " if tokens_b:\n", + " for token in tokens_b:\n", + " if replace_mask and token == '_': # XD\n", + " token = \"[MASK]\"\n", + " tokens.append(token)\n", + " input_type_ids.append(1)\n", + " if append_special_tokens:\n", + " tokens.append(\"[SEP]\")\n", + " input_type_ids.append(1)\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号\n", + " input_mask = [1] * len(input_ids) 
\n", + " \n", + " if ex_index < 5:\n", + "# logger.info(\"*** Example ***\")\n", + "# logger.info(\"unique_id: %s\" % (example.unique_id))\n", + " logger.info(\"tokens: %s\" % \" \".join([str(x) for x in tokens]))\n", + "# logger.info(\"input_ids: %s\" % \" \".join([str(x) for x in input_ids]))\n", + "# logger.info(\"input_mask: %s\" % \" \".join([str(x) for x in input_mask]))\n", + "# logger.info(\n", + "# \"input_type_ids: %s\" % \" \".join([str(x) for x in input_type_ids]))\n", + " \n", + " features.append(\n", + " InputFeatures(\n", + " unique_id=example.unique_id,#编号,目前实现的功能features里面仅有一个feature\n", + " tokens=tokens,#形如['i','love','you','.']的一个列表\n", + " input_ids=input_ids,#字符串中的每个单词在词典中的index序列\n", + " input_mask=input_mask, #一堆1\n", + " input_type_ids=input_type_ids)) #第0类和第1类,对text_a,text_b的区分,本代码中全都是零\n", + " return features \n", + "\n", + "def copy_and_mask_feature(feature, step, masked_tokens=None): \n", + " '''\n", + " 功能:\n", + " 输入feature生成训练的批次数以及mask好的训练素材\n", + " 输入:\n", + " feature:convert_examples_to_features函数的输出\n", + " step:两个[mask]位置的步长\n", + " masked_tokens:默认为None,在程序中没有使用\n", + " '''\n", + " import copy\n", + " tokens = feature.tokens\n", + " len_token = len(tokens)\n", + " if len_token 0\n", + " masked_feature_copies = []\n", + " for i in batches: #用[mask]依次掩盖每一个位置\n", + " feature_copy = copy.deepcopy(feature)\n", + " masked_pos = i\n", + " while masked_pos < len_token:\n", + " feature_copy.input_ids[masked_pos] = tokenizer.vocab[\"[MASK]\"]\n", + " masked_pos = masked_pos + step\n", + " masked_feature_copies.append(feature_copy)\n", + " return masked_feature_copies, batches\n", + "\n", + "#masked_feature_copies, batches = copy_and_mask_feature(features[0],3)\n", + "#print(masked_feature_copies[0].input_ids) #结果[101, 1045, 2293, 103, 102]\n", + "#print(batches) #结果是一个range(0,5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "analyzed_cache = {}\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG\n", + "#print (lemma('gave'))\n", + "#print (lexeme('production'))\n", + "#print (conjugate(verb='give',tense=PRESENT,number=SG))\n", + "def process_text(text): \n", + " '''\n", + " 功能:\n", + " 处理输入文本,将文本按句子分成若干token,得出原来text中index位置的单词在x句子的y位置,还得出各个句子类别码\n", + " 输入:\n", + " text:文本字符串,注意区别\n", + " 输出:\n", + " input_ids_sen:二维列表,第一维列表的元素是每个句子的input_ids列表\n", + " input_type_ids_sen:二维列表,第一维列表的元素是每个句子的input_type_ids列表\n", + " in_sentence:通过这个二维数组可以很方便的通过在完整text中的下标找到这个下标所在的句子和在句子中的下标\n", + " sentences:字符串列表,列表中每一个元素是一个句子字符串\n", + " entire_ids:整个text的input_ids\n", + " entire_type_ids:整个text的input_type_ids\n", + " '''\n", + " token =[]\n", + " entire_type_ids = []\n", + " token0 = tokenizer.tokenize(text)\n", + " token.append('[CLS]')\n", + " entire_type_ids.append(0)\n", + " for i in token0:\n", + " token.append(i)\n", + " entire_type_ids.append(0)\n", + " token.append('[SEP]')\n", + " entire_type_ids.append(0)\n", + " \n", + " entire_ids = tokenizer.convert_tokens_to_ids(token)\n", + " in_sentence = [[0,0]] \n", + " sentence_n = 0\n", + " index = 1\n", + " for i in range(1,len(token)-1):\n", + " in_sentence.append([sentence_n,index]) #每个token中的词在所在句中的位置表示出来,以及该位置在哪一句中\n", + " index = index + 1 #比如,位置i这个词在第sentence句的index位置上\n", + " if token[i] == '.':\n", + " sentence_n = sentence_n + 1\n", + " index = 1\n", + " sentences = text.split(\".\")\n", + " \n", + " sen_token = []\n", + " input_ids_sen = []\n", + " input_type_ids_sen = []\n", + " for i,sentence in enumerate(sentences):\n", + " sentence = sentence + '.'\n", + " sentences[i] = sentences[i] + '.'\n", + " token = []\n", + " input_type_ids = []\n", + " tokens = tokenizer.tokenize(sentence)\n", + " token.append('[CLS]')\n", + " input_type_ids.append(0) \n", + " for i in tokens:\n", + " token.append(i)\n", + " input_type_ids.append(0) 
\n", + " token.append('[SEP]') \n", + " input_type_ids.append(0)\n", + " input_ids_sen.append(tokenizer.convert_tokens_to_ids(token))\n", + " input_type_ids_sen.append(input_type_ids)\n", + " return input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_word(index):\n", + " '''\n", + " 输入:\n", + " index:在完整text中的位置\n", + " 输出\n", + " word:该位置上的单词\n", + " '''\n", + " word_id = entire_ids[index]\n", + " word = tokenizer.ids_to_tokens[word_id]\n", + " return word\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import copy\n", + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "def give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold):\n", + " '''\n", + " 功能:\n", + " 给出指定文本指定位置的推荐用词\n", + " 输入:\n", + " input_ids_:要分析的文本的input_ids\n", + " input_type_ids_:要分析的文本的的input_type_ids\n", + " id_in_sen:要分析的文本中[MASK]的位置下标,也就是需要给出建议用词的位置\n", + " alternative_word:推荐的备选词范围\n", + " threshold:阈值\n", + " 输出:\n", + " suggestion:推荐\n", + " need:推荐的是否是备选词中的词\n", + " suggestion_prob:推荐词填在id_in_sen位置的概率\n", + " top_of_alternative:备选词中最值得推荐的词\n", + " '''\n", + " input_ids = copy.deepcopy(input_ids_)\n", + " input_type_ids = copy.deepcopy(input_type_ids_)\n", + " word0 = input_ids[id_in_sen]\n", + " word0 = tokenizer.ids_to_tokens[word0]\n", + " list_word_id = []\n", + " \n", + " input_ids[id_in_sen] = tokenizer.vocab[\"[MASK]\"]\n", + " T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度\n", + " T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样\n", + " T_input_ids = T_input_ids.to(device) #拿去GPU\n", + " T_input_type_ids = T_input_type_ids.to(device)\n", + "\n", + " mlm_logits, 
_ = model(T_input_ids, T_input_type_ids)\n", + " mlm_probs = F.softmax(mlm_logits, dim=-1)\n", + " reduced_mlm_probs = mlm_probs[0][id_in_sen]\n", + "\n", + " top_ind = reduced_mlm_probs.argmax().item()\n", + " top_prob = reduced_mlm_probs.max().item() \n", + " \n", + " list_word = []\n", + " \n", + " top_of_alternative = None\n", + " if len(alternative_word)>0:\n", + " list_word_prob = {}\n", + " for word in alternative_word:\n", + " try:\n", + " list_word_id.append(tokenizer.vocab[word])\n", + " list_word.append(word)\n", + " except KeyError:\n", + " pass\n", + "\n", + " for word,word_id in zip(list_word,list_word_id):\n", + " list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)})\n", + " prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True)\n", + " \n", + " top_prob_word = prob_ord[0][1]\n", + " top_of_alternative = prob_ord[0][0]\n", + " gap = math.log(top_prob) - math.log(top_prob_word)\n", + " \n", + " if gap < threshold:\n", + " suggestion = prob_ord[0][0]\n", + " suggestion_prob = prob_ord[0][1]\n", + " need = 1\n", + " else:\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " suggestion_prob = top_prob\n", + " need = 0\n", + " #print(\"gap = \" + str(gap))\n", + " #print(prob_ord)\n", + " else:\n", + " suggestion = tokenizer.ids_to_tokens[top_ind]\n", + " suggestion_prob = top_prob\n", + " need = 0\n", + " \n", + " return suggestion,need,suggestion_prob,top_of_alternative \n", + "\n", + "#返回变量5\n", + "#suggestion -> 最值得推荐的词\n", + "#need -> 是否需要可选词中的一个\n", + "#suggestion_prob ->最值得推荐的词的概率\n", + "#top_of_alternative -> 可选词中最值得推荐的\n", + "#suggestion,need,suggestion_prob,top_of_alternative = give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from spacy.lemmatizer import Lemmatizer\n", + "from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES\n", + "from pattern.en 
import comparative, superlative\n", + "from pattern.en import suggest\n", + "from nltk.stem.lancaster import LancasterStemmer\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import SnowballStemmer\n", + "import enchant\n", + "d = enchant.Dict(\"en_US\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "def word_convert(word,new_word,Stemmer):\n", + " '''\n", + " 功能:\n", + " 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically\n", + " 输入:\n", + " word:需要变形的词\n", + " new_word:猜想的变形\n", + " 输出:\n", + " suggest_word:推荐的正确变形\n", + " '''\n", + " suggest_word = None\n", + " word_stem = Stemmer().stem(word)\n", + " suggest_ = new_word\n", + " \n", + " suggest_list = suggest(suggest_)\n", + "\n", + " if len(word) 0.95):# or word_[1] > 0.95 :\n", + " suggest_word = word_[0]\n", + " break \n", + " if word_[1] < 0.001:\n", + " break\n", + " stem_list = []\n", + " for stemmer in stemmers:\n", + " suggest_stem = stemmer.stem(word_[0])\n", + " if flag == 1 and suggest_stem[:-1] in word_stem and word_stem[:3] in suggest_stem[:3]: #一般是去后缀\n", + " suggest_word = word_[0]\n", + " break\n", + " elif flag == 0 and word_stem in suggest_stem and word_[0][-1:] in suggest_[-1:]: #一般是加后缀,后缀一定要一样\n", + " suggest_word = word_[0]\n", + " break\n", + " \n", + " if suggest_word != None:\n", + " break\n", + " return suggest_word \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "def 
word_convert(word,new_word,Stemmer):\n", + " '''\n", + " 说明;\n", + " 与上面的区别是使用的拼写改错算法不同,上面那个平均速度慢,但更符合我的要求,这个平均速度更快\n", + " 功能:\n", + " 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically\n", + " 输入:\n", + " word:需要变形的词\n", + " new_word:猜想的变形\n", + " Stemmer:词根提取器\n", + " 输出:\n", + " suggest_word:推荐的正确变形\n", + " '''\n", + " if d.check(new_word)==True: #如果发现new_word拼写正确,则直接返回\n", + " return new_word\n", + " else:\n", + " suggest_word = None\n", + " word_stem = Stemmer().stem(word)\n", + " suggest_ = new_word\n", + " suggest_list = d.suggest(suggest_) #可能的正确单词列表\n", + "\n", + " if len(word)death,success->succeed无能为力'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''下面是词性转换系列函数\n", + " 功能:\n", + " 词性转变系列函数\n", + " 输入:\n", + " word:原形词\n", + " 输出:\n", + " suggest_word:推荐的变形\n", + " suggest_list:推荐的变形列表\n", + " 说明:\n", + " 词性变化的能力有限,对于有些特殊变形,比如die->death,success->succeed无能为力'''" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def adj_to_adv(word):\n", + " suggest_word = None\n", + " if(word == \"good\"):\n", + " return \"well\"\n", + " else:\n", + " suggest_ = word + 'ly'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " return suggest_word\n", + "#如果形容词副词同形,那么他会返回none,但是不影响计算,因为形容词副词同形啊\n", + "\n", + "\n", + "def adv_to_adj(word):\n", + " suggest_word = None\n", + " if(word == \"well\"):\n", + " return \"good\" \n", + " elif word[-2:] == 'ly':\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " return suggest_word\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def adj_to_anything(word):#形容词变成其他词性\n", + " suggest_word = None\n", + " suggest_list = []\n", + " if word[-1:] == 'y': #举例 healthy->health\n", + " suggest_ = word[:-1]\n", + " suggest_word = 
word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ful':#举例 successful->success\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ive': #举例 active -> act\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ed': #举例 interested->interest->interesting\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " suggest_ = suggest_ + 'ing'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " \n", + " elif word[-3:] == 'ing':#举例 interesting->interest->interested\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " suggest_ = suggest_ + 'ed'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " \n", + " elif word[-4:] == 'less': #举例 careless -> care\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ly': #举例: friendly -> friend , lovely -> love\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " \n", + " elif word[-1:] == 't': #举例 different -> different\n", + " suggest_ = word[:-1]\n", + " suggest_ = 
suggest_ + 'ce'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ous': #举例 dangerous -> danger\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'al': #举例 original -> origin\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-4:] == 'able':\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'en': #举例 woolen -> wool\n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-2:] == 'ic': \n", + " suggest_ = word + 'al'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " suggest_ = word[:-2]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " elif word[-3:] == 'ish':\n", + " suggest_ = word[:-3]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'and'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer) \n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ese':\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'a'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer) \n", + " if suggest_word != None:\n", + " 
suggest_list.append(suggest_word)\n", + " elif word[-3:] == 'ian':\n", + " suggest_ = word[:-1]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = word[:-3]\n", + " suggest_ = suggest_ + 'y'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " if suggest_word == None:\n", + " HouZhui_list = ['ment','ness','tion','ture','sion','ty','y','tive','sive']\n", + " for HouZhui in HouZhui_list:\n", + " suggest_ = word + HouZhui\n", + " new_word = word_convert(word,suggest_,PorterStemmer)\n", + " if new_word != None:\n", + " suggest_word = new_word\n", + " suggest_list.append(suggest_word)\n", + " suggest_list = list(set(suggest_list)) \n", + " return suggest_list\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def N_to_anything(word):#名词变成其他词性\n", + " suggest_list = []\n", + " list_HouZhui = ['y','ful','tive','sive','ed','ing','less','ly','ous','al','able','en','tic','ish','ance','er','or']\n", + " list_QianZhui = ['a']\n", + " if word[-4:] in ['ment','ness','tion','ture','sion','tive','sive']:\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " else:\n", + " for HouZhui in list_HouZhui:\n", + " suggest_ = word + HouZhui\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " for QianZhui in list_QianZhui:\n", + " suggest_ = QianZhui + word\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " if word[-2:] == 'ce':\n", + " suggest_ = word[:-2]\n", + " suggest_ = suggest_ + 't'\n", + " suggest_word = 
word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " elif word[-4:] == 'land':\n", + " suggest_ = word[:-4]\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word == None:\n", + " suggest_ = suggest_ + 'lish'\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word) \n", + " #print(suggest_list)\n", + " suggest_list = list(set(suggest_list))\n", + " return suggest_list\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def V_to_anything(word):#动词变成其他词性\n", + " suggest_word = None\n", + " suggest_list = []\n", + "\n", + " HouZhui_list = ['ful','tive','sive','ed','less','ly','ous','al','able','en','tic','ish','ance','tion','sion','ment','er','or','ee']\n", + " for HouZhui in HouZhui_list:\n", + " suggest_ = word + HouZhui\n", + " suggest_word = word_convert(word,suggest_,PorterStemmer)\n", + " if suggest_word != None:\n", + " suggest_list.append(suggest_word)\n", + " suggest_list = list(set(suggest_list))\n", + " return suggest_list\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + " 功能:\n", + " 生成形容词,副词关联词表\n", + " 输入:\n", + " word:形容词/副词\n", + " 输出:\n", + " list_word:为没有添加词的其他形式,包括三音节以下词的比较级最高级\n", + " list_word2:为三音节及以上的词的比较级最高级,如果输入形容词比较级最高级没有more/most,该列表为空\n", + " 说明:\n", + " 由于三音节形容词/副词的比较级,最高级为more/most+原形容词/副词,所以特别把形容词/副词和其他词性变形区分出来\n", + "'''\n", + "\n", + "def build_like_word_adj(word): #创建类似形容词列表\n", + " list_word = []\n", + " list_word2 = [] #把比较级最高级带more的放在这里\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " word_er = comparative(i)\n", + " if \"more\" in word_er: #把比较级带more,most的词放在另一个列表list_word2\n", + " list_word2.append(word_er)\n", + " 
else:\n", + " list_word.append(word_er)\n", + " word_est = superlative(i)\n", + " if \"most\" in word_est:\n", + " list_word2.append(word_est)\n", + " else:\n", + " list_word.append(word_est)\n", + " word_adv = adj_to_adv(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " list_N = adj_to_anything(word)\n", + " for N in list_N:\n", + " list_word.append(N)\n", + " \n", + " list_word = list(set(list_word))\n", + " return list_word,list_word2\n", + "\n", + "def build_like_word_adv(word): #创建类似形容词列表\n", + " list_word = []\n", + " list_word2 = []\n", + " list_special = ['however','seldom','often','never','otherwise']\n", + " if word in list_special:\n", + " list_word = [word]\n", + " list_word2 = []\n", + " else:\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " word_er = comparative(i)\n", + " if \"more\" in word_er:\n", + " list_word2.append(word_er)\n", + " else:\n", + " list_word.append(word_er)\n", + " word_est = superlative(i)\n", + " if \"most\" in word_est:\n", + " list_word2.append(word_est)\n", + " else:\n", + " list_word.append(word_est)\n", + " word_adv = adv_to_adj(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " list_word = list(set(list_word))\n", + " return list_word,list_word2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + " 功能:\n", + " 根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置\n", + " pre_training_input_in_sentence得到检查位置所在句子的信息\n", + " pre_training_input_entire得到检查位置在完整text中的信息\n", + " 输入:\n", + " index:在完整text中的位置\n", + " 输出:\n", + " word:该下标下的单词\n", + " input_ids:tokens的对应字典id列表\n", + " input_type_ids:零列表\n", + " id_in_sen:检查位置在句子中的下标(pre_training_input_in_sentence的返回)\n", + " index:检查位置在完整text中的下标,其实就是输入的下标\n", + "'''\n", + "def pre_training_input_in_sentence(index): \n", + " sentence_id = in_sentence[index][0]\n", + " 
id_in_sen = in_sentence[index][1]\n", + " word = input_ids_sen[sentence_id][id_in_sen]\n", + " word = tokenizer.ids_to_tokens[word]\n", + " input_ids = copy.deepcopy(input_ids_sen[sentence_id])\n", + " input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id])\n", + "\n", + " return word,input_ids,input_type_ids,id_in_sen\n", + "\n", + "def pre_training_input_entire(index): \n", + " word = entire_ids[index]\n", + " word = tokenizer.ids_to_tokens[word]\n", + " input_ids = copy.deepcopy(entire_ids)\n", + " input_type_ids = copy.deepcopy(entire_type_ids)\n", + "\n", + " return word,input_ids,input_type_ids,index\n", + "\n", + "#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 102]\n", + "#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 1045, 2018, 1037, 2200, 2204, 2835, 1012, 1996, 2377, 2001, 2200, 5875, 1012, 102]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "from pattern import en\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "\n", + "'''\n", + " 功能:\n", + " 1.judge_and_suggestion系列函数,这个系列函数是在analyse之前做的一个预先判断处理,判断的是该位置原来词的相关词中有没有可以代替它的词\n", + " 2.当相关词中有词的可能性和原词的可能性的差距大于阈值,则认为原词是错的,可以用相关词替换\n", + " 3.替换词的gap还要经过后续的检查才能决定他是不是最好的推荐,这一步骤放在了show_abnormals里\n", + " 输入:\n", + " prob:该位置可能性列表\n", + " original:该位置原先的词\n", + " list_word:该位置相关词表\n", + " threhold:门槛,也就是阈值\n", + " 输出:\n", + " judge:判断原来的词是否正确,0表示需要换词,1表示不需要换词或者说相关词里面没一个合适的\n", + " suggestion:相关词中最好的推荐\n", + " gap_with_totally_top:备选词中概率最高的和所有词中概率最高的之间的gap,可以换的词也有可能因为gap太大而遭到拒绝\n", + "'''\n", + "def judge_and_suggestion(prob,original,list_word,threhold):\n", + " top_prob = 0\n", + " list_word = list_word + [original]\n", + " original_prob = prob[tokenizer.vocab[original]]\n", + " best = None\n", + " suggestion = None\n", + " for word in list_word:\n", + " try:\n", + " word_id = tokenizer.vocab[word]\n", + " prob_word = prob[word_id]\n", + " 
if prob_word > top_prob:\n", + " top_prob = prob_word\n", + " best_word = word\n", + " except KeyError:#有的词enchant认为是正确的拼写,bert的词典里却没有,比如tiring,这种情况暂时没法解决,但是实际上bert不认的词会自动分词\n", + " pass\n", + "\n", + " totally_top = prob.max().item() #最高的概率(不需要知道概率最大的词是哪一个)\n", + " gap_with_origin = math.log(top_prob) - math.log(original_prob) #备选词中最大概率和原来的词的概率的差\n", + " gap_with_totally_top = math.log(totally_top) - math.log(top_prob) #所有词中最高的概率和备选词中最高的概率的差\n", + " \n", + " if gap_with_origin > threhold:\n", + " suggestion = best_word\n", + " return 0,suggestion,gap_with_totally_top\n", + " else:\n", + " return 1,suggestion,gap_with_totally_top\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'分析各种词性系列函数\\n 功能:对第一遍检查得出的有问题的位置的单词,根据不同的词性进行不同步骤的分析\\n 输入:\\n index:在原文中的错误位置\\n prob:该位置可能性列表\\n gap:原文该位置的词和概率最高的词之间的gap\\n top_word:概率最高的词\\n threshold:免检查门槛\\n threshold2:免修正门槛(勉强不算错)\\n threshold3:用推荐词替换的最低要求,大于该阈值才可以替换\\n 输出:\\n suggestion:给出的修改建议,修改建议不局限于错误位置\\n 说明:\\n 不仅局限于错误位置的分析是通过预添加或者去掉一个token,多进行一次model计算\\n'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''分析各种词性系列函数\n", + " 功能:对第一遍检查得出的有问题的位置的单词,根据不同的词性进行不同步骤的分析\n", + " 输入:\n", + " index:在原文中的错误位置\n", + " prob:该位置可能性列表\n", + " gap:原文该位置的词和概率最高的词之间的gap\n", + " top_word:概率最高的词\n", + " threshold:免检查门槛\n", + " threshold2:免修正门槛(勉强不算错)\n", + " threshold3:用推荐词替换的最低要求,大于该阈值才可以替换\n", + " 输出:\n", + " suggestion:给出的修改建议,修改建议不局限于错误位置\n", + " 说明:\n", + " 不仅局限于错误位置的分析是通过预添加或者去掉一个token,多进行一次model计算\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + " 这是一个相关代词的词典,容易混淆的词放在一个列表中\n", + "\n", + "'''\n", + "like_he = ['he','his','him','himself','who', 'whom', 'whose']\n", + "like_she = ['she','her','herself','hers','who', 'whom', 'whose']\n", + "like_it = ['it','its','itself','who', 'whom', 'whose']\n", + 
"like_i = ['i','me','my','myself','mine']\n", + "like_you = ['you','your','yourself','yourselves']\n", + "like_we = ['we','us','our','ours','ourselves']\n", + "like_they = ['they','them','their','theirs']\n", + "\n", + "like_this = ['this', 'these'] \n", + "like_that = ['that','those'] \n", + "pronoun_Question = ['who', 'whom', 'whose', 'which', 'what', 'whoever', 'whichever', 'whatever'] #疑问代词\n", + "pronoun_relation = ['that', 'which', 'who', 'whom', 'whose', 'as'] #关系代词\n", + "like_some = ['some','any']\n", + "like_few = ['few','little']\n", + "like_many = ['many','much']\n", + "like_other = ['another','other']\n", + "\n", + "pronoun = [like_he,like_she,like_it,like_i,like_you,like_we,like_they,like_this,like_that,pronoun_Question,pronoun_relation,like_some,like_few,like_many,like_other]\n", + "pronoun_dictionary = {}\n", + "pronoun_list = []\n", + "for list_word in pronoun:\n", + " pronoun_list = pronoun_list + list_word\n", + " for word in list_word:\n", + " pronoun_dictionary.update({word:list_word})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import copy\n", + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE\n", + "\n", + "def analyse_V(index,prob,gap,top_word,threshold,threshold2,threshold3):\n", + "#这是一个处理动词语法问题的函数,输入为问题词在text的token中的下标index\n", + " if gap < threshold:\n", + " return None\n", + " #******************************top_word暗示我应该是不定式**************************\n", + " if top_word in [\"to\",\"for\"]:\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab['to'])\n", + " input_type_ids.append(0)\n", + " list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1)]\n", + " suggestion,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5) \n", + " if need == 1:\n", + " return 'to ' + 
suggestion \n", + " \n", + " #*****************************判断是不是时态或者拼写错误,又或者是其他词性********\n", + " wordV = get_word(index)\n", + " #这三种是不涉及位置变化的检查,根据生成词表的速度从快到慢依次检查,之后也不需要再生成词表\n", + "\n", + " list_V = lexeme(wordV)\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_V,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion\n", + " \n", + " list_others = V_to_anything(conjugate(verb=wordV,tense=PRESENT,person = 1))\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_others,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion \n", + " \n", + " list_spell_correct = d.suggest(wordV)\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_spell_correct,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion\n", + " \n", + " if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求\n", + " return None\n", + " \n", + " front_word = get_word(index - 1)\n", + " behind_word = get_word(index + 1)\n", + " #**************************************判断是不是缺介词***************************\n", + " list_IN = [\"to\",\"at\",\"in\",\"on\",\"by\",\"for\",\"from\",\"with\",\"about\",\"against\",\"along\",\"among\",\"around\",\"as\",\"before\",\"behind\",\"below\",\"beside\",\"between\",\"during\",\"besides\",\"into\",\"near\",\"over\",\"through\",\"under\",\"without\",\"after\",\"above\",\"of\"]\n", + " if behind_word not in list_IN:\n", + " print(\"检查点\")\n", + " wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen + 1,tokenizer.vocab['at'])#就随便插入一个东西,占位子\n", + " input_type_ids.append(0)\n", + " suggestion_IN,need_IN,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_IN,2)\n", + " if need_IN == 1:\n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[suggestion_IN]\n", + " list_word = list_V\n", + " 
suggestion_V,need,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,5)\n", + " if need == 1:\n", + " suggestion = suggestion_V + ' ' + suggestion_IN\n", + " return suggestion\n", + " \n", + " need_to_will = need_be = 0\n", + " \n", + " #**************************************判断是不是不定式或者将来时*************************** \n", + " if front_word not in [\"to\",\"will\"]:\n", + " wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + " input_ids.insert(id_in_sen,tokenizer.vocab['to'])#就随便插入一个东西,占位子\n", + " input_type_ids.append(0)\n", + " try:\n", + " input_ids[id_in_sen + 1] = tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)]\n", + " suggestion_to_will,need_to_will,prob0,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,[\"to\",\"will\"],1)\n", + " except KeyError:\n", + " need_to_will = 0\n", + " #**************************************判断是不是被动语态或者进行时******************* \n", + " list_be = lexeme('be')\n", + " list_be = lexeme('be')[:8] #把否定去掉 \n", + " #********************是不是被动语态**************** \n", + "\n", + " wordV,input_ids,input_type_ids,index = pre_training_input_entire(index)\n", + " input_ids.insert(index,tokenizer.vocab['be'])#就随便插入一个东西,占位子\n", + " input_type_ids.append(0)\n", + " try:\n", + " input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)]\n", + " suggestion1,need_be1,prob1,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)\n", + " except KeyError:\n", + " need_be1 = 0\n", + " \n", + " #********************是不是现在分词**************** \n", + " try:\n", + " input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)]\n", + " suggestion2,need_be2,prob2,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1)\n", + " #print(tokenizer.convert_ids_to_tokens(input_ids))\n", + " except KeyError:\n", + " need_be2 = 0\n", + "\n", + " #***************************选择是不定式还是被动语态还是进行时****************************\n", + " prob_max 
# Words tried directly in front of an adjective/adverb when probing whether a
# determiner, a degree word (more/most) or a form of "be" is missing there.
_FRONT_WORD_CANDIDATES = (
    'the', 'a', 'an', 'this', 'that', 'these', 'those', 'some', 'any', 'all',
    'more', 'most', 'am', 'is', 'are', 'was', 'were',
)


def _drop_degree_word(index, list_word, gap_limit):
    """Build the advice for a redundant 'more'/'most' in front of token `index`.

    The token just before the word is deleted from the sentence ids, then
    give_suggestion is asked for the best member of `list_word` at the
    position the word moves to.
    NOTE(review): the 4-tuple layouts of pre_training_input_in_sentence and
    give_suggestion are taken from how this file unpacks them - confirm
    against their definitions.
    """
    _, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    del input_ids[id_in_sen - 1]
    del input_type_ids[0]  # keep the type-id list the same length as input_ids
    suggestion, _, _, _ = give_suggestion(
        input_ids, input_type_ids, id_in_sen - 1, list_word, gap_limit)
    # Runtime message: "remove the preceding <word>, put <suggestion> here".
    return '去掉前面 ' + get_word(index - 1) + ' 原位置改成 ' + suggestion


def _swap_suffix_for_degree_word(index, degree_word, list_word, gap_limit):
    """Build the advice for '<word> ##er/##est' that should be 'more/most <word>'.

    The word's slot is overwritten with `degree_word` and give_suggestion is
    asked for the best member of `list_word` at the following position (the
    slot that held the suffix piece).
    """
    _, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    input_ids[id_in_sen] = tokenizer.vocab[degree_word]
    suggestion, _, _, _ = give_suggestion(
        input_ids, input_type_ids, id_in_sen + 1, list_word, gap_limit)
    # Runtime message: "remove the following <suffix>, put '<degree> <suggestion>' here".
    return ('去掉后面 ' + get_word(index + 1) + ' 原位置改成 '
            + degree_word + ' ' + suggestion)


def _probe_front_word(index):
    """Insert a [MASK] before token `index` and ask which front word fits.

    Returns the first two values of give_suggestion, named by the callers
    `(suggestion, need_front)`; callers treat `need_front == 1` as "a front
    word is needed".
    """
    _, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    input_ids.insert(id_in_sen, tokenizer.vocab["[MASK]"])
    input_type_ids.append(0)
    suggestion, need_front, _, _ = give_suggestion(
        input_ids, input_type_ids, id_in_sen, list(_FRONT_WORD_CANDIDATES), 2)
    return suggestion, need_front


def _suggest_behind_front_word(index, front_word, list_word, gap_limit, top_word):
    """Insert `front_word` before token `index` and re-pick the word after it.

    Returns '<front_word> <suggestion>' when give_suggestion reports
    `need == 1`, otherwise falls back to `top_word`.
    """
    _, input_ids, input_type_ids, position = pre_training_input_entire(index)
    input_ids.insert(position, tokenizer.vocab[front_word])
    input_type_ids.append(0)
    suggestion, need, _, _ = give_suggestion(
        input_ids, input_type_ids, position + 1, list_word, gap_limit)
    if need == 1:
        return front_word + ' ' + suggestion
    return top_word


def analyse_adj(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest a correction for the adjective at token position `index`.

    Args:
        index: position of the token, as accepted by get_word.
        prob: per-vocabulary probabilities for this position; only forwarded
            to judge_and_suggestion.
        gap: gap score of the written token (show_abnormals passes the value
            returned by analyse_prob).
        top_word: the model's most likely token for this position.
        threshold: below this gap the token is left alone.
        threshold2: upper bound on the gap of an accepted candidate, and the
            gap below which the original word is kept when nothing fits.
        threshold3: forwarded to judge_and_suggestion.

    Returns:
        None when nothing should change, otherwise a replacement string
        (a word, '<front word> <word>', or one of the Chinese advice
        messages built by the helpers above).
    """
    if gap < threshold:
        return None

    wordADJ = get_word(index)

    # 1) Another form of the same word (other inflection / part of speech).
    list_word, list_word2 = build_like_word_adj(wordADJ)
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordADJ, list_word, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # 2) A spelling correction proposed by the dictionary `d`.
    list_spell_correct = d.suggest(wordADJ)
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordADJ, list_spell_correct, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # 3) Misused comparative / superlative.
    # NOTE(review): list_word2 is only tested for emptiness here; the
    # original comments imply it is non-empty when the adjective forms its
    # degrees with more/most - confirm in build_like_word_adj.
    front_word = get_word(index - 1)
    behind_word = get_word(index + 1)
    gap_limit = min(threshold2, gap - threshold3)
    if front_word in ('more', 'most') and len(list_word2) == 0:
        return _drop_degree_word(index, list_word, gap_limit)
    if len(list_word2) != 0:
        if behind_word in ('##er', '##r'):
            return _swap_suffix_for_degree_word(index, 'more', list_word, gap_limit)
        if behind_word in ('##est', '##st'):
            return _swap_suffix_for_degree_word(index, 'most', list_word, gap_limit)

    # Nothing to swap in and the written word is a tolerable fit: keep it.
    if gap < threshold2:
        return None

    # 4) Maybe a determiner, more/most or a form of "be" is missing in front.
    if front_word not in ('this', 'that', 'these', 'those', 'more', 'most'):
        front, need_front = _probe_front_word(index)
        if need_front == 1:
            return _suggest_behind_front_word(
                index, front, list_word, gap_limit, top_word)

    return top_word


def analyse_adv(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest a correction for the adverb at token position `index`.

    Same parameters and return convention as analyse_adj. Differences that
    the original code makes and that are kept here: 'not' is never touched,
    the "keep the word" check (gap < threshold2) runs before the
    comparative checks, the comparative / front-word probes use a fixed
    limit of 5, and a trailing comma is tried as the last resort.
    """
    if gap < threshold:
        return None

    wordADV = get_word(index)
    if wordADV in ('not',):
        return None

    # 1) Another form of the same word.
    list_word, list_word2 = build_like_word_adv(wordADV)
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordADV, list_word, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # 2) A spelling correction proposed by the dictionary `d`.
    list_spell_correct = d.suggest(wordADV)
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordADV, list_spell_correct, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # Nothing to swap in and the written word is a tolerable fit: keep it.
    if gap < threshold2:
        return None

    # 3) Misused comparative / superlative.
    if get_word(index - 1) in ('more', 'most') and len(list_word2) == 0:
        return _drop_degree_word(index, list_word, 5)
    behind_word = get_word(index + 1)
    if len(list_word2) != 0:
        if behind_word in ('##er', '##r'):
            return _swap_suffix_for_degree_word(index, 'more', list_word, 5)
        if behind_word in ('##est', '##st'):
            return _swap_suffix_for_degree_word(index, 'most', list_word, 5)

    # 4) Maybe a determiner, more/most or a form of "be" is missing in front.
    front, need_front = _probe_front_word(index)
    if need_front == 1:
        return _suggest_behind_front_word(index, front, list_word, 5, top_word)

    # 5) Otherwise try the adverb followed by a comma.
    _, input_ids, input_type_ids, id_in_sen = pre_training_input_in_sentence(index)
    input_ids.insert(id_in_sen + 1, tokenizer.vocab[","])
    input_type_ids.append(0)
    suggestion3, need_douhao, _, _ = give_suggestion(
        input_ids, input_type_ids, id_in_sen, list_word, 2)
    if need_douhao == 1:
        return suggestion3 + ' ,'
    return top_word
return None\n", + " \n", + " wordN = get_word(index)\n", + " #*****************************判断是不是时态或者拼写错误,又或者是其他词性********\n", + " word_tag = nltk.pos_tag([wordN])\n", + " if word_tag[0][1] == \"NN\":\n", + " N_ = wordN\n", + " N_s= pluralize(wordN)\n", + " else:\n", + " N_ = singularize(wordN)\n", + " N_s= wordN\n", + " list_N = [N_,N_s]\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_N,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion\n", + " \n", + " list_others = N_to_anything(N_)\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_others,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion \n", + " \n", + " list_spell_correct = d.suggest(wordN)\n", + " judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_spell_correct,threshold3)\n", + " if judge==0 and gap_with_totally_top < threshold2:\n", + " return suggestion\n", + "\n", + " #***********************************************************************************************************************************\n", + " need_DT = 0 #表示是否需要在前面加冠词 \n", + " wordN,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index)\n", + "\n", + " #*****************************************判断是否需要冠词或介词************************************************************************ \n", + " list_DT = ['the','a','an']\n", + " front_word = get_word(index - 1)\n", + " if front_word in list_DT:#如果前一个词就是冠词,那么一定不需要再往前面加介词或冠词\n", + " if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求\n", + " return None\n", + " else:\n", + " return top_word\n", + " \n", + " input_ids.insert(id_in_sen,tokenizer.vocab[\"[MASK]\"])\n", + " input_type_ids.append(0)\n", + " list_IN = 
# Prepositions offered as replacements by analyse_IN and recognised by
# analyse_DT as "a preposition is missing before the determiner".
_PREPOSITIONS = (
    "at", "in", "on", "by", "for", "from", "with", "about", "against",
    "along", "among", "around", "as", "before", "behind", "below", "beside",
    "between", "during", "besides", "into", "near", "over", "through",
    "under", "without", "after", "above", "of", "to",
)

# (determiners that share a candidate list, the candidates tried for them)
_DETERMINER_CANDIDATES = (
    (('some',), ('some', 'any', 'a', 'an')),
    (('any',), ('some', 'any', 'every', 'per', 'each')),
    (('this', 'that', 'these', 'those'), ('this', 'that', 'these', 'those')),
    (('the', 'a', 'an'), ('the', 'a', 'an', 'some', 'any')),
    (('another', 'other'), ('another', 'other')),
    (('all', 'both'), ('all', 'both')),
)

# Modal verbs are only ever exchanged within their own pair.
_MODAL_PAIRS = (
    ('can', 'could'),
    ('may', 'might'),
    ('shall', 'should'),
    ('will', 'would'),
    ('dare', 'dared'),
)

# Punctuation marks analyse_IN accepts as a replacement for a preposition.
_IN_PUNCTUATION = frozenset(',.!?[]()<>"\'')


def analyse_pronoun(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest a replacement for the pronoun at token position `index`.

    First tries the forms listed for this pronoun in `pronoun_dictionary`;
    if none is accepted and the gap is at least `threshold2`, tries every
    entry of `pronoun_list`.

    Args:
        index: position of the token, as accepted by get_word.
        prob: per-vocabulary probabilities, forwarded to judge_and_suggestion.
        gap: gap score of the written token.
        top_word: unused here; kept for the shared analyser signature.
        threshold: below this gap the token is left alone.
        threshold2: upper bound on the gap of an accepted candidate.
        threshold3: forwarded to judge_and_suggestion.

    Returns:
        The accepted pronoun, or None to leave the token unchanged.
    """
    if gap < threshold:
        return None

    wordPROP = get_word(index)
    try:
        list_PROP = pronoun_dictionary[wordPROP]
    except KeyError:
        # Pronoun without an entry: nothing to try in the first round.
        list_PROP = []
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordPROP, list_PROP, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # No related form fits, but the written word is a tolerable fit.
    if gap < threshold2:
        return None

    # Last resort: choose among all known pronouns.
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordPROP, pronoun_list, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion
    return None


def analyse_DT(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest a correction for the determiner at token position `index`.

    Parameters are as for analyse_pronoun, except that `top_word` (the
    model's most likely token here) is used.

    Returns:
        None to keep the determiner, a replacement determiner,
        '<preposition> <determiner>' when the model wants a preposition
        here, or the runtime message '去掉 <determiner>' ("remove ...").
    """
    if gap < threshold:
        return None

    wordDT = get_word(index)
    # These carry real meaning, so they are never changed.
    if wordDT in ("every", 'per', 'each', 'no'):
        return None

    list_word = [wordDT]
    for members, candidates in _DETERMINER_CANDIDATES:
        if wordDT in members:
            list_word = list(candidates)
            break

    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordDT, list_word, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # No candidate fits, but the written word is a tolerable fit.
    if gap < threshold2:
        return None
    if top_word in _PREPOSITIONS:
        return top_word + ' ' + wordDT
    if top_word in ('some', 'any', 'this', 'that', 'these', 'those', 'the', 'a', 'an'):
        return top_word
    if wordDT in ('another', 'other', 'all', 'both'):
        return None
    return "去掉 " + wordDT


def analyse_IN(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Decide whether the preposition at `index` should be replaced or removed.

    Parameters are as for analyse_DT.

    Returns:
        None to keep the preposition, a replacement preposition or spelling
        correction, a punctuation mark when that is the model's top choice,
        or the runtime message '去掉 <preposition>' ("remove ...").
    """
    if gap < threshold:
        return None

    wordIN = get_word(index)
    # These carry real meaning, so they are never changed.
    if wordIN in ('before', "after", "above", "below", "underneath", "beneath", "without"):
        return None

    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordIN, list(_PREPOSITIONS), threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    list_spell_correct = d.suggest(wordIN)
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordIN, list_spell_correct, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # No candidate fits, but the written word is a tolerable fit.
    if gap < threshold2:
        return None
    # Membership in a set of marks; the original tested for a substring of
    # one punctuation string, which would also accept '' or runs like '.!'.
    if top_word in _IN_PUNCTUATION:
        return top_word
    return "去掉 " + wordIN


def analyse_CC(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest another conjunction for the one at token position `index`.

    Parameters are as for analyse_pronoun. Returns the accepted conjunction
    (or ','), or None; a conjunction is never removed.
    """
    if gap < threshold:
        return None

    wordCC = get_word(index)
    # Kept verbatim, including the repeated "or": whether duplicates matter
    # to judge_and_suggestion cannot be told from here.
    list_CC = ["but", "because", "yet", "still", "however", "although", "so",
               "thus", "and", "or", "too", "either", "or", "neither", "nor",
               "when", "while", "as", "whenever", "since", "until", "till", ","]
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordCC, list_CC, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion
    return None


def analyse_MD(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest the partner form of the modal verb at token position `index`.

    Parameters are as for analyse_pronoun. Only the other member of the
    modal's own pair (can/could, may/might, ...) is considered; returns it
    when accepted, otherwise None.
    """
    if gap < threshold:
        return None

    wordMD = get_word(index)
    list_MD = [wordMD]
    for pair in _MODAL_PAIRS:
        if wordMD in pair:
            list_MD = list(pair)
            break
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordMD, list_MD, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion
    return None


def analyse_biaodian(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Suggest another punctuation mark for the one at token position `index`.

    ("biaodian" is pinyin for "punctuation".) Parameters are as for
    analyse_pronoun. Candidates are common marks plus 'and' / 'but';
    returns the accepted one, otherwise None.
    """
    if gap < threshold:
        return None

    biaodian = get_word(index)
    biaodian_list = ['.', ',', ';', '!', '?', '"', "'", ',', '。', '’', '‘',
                     '“', '”', 'and', 'but']
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, biaodian, biaodian_list, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion
    return None
# Spell-check helpers.
#   correct_spelling: finds words that are not in the dictionary, swaps in a
#       real word so the text can be fed to the model, and records the swap.
#   token_Align: re-aligns the tokens of the ORIGINAL text with the tokens of
#       the corrected text, because the tokenizer splits misspelt words into
#       several pieces and the positions would otherwise drift.
#   The two are meant to be used together.

# Chinese / full-width punctuation and the ASCII mark each one becomes.
# Full-width forms are written as escapes so the table does not depend on
# how this file's text was encoded or normalised.
_TO_ASCII_PUNCTUATION = str.maketrans({
    '。': '.', '【': '[', '】': ']', '《': '<', '》': '>',
    '“': '"', '‘': "'", '”': '"', '’': "'",
    '\uff0c': ',', '\uff0e': '.', '\uff01': '!', '\uff1f': '?',
    '\uff08': '(', '\uff09': ')', '\uff1a': ':', '\uff1b': ';',
})

# Characters process_biaodian surrounds with spaces.
_SEPARATED_PUNCTUATION = frozenset(
    ',.!?[]()<>"\':-;%'
    '。【】《》“‘”’'
    '\uff0c\uff0e\uff01\uff1f\uff08\uff09\uff1a\uff1b'
)

# Tokens correct_spelling never sends to the spell checker.
_NOT_SPELLCHECKED = frozenset(
    ['.', ',', ';', '!', '?', '"', "'", '\uff0c', '。', '’', '‘', '“', '”', ""]
)


def C_trans_to_E(string):
    """Return `string` with Chinese / full-width punctuation turned into ASCII.

    Characters without an entry in the table are returned unchanged.
    """
    return string.translate(_TO_ASCII_PUNCTUATION)


def process_biaodian(text):
    """Put spaces around every punctuation mark of `text[0]`.

    This lets a plain `split(' ')` yield each mark as its own token; Chinese
    marks are converted to their ASCII counterpart on the way.

    Args:
        text: one-element list holding the text.

    Returns:
        A one-element list holding the processed text.
    """
    pieces = []
    for character in text[0]:
        if character in _SEPARATED_PUNCTUATION:
            pieces.append(' ' + C_trans_to_E(character) + ' ')
        else:
            pieces.append(character)
    return [''.join(pieces)]


def correct_spelling(text):
    """Replace misspelt words of `text` and record each replacement.

    A word is replaced when the enchant dictionary `d` rejects it and
    pattern's `suggest` proposes something different. Each replacement is
    written into the global `suggestions` dict, keyed by the word's token
    position plus one.
    NOTE(review): the "+1" presumably accounts for the '[CLS]' token that
    callers prepend - confirm.

    Args:
        text: the text as a plain string, tokens separated by single spaces.

    Returns:
        A one-element list holding the corrected text, rebuilt from the
        tokenizer's word pieces.
    """
    global suggestions

    # Pass 1: prefix every corrected word with the marker '不' and turn line
    # breaks into '换', so both can be recognised after tokenisation.
    # NOTE(review): a literal '不' in the input is indistinguishable from
    # the marker.
    marked = []
    for token in text.split(' '):
        word = token
        if token == "\r\n":
            word = '换'
        elif token not in _NOT_SPELLCHECKED and not d.check(token):
            best = suggest(token)[0][0]  # pattern's top suggestion
            if token != best:
                word = '不' + best
        marked.append(' ' + word)

    tokens = tokenizer.tokenize(''.join(marked))

    # Pass 2: strip the markers, record the corrections and glue the word
    # pieces back together.
    pieces = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == '不':
            # A marker with nothing after it has no correction to record.
            if i + 1 < len(tokens):
                suggestions.update({i + 1: tokens[i + 1]})
            # Drop the marker and look at the same index again.
            del tokens[i]
        elif token[0:2] == '##':
            pieces.append(token[2:])  # continuation piece: no space
            i = i + 1
        else:
            pieces.append(token if token == "'" else ' ' + token)
            i = i + 1
    return [''.join(pieces)]


def token_Align(tokens, text):
    """Align the tokens of the original `text` with the corrected `tokens`.

    Args:
        tokens: tokens of the spell-corrected text, including '[CLS]' and
            '[SEP]'.
        text: the original text, possibly containing misspellings.

    Returns:
        The tokens of `text` ('[CLS]' ... '[SEP]'), with word pieces merged
        and placeholders inserted so that positions line up with `tokens`
        as far as this heuristic manages.
    """
    original_tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    print(original_tokens)

    def same(original_pos, token_pos):
        # Bounds-checked comparison: the look-ahead used to index past the
        # end of either list and raise IndexError near the end of the text.
        return (original_pos < len(original_tokens)
                and token_pos < len(tokens)
                and original_tokens[original_pos] == tokens[token_pos])

    length = len(tokens)
    i = 0
    while i < min(length - 1, len(original_tokens) - 1):
        # Aligned here, or within the next three positions: move on.
        if same(i, i) or same(i + 1, i + 1) or same(i + 2, i + 2) or same(i + 3, i + 3):
            i = i + 1
            continue

        if original_tokens[i][:2] == "##":
            # Stray continuation piece: glue it onto the previous token.
            original_tokens[i - 1] = original_tokens[i - 1] + original_tokens[i][2:]
            del original_tokens[i]
        elif original_tokens[i + 1][:2] == "##":
            # The misspelt word was split: glue the next piece onto it.
            original_tokens[i] = original_tokens[i] + original_tokens[i + 1][2:]
            del original_tokens[i + 1]
        elif tokens[i] == '[UNK]':
            original_tokens.insert(i, '[UNK]')
        elif same(i + 1, i) or same(i + 2, i + 1) or same(i + 3, i + 2):
            # The original has one token too many here; merge it into the
            # next one unless it starts with a lowercase letter.
            if re.match(r'[a-z]', original_tokens[i]) is None:
                original_tokens[i] = original_tokens[i] + original_tokens[i + 1]
                del original_tokens[i + 1]
        elif same(i, i + 1) or same(i + 1, i + 2) or same(i + 2, i + 3):
            # The original has one token too few here; pad it.
            original_tokens.insert(i, ' ')
        # NOTE(review): the notebook's indentation was lost in this copy;
        # the increment is taken to run after every repair - confirm.
        i = i + 1

    return original_tokens


def split_text(text0, threshold1, threshold2):
    """Cut a long text into pieces small enough for one model pass.

    A piece is closed after a '.' token once it holds more than
    `threshold1` tokens, or after a '\\r\\n' token once it holds more than
    `threshold2` tokens. Choosing `threshold2` a little smaller than
    `threshold1` makes cuts at paragraph breaks more likely.

    Args:
        text0: one-element list holding the text, tokens separated by spaces.
        threshold1: minimum token count before cutting at a sentence end.
        threshold2: minimum token count before cutting at a line break.

    Returns:
        A list of one-element lists, one per piece; every token of a piece
        is preceded by a single space.
    """
    texts = []
    pieces = []
    for token in text0[0].split(' '):
        if token == '':
            continue
        pieces.append(' ' + token)
        count_tokens = len(pieces)
        if (token == '.' and count_tokens > threshold1) or \
                (token == '\r\n' and count_tokens > threshold2):
            texts.append([''.join(pieces)])
            pieces = []
    if pieces:
        texts.append([''.join(pieces)])
    return texts
# Floor applied to a token's probability before taking its logarithm; a
# probability that underflowed to exactly 0 would make math.log raise.
_MIN_PROB = 1e-45

# POS-tag marker -> WordNet part-of-speech letter, tested in this order.
_TAG_TO_WORDNET_POS = (('VB', 'v'), ('JJ', 'a'), ('RB', 'r'), ('NN', 'n'))


def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20):
    """Print, per token, its own probability and the model's top-k choices.

    Debugging aid for inspecting the language-model output.

    Args:
        tokens: the token strings of the sequence.
        input_ids: per-position vocabulary ids (each must support .item()),
            or None to look the ids up from `tokens` via tokenizer.vocab.
        probs: indexable per position; probs[i] holds one probability per
            vocabulary entry and supports .topk().
        topk: how many of the most likely tokens to list per position.
        firstk: only the first `firstk` positions are printed.

    Returns:
        The (token, probability) pairs of the last position whose token is
        "[MASK]", or None if there is no such position.
    """
    def print_pair(row, token, prob, end_str='', hit_mark=' '):
        # Rows past `firstk` are still processed, just not printed.
        if row < firstk:
            print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str)

    ret = None
    for i, current in enumerate(tokens):
        ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[current]
        # Probability the model gives to the token that is actually there.
        prob_ = probs[i][ind_].item()
        print_pair(i, current, prob_, end_str='\t')
        values, indices = probs[i].topk(topk)
        top_pairs = []
        for j in range(topk):
            ind, prob = indices[j].item(), values[j].item()
            hit_mark = '*' if ind == ind_ else ' '  # '*' marks the written token
            token = tokenizer.ids_to_tokens[ind]
            print_pair(i, token, prob, hit_mark=hit_mark,
                       end_str='' if j < topk - 1 else '\n')
            top_pairs.append((token, prob))
        if current == "[MASK]":
            ret = top_pairs
    return ret


def analyse_prob(prob, token):
    """Return the most likely token at a position and the written token's gap.

    Args:
        prob: 1-D tensor with one probability per vocabulary entry.
        token: the token actually written at this position; must be a key
            of tokenizer.vocab.

    Returns:
        (top_word, gap) where top_word is the highest-probability vocabulary
        entry and gap = log(p(top_word)) - log(p(token)), i.e. 0 when the
        written token is the model's first choice.
    """
    ind_ = tokenizer.vocab[token]
    # Clamp so a probability that underflowed to 0 gives a large finite gap
    # instead of a "math domain error".
    prob_ = max(prob[ind_].item(), _MIN_PROB)
    top_prob = max(prob.max().item(), _MIN_PROB)
    top_ind = prob.argmax().item()
    top_word = tokenizer.ids_to_tokens[top_ind]
    gap = math.log(top_prob) - math.log(prob_)
    return top_word, gap


def analyse_词性(token, tag):
    """Double-check the part-of-speech tag of `token`.

    For verb / adjective / adverb / noun tags the word is looked up in
    WordNet under that part of speech; when WordNet has no base form for
    it, the word is re-tagged on its own with nltk.pos_tag. Any other tag
    is returned unchanged.

    Args:
        token: the word.
        tag: the tag assigned to it when the whole text was tagged.

    Returns:
        The tag to use for dispatching.
    """
    for marker, wordnet_pos in _TAG_TO_WORDNET_POS:
        if marker in tag:
            if wn.morphy(token, wordnet_pos) is None:
                tag = nltk.pos_tag([token])[0][1]
            return tag
    return tag
'red_1'\n", + " \n", + " def print_token(token, suggestion, gap ,mode):\n", + " if gap == 0 and mode == 1:\n", + " print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(token, colored.fg(gap2color(mode)) + colored.bg('black')), end='')\n", + " if show_suggestions and mode > 1:\n", + " print(stylize('/' + str(suggestion) + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='')\n", + " else:\n", + " print(stylize(' ', colored.fg(gap2color(mode)) + colored.bg('black')), end='')\n", + "\n", + " \n", + " avg_gap = 0.\n", + " tokens_tag = nltk.pos_tag(tokens) #给整个text做词性标注\n", + " for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP]\n", + " if tokens[i]=='[UNK]':\n", + " continue\n", + " top_word,gap = analyse_prob(probs[i],tokens[i])\n", + " print()\n", + " print(\"*******************************************************************************************************************\")\n", + " print(i)\n", + " print(gap)\n", + " avg_gap += gap\n", + " suggestion = None\n", + " tag = tokens_tag[i][1]#当前tokens的词性\n", + " tag = analyse_词性(tokens[i],tag)\n", + " print(tag)\n", + " \n", + " if 'VB' in tag: #如果是动词的各种时态\n", + " suggestion = analyse_V(i,probs[i],gap,top_word,2.5 ,8 ,1.8)\n", + " \n", + " elif \"DT\" == tag: #如果是冠词(冠词原则上不改变词性)\n", + " suggestion = analyse_DT(i,probs[i],gap,top_word,3 ,4 ,1)\n", + " \n", + " elif \"JJ\" in tag : #形容词\n", + " suggestion = analyse_adj(i,probs[i],gap,top_word,6 ,8 ,2)\n", + " \n", + " elif \"RB\" in tag: #副词\n", + " suggestion = analyse_adv(i,probs[i],gap,top_word,5 ,8 ,2)\n", + " \n", + " elif \"PRP\" in tag: #代词\n", + " suggestion = analyse_pronoun(i,probs[i],gap,top_word,4 ,5 ,1.5)\n", + " \n", + " elif \"NN\" in tag: #名词\n", + " suggestion = analyse_N(i,probs[i],gap,top_word,4 ,10 ,2.2)\n", + " \n", + " elif \"CC\" in tag: #连词\n", + " suggestion = analyse_CC(i,probs[i],gap,top_word,2 ,2.5 ,1.5)\n", + " \n", + " elif \"IN\" == tag 
def analyze_part_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=500):
    """Run the masked-LM check on one piece of text.

    The text is punctuation-normalised and spell-corrected, then masked at
    every `step`-th position per batch row, so the whole piece is covered in
    at most `step` rows.

    Side effects: resets the global `suggestions`, fills the sentence-level
    globals from process_text, and prints progress information.

    Args:
        text: one-element list holding the text of this piece.
        masked_tokens: forwarded to copy_and_mask_feature.
        show_suggestions: forwarded to show_abnormals.
        show_firstk_probs: number of positions show_lm_probs prints.

    Returns:
        (suggestions, original_tokens, avg_gap). When the text already
        contains "[MASK]", show_abnormals is not run; avg_gap is then 0.0
        and original_tokens are the model-side tokens.
    """
    step = 15  # stride between masked positions within one batch row
    global input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids,suggestions,original_tokens
    suggestions = {}  # reset the shared result dict for this piece
    text = process_biaodian(text)
    text0 = text  # keep the text that still has its spelling mistakes
    text = correct_spelling(text[0])
    print("********************************")
    print(text)
    print("********************************")
    # token_Align, the counterpart of correct_spelling, runs in show_abnormals.
    input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = process_text(text[0])

    examples = convert_text_to_examples(text)
    features = convert_examples_to_features(examples, tokenizer, print_info=False)
    given_mask = "[MASK]" in features[0].tokens
    mask_every_position = not given_mask or masked_tokens is not None
    if mask_every_position:
        assert len(features) == 1
        features, batches = copy_and_mask_feature(features[0], step, masked_tokens=masked_tokens)

    # Shape [number of batch rows, sequence length]. The model is called
    # with the token ids only, so no token-type tensor is built.
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    input_ids = input_ids.to(device)

    time_start = time.time()
    with torch.no_grad():  # inference only: no autograd graph is needed
        mlm_logits = model(input_ids)
    time_end = time.time()
    print('time cost1', time_end - time_start, 's')

    mlm_probs = F.softmax(mlm_logits, dim=-1)
    tokens = features[0].tokens  # identical for every row apart from the masks
    print(tokens)
    if mask_every_position:
        bsz, seq_len, vocab_size = mlm_probs.size()
        assert bsz == len(batches)
        # Keep, for every position, only the distribution from the row in
        # which that position was masked. Zero-initialised so a position no
        # row covers never holds uninitialised memory.
        reduced_mlm_probs = torch.zeros(1, len(tokens), vocab_size)
        for i in batches:
            for pos in range(i, len(tokens), step):
                reduced_mlm_probs[0, pos] = mlm_probs[i, pos]
        mlm_probs = reduced_mlm_probs

    # Called for its printed table; mlm_probs[0] is 2-D (position x vocabulary).
    show_lm_probs(tokens, None, mlm_probs[0], firstk=show_firstk_probs)
    if not given_mask:
        avg_gap = show_abnormals(tokens, mlm_probs[0], text0[0], show_suggestions=show_suggestions)
    else:
        # show_abnormals (which sets original_tokens) did not run; without
        # these fallbacks the return below raised on unbound names.
        avg_gap = 0.0
        original_tokens = list(tokens)
    return suggestions, original_tokens, avg_gap


def analyze_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=500):
    """Check a whole text piece by piece and merge the results.

    The text is cut with split_text(text, 50, 40); every piece goes through
    analyze_part_text and its positions are shifted so they index into the
    merged token list.

    Args:
        text: one-element list holding the full text.
        masked_tokens, show_suggestions, show_firstk_probs: forwarded to
            analyze_part_text.

    Returns:
        (suggestions, original_tokens, avg_gap): suggestions keyed by
        position in `original_tokens`, the merged tokens with one '[CLS]'
        and one '[SEP]', and the mean gap over all real tokens (0 when
        there are none).
    """
    suggestions = {}
    original_tokens = ['[CLS]', '[SEP]']
    total_gap = 0
    accumulate_length = 0  # real tokens (no [CLS]/[SEP]) merged so far
    remainer = 2           # the [CLS] and [SEP] every piece carries

    text = process_biaodian(text)
    for part in split_text(text, 50, 40):
        part_suggestions, part_original_tokens, part_avg_gap = analyze_part_text(
            part, masked_tokens, show_suggestions, show_firstk_probs)
        for key in part_suggestions:
            suggestions[key + accumulate_length] = part_suggestions[key]
        tokens_length = len(part_original_tokens)
        # Drop our trailing [SEP] and the piece's leading [CLS] when joining.
        original_tokens = original_tokens[:-1] + part_original_tokens[1:]
        total_gap = total_gap + part_avg_gap * (tokens_length - remainer)
        accumulate_length = accumulate_length + tokens_length - remainer

    # The weights above sum to accumulate_length, so that is the divisor
    # (the former "accumulate_length - 1" skewed the mean and divided by
    # zero for a one-token text).
    avg_gap = total_gap / accumulate_length if accumulate_length > 0 else 0
    return suggestions, original_tokens, avg_gap
[SEP]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "********************************\n", + "[\" when i was little , friday' s night was our family game night . after supper , we would play card games of all sort in the sitting room . as the kid , i loved to watch cartoons , but no matter how many times i asked for watching them , my parents would not to let me .\"]\n", + "********************************\n", + "time cost1 0.5431368350982666 s\n", + "['[CLS]', 'when', 'i', 'was', 'little', ',', 'friday', \"'\", 's', 'night', 'was', 'our', 'family', 'game', 'night', '.', 'after', 'supper', ',', 'we', 'would', 'play', 'card', 'games', 'of', 'all', 'sort', 'in', 'the', 'sitting', 'room', '.', 'as', 'the', 'kid', ',', 'i', 'loved', 'to', 'watch', 'cartoons', ',', 'but', 'no', 'matter', 'how', 'many', 'times', 'i', 'asked', 'for', 'watching', 'them', ',', 'my', 'parents', 'would', 'not', 'to', 'let', 'me', '.', '[SEP]']\n", + " 0 | [CLS] \t 2 | . 1 | the 1 | ) 1 | , 1 | \" \n", + " 97 | when \t* 97 | when 3 | since 0 | until 0 | while 0 | as \n", + " 88 | i \t* 88 | i 7 | she 2 | he 0 | we 0 | cassie \n", + " 100 | was \t*100 | was 0 | were 0 | turned 0 | got 0 | became \n", + " 6 | little \t 12 | twelve 11 | younger 10 | eight * 6 | little 5 | older \n", + " 100 | , \t*100 | , 0 | . 0 | again 0 | ... 
0 | and \n", + " 0 | friday \t 47 | valentine 14 | mother 5 | father 4 | grandma 4 | children \n", + " 100 | ' \t*100 | ' 0 | ′ 0 | ` 0 | \" 0 | - \n", + " 100 | s \t*100 | s 0 | til 0 | n 0 | d 0 | o \n", + " 38 | night \t* 38 | night 13 | dinner 7 | game 6 | eve 4 | day \n", + " 93 | was \t* 93 | was 7 | became 0 | is 0 | were 0 | , \n", + " 6 | our \t 79 | a 10 | the * 6 | our 2 | my 0 | their \n", + " 2 | family \t 68 | favorite 7 | first * 2 | family 2 | favourite 2 | only \n", + " 2 | game \t 12 | fun 9 | dinner 5 | christmas 4 | ' 3 | entertainment\n", + " 85 | night \t* 85 | night 9 | day 1 | date 1 | tonight 1 | dinner \n", + " 96 | . \t* 96 | . 3 | and 1 | ; 0 | - 0 | : \n", + " 82 | after \t* 82 | after 5 | over 5 | during 4 | at 2 | before \n", + " 7 | supper \t 38 | dinner 15 | school 9 | midnight * 7 | supper 4 | lunch \n", + " 100 | , \t*100 | , 0 | together 0 | ##time 0 | and 0 | time \n", + " 99 | we \t* 99 | we 0 | they 0 | everyone 0 | people 0 | i \n", + " 98 | would \t* 98 | would 2 | could 0 | did 0 | might 0 | helped \n", + " 86 | play \t* 86 | play 10 | have 2 | watch 0 | enjoy 0 | hold \n", + " 8 | card \t 44 | board 23 | video * 8 | card 1 | family 1 | computer \n", + " 98 | games \t* 98 | games 1 | game 0 | ##io 0 | tricks 0 | matches \n", + " 100 | of \t*100 | of 0 | and 0 | in 0 | , 0 | to \n", + " 0 | all \t 91 | some 5 | any 1 | every 1 | a 0 | the \n", + " 2 | sort \t 33 | kinds 25 | types 16 | sorts 4 | sizes 2 | ages \n", + " 99 | in \t* 99 | in 0 | around 0 | inside 0 | outside 0 | at \n", + " 82 | the \t* 82 | the 15 | our 1 | my 0 | a 0 | his \n", + " 0 | sitting \t 52 | family 19 | living 14 | dining 2 | back 1 | common \n", + " 100 | room \t*100 | room 0 | area 0 | rooms 0 | hall 0 | ##room \n", + " 100 | . \t*100 | . 0 | and 0 | ; 0 | ! 0 | ... 
\n", + " 43 | as \t* 43 | as 16 | like 15 | with 7 | for 2 | unlike \n", + " 0 | the \t 100 | a 0 | an 0 | another * 0 | the 0 | one \n", + " 10 | kid \t 39 | youngest 27 | child * 10 | kid 5 | baby 3 | oldest \n", + " 99 | , \t* 99 | , 0 | prodigy 0 | then 0 | now 0 | here \n", + " 98 | i \t* 98 | i 0 | he 0 | we 0 | she 0 | dad \n", + " 27 | loved \t 36 | wanted * 27 | loved 17 | used 13 | liked 2 | tried \n", + " 100 | to \t*100 | to 0 | and 0 | playing 0 | watching 0 | being \n", + " 99 | watch \t* 99 | watch 1 | see 0 | watched 0 | play 0 | watching \n", + " 1 | cartoons \t 44 | games 18 | them 14 | movies 8 | cards 3 | kids \n", + " 95 | , \t* 95 | , 4 | . 0 | - 0 | ; 0 | ... \n", + " 38 | but \t 58 | and * 38 | but 2 | so 2 | because 0 | though \n", + " 100 | no \t*100 | no 0 | little 0 | the 0 | zero 0 | without \n", + " 100 | matter \t*100 | matter 0 | to 0 | telling 0 | idea 0 | tell \n", + " 100 | how \t*100 | how 0 | what 0 | however 0 | the 0 | who \n", + " 100 | many \t*100 | many 0 | often 0 | few 0 | numerous 0 | several \n", + " 99 | times \t* 99 | times 0 | kids 0 | nights 0 | years 0 | days \n", + " 97 | i \t* 97 | i 1 | we 0 | he 0 | people 0 | she \n", + " 4 | asked \t 32 | apologized 15 | begged 13 | paid 4 | wished * 4 | asked \n", + " 25 | for \t 41 | about * 25 | for 7 | while 6 | after 3 | without \n", + " 86 | watching \t* 86 | watching 1 | to 1 | just 1 | for 1 | playing \n", + " 72 | them \t* 72 | them 9 | cartoons 4 | it 3 | one 3 | movies \n", + " 100 | , \t*100 | , 0 | - 0 | . 0 | ... 
0 | and \n", + " 99 | my \t* 99 | my 0 | his 0 | the 0 | our 0 | her \n", + " 32 | parents \t* 32 | parents 22 | mother 21 | father 11 | mom 4 | dad \n", + " 0 | would \t 33 | decided 29 | chose 8 | tried 6 | seemed 5 | knew \n", + " 0 | not \t 48 | refuse 36 | have 7 | agree 2 | promise 1 | want \n", + " 0 | to \t 58 | always 26 | have 4 | ever 3 | even 2 | really \n", + " 56 | let \t* 56 | let 9 | believe 8 | bother 4 | stop 3 | tell \n", + " 97 | me \t* 97 | me 2 | go 0 | up 0 | it 0 | on \n", + " 100 | . \t*100 | . 0 | ; 0 | ! 0 | ? 0 | ... \n", + " 0 | [SEP] \t 20 | \" 15 | but 7 | and 5 | so 3 | for \n", + "['[CLS]', 'when', 'i', 'was', 'little', ',', 'friday', \"'\", 's', 'night', 'was', 'our', 'family', 'game', 'night', '.', 'after', 'supper', ',', 'we', 'would', 'play', 'card', 'games', 'of', 'all', 'sort', 'in', 'the', 'sitting', 'room', '.', 'as', 'the', 'kid', ',', 'i', 'loved', 'to', 'watch', 'cartoons', ',', 'but', 'no', 'matter', 'how', 'many', 'times', 'i', 'asked', 'for', 'watching', 'them', ',', 'my', 'parents', 'would', 'not', 'to', 'let', 'me', '.', '[SEP]']\n", + "\n", + "*******************************************************************************************************************\n", + "1\n", + "0.0\n", + "WRB\n", + "\u001b[38;5;15m\u001b[48;5;0mwhen \u001b[0m\n", + "when when None 1\n", + "\n", + "*******************************************************************************************************************\n", + "2\n", + "0.0\n", + "NN\n", + "\u001b[38;5;15m\u001b[48;5;0mi \u001b[0m\n", + "i i None 1\n", + "\n", + "*******************************************************************************************************************\n", + "3\n", + "0.0\n", + "VBD\n", + "\u001b[38;5;15m\u001b[48;5;0mwas \u001b[0m\n", + "was was None 1\n", + "\n", + "*******************************************************************************************************************\n", + "4\n", + "0.6453734100348458\n", + "JJ\n", + 
"\u001b[38;5;226m\u001b[48;5;0mlittle\u001b[0m\u001b[38;5;226m\u001b[48;5;0m \u001b[0m\n", + "little little None 1\n", + "\n", + "*******************************************************************************************************************\n", + "5\n", + "0.0\n", + ",\n", + "\u001b[38;5;15m\u001b[48;5;0m, \u001b[0m\n", + ", , None 1\n", + "\n", + "*******************************************************************************************************************\n", + "6\n", + "5.662634394823419\n", + "NN\n" + ] + }, + { + "ename": "ValueError", + "evalue": "not enough values to unpack (expected 2, got 1)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;31m#text = [\"During my last winter holiday, I went to countryside with my father to visit my grandparents. I find a big change there. The first time I went there, they were living in a small house with dogs, ducks, and another animals. Last winter when I went here again, they had a big separate house to raise dozens of chicken. They also had a small pond which they raised fish. My grandpa said last summer they earned quite a lot by sell the fish. I felt happily that their life had improved. 
At the end of our trip,I told my father that I planned to return for every two years, but he agreed.\"]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtime_start\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0manalyze_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_firstk_probs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m500\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0mtime_end\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'time cost'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtime_end\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mtime_start\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m's'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36manalyze_text\u001b[0;34m(text, masked_tokens, show_suggestions, show_firstk_probs)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mremainer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;31m#[CLS]和[SEP]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext0\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mpart_suggestions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpart_original_tokens\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mpart_avg_gap\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manalyze_part_text\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext0\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mmasked_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_suggestions\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshow_firstk_probs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpart_suggestions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mnew_part_suggestions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0maccumulate_length\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpart_suggestions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36manalyze_part_text\u001b[0;34m(text, masked_tokens, show_suggestions, show_firstk_probs)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mtop_pairs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mshow_lm_probs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmlm_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfirstk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshow_firstk_probs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m#传入的probs是二维的\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mgiven_mask\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 50\u001b[0;31m \u001b[0mavg_gap\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mshow_abnormals\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokens\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmlm_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtext0\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mshow_suggestions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshow_suggestions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 51\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuggestions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moriginal_tokens\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mavg_gap\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mshow_abnormals\u001b[0;34m(tokens, probs, text, show_suggestions)\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m\"NN\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtag\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m#名词\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 77\u001b[0;31m \u001b[0msuggestion\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0manalyse_N\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mprobs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mgap\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mtop_word\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0;36m10\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0;36m2.2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 78\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;34m\"CC\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtag\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m#连词\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36manalyse_N\u001b[0;34m(index, prob, gap, top_word, threshold, threshold2, threshold3)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0mlist_IN\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m[\u001b[0m\u001b[0;34m\"of\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'to'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"at\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"in\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"on\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"by\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"for\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"from\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"with\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"about\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"against\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"along\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"among\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"around\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"as\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"before\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"behind\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"below\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"beside\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"between\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"during\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"besides\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"into\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"near\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"over\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"through\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"under\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"without\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"after\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"above\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mlist_DT_IN\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist_DT\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mlist_IN\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 49\u001b[0;31m \u001b[0msuggestion\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mneed_DT_IN\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0m_\u001b[0m\u001b[0;34m=\u001b[0m 
\u001b[0mgive_suggestion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0minput_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mid_in_sen\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlist_DT_IN\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 50\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mneed_DT_IN\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;31m#不需要冠词或介词\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgap\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mthreshold2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;31m#没有可以替换的词,而且原本该位置的词就勉强符合要求\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mgive_suggestion\u001b[0;34m(input_ids_, input_type_ids_, id_in_sen, alternative_word, threshold)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mT_input_type_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mT_input_type_ids\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0mmlm_logits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mT_input_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mT_input_type_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0mmlm_probs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msoftmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmlm_logits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0mreduced_mlm_probs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mmlm_probs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mid_in_sen\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)" + ] + } + ], + "source": [ + "import time\n", + "# text = [\"Who was Jim Henson? Jim Henson _ a puppeteer.\"]\n", + "#text = [\"Last week I went to the theater. There are many person . Luckily , I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'\"]\n", + "#text = [\"He is my friend.\"]\n", + "text = [\"When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons,but no matter how many times I asked for watching them, my parents would not to let me.They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. \"]\n", + "\n", + "#text = [\"Last week I went to the theater. I had very good seat. The plays was very interesting. However, I didn't enjoy it. A young man and a young woman were sitting behind me. They were talk loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angry. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 
'This is a private conversation!'\"]\n", + "# text = [\"After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. At present, all the above measures have been implemented.\"]\n", + "# text = [\"Early critics of Emily Dickinson's poetry mistook for simplemindedness the surface of artlessness that in fact she constructed with such innocence.\"]\n", + "#text = [\"The journey was long and tired. We left London at five o'clock in the evening and spend eight hours in the train. We had been travelled for 3 hours after someone appeared selling food and drinks. It was darkness all the time we were crossing Wales, but we could see nothing through the windows. When we finally arrived Holyhead nearly , everyone was sleeping. As soon as the train stopped, everybody come to life, grabbing their suitcases and rushing onto the platform.\"]\n", + "#text = [\"When I was little, Friday's night was our family game night. After supper, we would play card games of all sort in the sitting room. As the kid, I loved to watch cartoons,but no matter how many times I asked to watching them, my parents would not to let me. They would say to us that playing card games would help my brain. Still I unwilling to play the games for them sometimes. I didn't realize how right my parents are until I entered high school. The games my parents taught me where I was a child turned out to be very useful later in my life.\"]\n", + "#text = [\"Mr. and Mrs.Zhang all work in our school. 
They live far from the school, and it takes them about a hour and a half to go to work every day. In their spare time, they are interesting in planting vegetables in their garden, that is on the rooftop of their house. They often get up earlier and water the vegetables together. They have also bought for some gardening tools.beside, they often get some useful informations from the internet. When summer came, they will invite their students pick the vegetables!\"]\n", + "#text = ['The question is more easy than that.']\n", + "#text = [\"Last week I go to the zoo. I had a very good seat. The play was very interesting.\"]\n", + "#text =[\"Last week I went to the theater. I had very good seat. The play was very interesting.But I didn't enjoy it. A young man and a young woman were sitting behind me.They were talking loudly. I got very angry.\"]#因为外面有中括号,所以是二维的\n", + "#text = ['It was Monday morning, and the writeing class had just begin.We were tiring. Everyone was silent, wait to see who would be called upon to read his and her paragraph aloud. Some of us were confidont and eagerly take part in the class activity, others were nervous and anxious. I had done myself homework but I was shy. I was afraid that to speak in front of a larger group of people. At that moment, I remembered that my father once said, \"The classroom is a place for learning and that include leaning from textbooks, and mistake as well.\" Immediate, I raised my hand.']\n", + "#text = ['During my last winter holiday, I went to countryside with my father to visit my grandparents. I find a big change there. The first time I went there, they were living in a small house with dogs, ducks, and another animals. Last winter when I went here again, they had a big separate house to raise dozens of chicken. They also had a small pond which they raised fish. My grandpa said last summer they earned quite a lot by sell the fish. I felt happily that their life had improved. 
At the end of our trip,I told my father that I planned to return for every two years, but he agreed.']\n", + "#text = [\"what is justice ? what is good ? what kind of life is a happy life ? how can a justice ' s life benefit human beings ? is it certain that a justice ' s life must lead to happiness ? these problems have already been questioned thousands and hundreds years . they will continue to be questioned . this dissertation tries to discuss the connection between the city - state and the citizen . in the first and the second part of the dissertation the writer tries to make it clear what city - state , citizen and justice mean in the republic . plato ' s idea theory is explained in the third part . and how can his idea theory apply to the education system of the city - state and the happiness of the citizens . the fourth part reviewed old education system and educators , which includes poets and wise men . the poets are criticized for their negative effects to the youth . the fifth part is the education lawmaking of the ideal city - state , together with education means and education principle . the sixth and the seventh parts explain how can the city - state educate qualified soldiers and philosophers . they receive the same nation educate at first which is poetry educatdion and athletics education . some excellent soldiers go into higher category by the selection . they will receive philosopher ' s education , studying some specified subjects . then it makes a conclusion that the education is the only means to attain an ideal city - state .\"]\n", + "#text = ['The head of state immunity principle is an ancient principle of customary international law. Diplomatic privileges and immunity, monarchy personal exemption, and state immunity theory has a close connection. 
By analyzing the interrelation of the three concepts ,them are closely related.and has important effects on the head of state immunity principle.The head of state immunity in criminal is also a widespread international recognition. However, from the beginning of the last century, with the development of international criminal law, the principle has been impacted by the international criminal law. Because the punishments by international criminal institutions, and the individual criminal responsibility shall be investigated for. And the head of state is particular, the implementation of the international crimes is different with general international crime.So,it’s cause some controversial issue. In the part two,according to discusses the main cases about the head of state.After the world war II.We can known that although the practices of international criminal justice institution repeatedly emerge the judgment of the head of state.but,the principls as such as \"official identity independence\" and \"individual criminal responsibility\" emphasize the criminal responsibility of the head of state.Seems the criminal jurisdiction of heads of state immunity can no longer competed the criminal responsibility. But in fact, there still not an common answer to solve the debate. The part three summarizes the reasons of the conflict and description the heads of state immunity is necessity. In the new international situation it’s necessary to reserve the head of state immunity in the criminal rationally. And find some ways to solved this contradiction from the standpoint of draft norm of international law. For example the international community should be improving the international force law norms.']\n", + "#text = [\"During my last winter holiday, I went to countryside with my father to visit my grandparents. I find a big change there. The first time I went there, they were living in a small house with dogs, ducks, and another animals. 
def modify_suggestions(index, direction):
    """Shift the positions stored in the global ``suggestions`` dict.

    Every entry whose key is greater than or equal to ``index`` is moved by
    ``direction`` positions; entries with smaller keys are kept as they are.

    Args:
        index: First position that has to move.
        direction: Signed shift; ``1`` moves entries to the right, ``-1`` to
            the left. ``0`` leaves ``suggestions`` untouched (it used to
            empty the dict).
    """
    global suggestions
    if direction == 0:
        return
    # If two keys end up on the same position the later entry wins, exactly
    # as with the successive dict updates this replaces.
    suggestions = {
        (key if key < index else key + direction): value
        for key, value in suggestions.items()
    }


def display_suggestion():
    """Print every entry of the global ``suggestions`` dict as a table."""
    print("**********************************display_suggestions********************************************************")
    print("| {:50} : {}".format("suggestion", "position in text"))
    print("---------------------------------------------------------------------------------------")
    for key in suggestions:
        print("| {:<50} : {}".format(suggestions[key], key))
    print("*************************************************************************************************************")


def modify_text(index, text):
    """Apply the suggestion stored for ``index`` to the global token list.

    The suggestion is removed from the global ``suggestions`` dict, the
    global ``original_tokens`` list is edited in place, and the remaining
    suggestion positions are shifted to match the edited token list.

    Args:
        index: Position (in ``original_tokens``) of the suggestion to apply.
        text: The text before the modification. It is not needed to build the
            result and is kept only so existing callers keep working.

    Returns:
        A tuple ``([new_text], tokens, suggestions)``: the text rebuilt from
        the edited tokens (wrapped in a list), the edited token list and the
        updated suggestions dict.

    Raises:
        KeyError: If there is no suggestion stored for ``index``.
    """
    global suggestions, original_tokens
    tokens = original_tokens
    suggestion = suggestions[index]
    del suggestions[index]
    suggestion_tokens = suggestion.split(" ")

    # A suggestion may start with a removal marker followed by one more
    # token; both are stripped before the replacement words are applied.
    action = suggestion_tokens[0]
    if action == '去掉前面':  # "remove the preceding token"
        del tokens[index - 1]
        del suggestion_tokens[:2]
        modify_suggestions(index, -1)
        index = index - 1
    elif action == '去掉后面':  # "remove the following token"
        del tokens[index + 1]
        del suggestion_tokens[:2]
        modify_suggestions(index + 2, -1)
    elif action == '去掉':  # "remove this token"
        del tokens[index]
        del suggestion_tokens[:2]
        modify_suggestions(index + 1, -1)
    if '原位置改成' in suggestion_tokens:  # "replace in place with"
        del suggestion_tokens[0]

    if len(suggestion_tokens) == 1:
        # Plain replacement of the token at `index`.
        tokens[index] = suggestion_tokens[0]
    elif len(suggestion_tokens) == 2:
        # Insert one word before the token and replace the token itself.
        tokens.insert(index, suggestion_tokens[0])
        tokens[index + 1] = suggestion_tokens[1]
        modify_suggestions(index + 1, 1)

    # Rebuild the text from everything between the first and last token,
    # gluing "##" word pieces onto the preceding word.
    pieces = [
        word[2:] if word[0:2] == "##" else ' ' + word
        for word in tokens[1:-1]
    ]
    new_text = "".join(pieces)

    original_tokens = tokens
    # Return the rebuilt text; the unmodified input used to be returned here.
    return [new_text], tokens, suggestions
'bert_config.json' +BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased' +config_file = os.path.join(BERT_DIR, CONFIG_NAME) +config = BertConfig.from_json_file(config_file) + +try: + tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'))#do_lower_case:在标记化时将文本转换为小写。默认= True +except: + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +#tokenizer.tokenize = nltk.word_tokenize + +model = BertForMaskedLM.from_pretrained(BERT_DIR) +device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") +_ = model.to(device) +_ = model.eval() + +input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = [],[],[],[],[],[] +suggestions = {} #外部变量,需要传到前端 +original_tokens = [] #外部变量,需要传到前端 + + +# BertForPreTraining: +# Outputs: +# if `masked_lm_labels` and `next_sentence_label` are not `None`: +# Outputs the total_loss which is the sum of the masked language modeling loss and the next +# sentence classification loss. +# if `masked_lm_labels` or `next_sentence_label` is `None`: +# Outputs a tuple comprising +# - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and +# - the next sentence classification logits of shape [batch_size, 2]. + +# from_pretrained: +# Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. +# Download and cache the pre-trained model file if needed. + +# In[254]: + + +import re +def convert_text_to_examples(text): + '''功能: + 把输入的文本变成一个实例,一个实例中包含text_a,text_b(text_b用于是否为上下句的任务,该任务不使用此功能) + 输入: + text:一个列表结构,列表中包含原始文本字符串,由于仅完成mlm任务,所以text列表中仅包含一个字符串,就是待检查的字符串 + 输出: + example:实例,其中包含: + unique_id:此任务仅用到0 + text_a:text列表内的字符串 + text_b:此任务下该变量为None + ''' + examples = [] + unique_id = 0 + if True: + for line in text: + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) #想要匹配这样的字符串'You are my sunshine. ||| I love you.' 
+ + if m is None: + text_a = line + else: + text_a = m.group(1) #匹配的第一句,比如You are my sunshine,my only sunshine. + text_b = m.group(2) #匹配的第二句,比如I love you. + + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples +#print(convert_text_to_examples(['I love you. The cat is so cute.'])[0].text_a) + +def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False): + '''功能: + 把实例变成一个特征列表 + 输入: + examples:实例,convert_text_to_examples()函数的输出 + tokenizer:BERT的tokenizer,用于将文本进行各种处理,它可以把一个text转变成tokens,把tokens变成每个token在词典中的编号以及逆运算 + append_special_tokens:是否允许在生成的tokens中加入特殊符号,也就是[CLS]、[MASK]和[SEP],默认为True + replace_mask:不明 + print_info:不明 + 输出: + features:每一个feature包含: + unique_id:编号,目前实现的功能features里面仅有一个feature + tokens=tokens,tokens:是形如['i','love','you','.']的一个列表 + input_ids=input_ids:字符串中的每个单词在词典中的index序列 + input_mask=input_mask:一堆1 + input_type_ids=input_type_ids)):对text_a,text_b的区分,用于上下句任务,对于本任务,该参数为一个列表,其中包含token长度个的0 + ''' + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) #tokenize的作用是把"i love you."变成['i','love','you','.'] + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + tokens = [] + input_type_ids = [] #segment embedding + if append_special_tokens: #输入参数中默认为true + tokens.append("[CLS]") + input_type_ids.append(0) + for token in tokens_a: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(0) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(1) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(1) + input_ids = tokenizer.convert_tokens_to_ids(tokens) #把原来句子中的词语编成在字典中的编号 + input_mask = [1] * 
len(input_ids) + + if ex_index < 5: +# logger.info("*** Example ***") +# logger.info("unique_id: %s" % (example.unique_id)) + logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) +# logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) +# logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) +# logger.info( +# "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) + + features.append( + InputFeatures( + unique_id=example.unique_id,#编号,目前实现的功能features里面仅有一个feature + tokens=tokens,#形如['i','love','you','.']的一个列表 + input_ids=input_ids,#字符串中的每个单词在词典中的index序列 + input_mask=input_mask, #一堆1 + input_type_ids=input_type_ids)) #第0类和第1类,对text_a,text_b的区分,本代码中全都是零 + return features + +def copy_and_mask_feature(feature, step, masked_tokens=None): + ''' + 功能: + 输入feature生成训练的批次数以及mask好的训练素材 + 输入: + feature:convert_examples_to_features函数的输出 + step:两个[mask]位置的步长 + masked_tokens:默认为None,在程序中没有使用 + ''' + import copy + tokens = feature.tokens + len_token = len(tokens) + if len_token 0 + masked_feature_copies = [] + for i in batches: #用[mask]依次掩盖每一个位置 + feature_copy = copy.deepcopy(feature) + masked_pos = i + while masked_pos < len_token: + feature_copy.input_ids[masked_pos] = tokenizer.vocab["[MASK]"] + masked_pos = masked_pos + step + masked_feature_copies.append(feature_copy) + return masked_feature_copies, batches + +#masked_feature_copies, batches = copy_and_mask_feature(features[0],3) +#print(masked_feature_copies[0].input_ids) #结果[101, 1045, 2293, 103, 102] +#print(batches) #结果是一个range(0,5) + + +# In[7]: + + +analyzed_cache = {} +from pattern.en import conjugate, lemma, lexeme,PRESENT,SG +#print (lemma('gave')) +#print (lexeme('production')) +#print (conjugate(verb='give',tense=PRESENT,number=SG)) +def process_text(text): + ''' + 功能: + 处理输入文本,将文本按句子分成若干token,得出原来text中index位置的单词在x句子的y位置,还得出各个句子类别码 + 输入: + text:文本字符串,注意区别 + 输出: + input_ids_sen:二维列表,第一维列表的元素是每个句子的input_ids列表 + 
input_type_ids_sen:二维列表,第一维列表的元素是每个句子的input_type_ids列表 + in_sentence:通过这个二维数组可以很方便的通过在完整text中的下标找到这个下标所在的句子和在句子中的下标 + sentences:字符串列表,列表中每一个元素是一个句子字符串 + entire_ids:整个text的input_ids + entire_type_ids:整个text的input_type_ids + ''' + token =[] + entire_type_ids = [] + token0 = tokenizer.tokenize(text) + token.append('[CLS]') + entire_type_ids.append(0) + for i in token0: + token.append(i) + entire_type_ids.append(0) + token.append('[SEP]') + entire_type_ids.append(0) + + entire_ids = tokenizer.convert_tokens_to_ids(token) + in_sentence = [[0,0]] + sentence_n = 0 + index = 1 + for i in range(1,len(token)-1): + in_sentence.append([sentence_n,index]) #每个token中的词在所在句中的位置表示出来,以及该位置在哪一句中 + index = index + 1 #比如,位置i这个词在第sentence句的index位置上 + if token[i] == '.': + sentence_n = sentence_n + 1 + index = 1 + sentences = text.split(".") + + sen_token = [] + input_ids_sen = [] + input_type_ids_sen = [] + for i,sentence in enumerate(sentences): + sentence = sentence + '.' + sentences[i] = sentences[i] + '.' 
+ token = [] + input_type_ids = [] + tokens = tokenizer.tokenize(sentence) + token.append('[CLS]') + input_type_ids.append(0) + for i in tokens: + token.append(i) + input_type_ids.append(0) + token.append('[SEP]') + input_type_ids.append(0) + input_ids_sen.append(tokenizer.convert_tokens_to_ids(token)) + input_type_ids_sen.append(input_type_ids) + return input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids + + +# In[8]: + + +def get_word(index): + ''' + 输入: + index:在完整text中的位置 + 输出 + word:该位置上的单词 + ''' + word_id = entire_ids[index] + word = tokenizer.ids_to_tokens[word_id] + return word + + +# In[1559]: + + +import copy +import nltk +from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE + +def give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold): + ''' + 功能: + 给出指定文本指定位置的推荐用词 + 输入: + input_ids_:要分析的文本的input_ids + input_type_ids_:要分析的文本的的input_type_ids + id_in_sen:要分析的文本中[MASK]的位置下标,也就是需要给出建议用词的位置 + alternative_word:推荐的备选词范围 + threshold:阈值 + 输出: + suggestion:推荐 + need:推荐的是否是备选词中的词 + suggestion_prob:推荐词填在id_in_sen位置的概率 + top_of_alternative:备选词中最值得推荐的词 + ''' + input_ids = copy.deepcopy(input_ids_) + input_type_ids = copy.deepcopy(input_type_ids_) + word0 = input_ids[id_in_sen] + word0 = tokenizer.ids_to_tokens[word0] + list_word_id = [] + + input_ids[id_in_sen] = tokenizer.vocab["[MASK]"] + T_input_ids = torch.tensor([input_ids], dtype=torch.long) #把input_ids增加了一个维度 + T_input_type_ids = torch.tensor([input_type_ids], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样 + T_input_ids = T_input_ids.to(device) #拿去GPU + T_input_type_ids = T_input_type_ids.to(device) + + mlm_logits = model(T_input_ids) + mlm_probs = F.softmax(mlm_logits, dim=-1) + reduced_mlm_probs = mlm_probs[0][id_in_sen] + + top_ind = reduced_mlm_probs.argmax().item() + top_prob = reduced_mlm_probs.max().item() + + list_word = [] + + top_of_alternative = None + if 
len(alternative_word)>0: + list_word_prob = {} + for word in alternative_word: + try: + list_word_id.append(tokenizer.vocab[word]) + list_word.append(word) + except KeyError: + pass + + for word,word_id in zip(list_word,list_word_id): + list_word_prob.update({word:float(reduced_mlm_probs[word_id].data)}) + prob_ord = sorted(list_word_prob.items(),key = lambda x:x[1],reverse = True) + + top_prob_word = prob_ord[0][1] + top_of_alternative = prob_ord[0][0] + gap = math.log(top_prob) - math.log(top_prob_word) + + if gap < threshold: + suggestion = prob_ord[0][0] + suggestion_prob = prob_ord[0][1] + need = 1 + else: + suggestion = tokenizer.ids_to_tokens[top_ind] + suggestion_prob = top_prob + need = 0 + #print("gap = " + str(gap)) + #print(prob_ord) + else: + suggestion = tokenizer.ids_to_tokens[top_ind] + suggestion_prob = top_prob + need = 0 + + return suggestion,need,suggestion_prob,top_of_alternative + +#返回变量5 +#suggestion -> 最值得推荐的词 +#need -> 是否需要可选词中的一个 +#suggestion_prob ->最值得推荐的词的概率 +#top_of_alternative -> 可选词中最值得推荐的 +#suggestion,need,suggestion_prob,top_of_alternative = give_suggestion(input_ids_,input_type_ids_,id_in_sen,alternative_word,threshold) + + +# In[1473]: + + +from spacy.lemmatizer import Lemmatizer +from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES +from pattern.en import comparative, superlative +from pattern.en import suggest +from nltk.stem.lancaster import LancasterStemmer +from nltk.stem.porter import PorterStemmer +from nltk.stem import SnowballStemmer +import enchant +d = enchant.Dict("en_US") + + +# In[1474]: + + +stemmers=[] +stemmers.append(LancasterStemmer()) +stemmers.append(SnowballStemmer("english")) +stemmers.append(PorterStemmer()) +lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) +def word_convert(word,new_word,Stemmer): + ''' + 功能: + 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically + 输入: + word:需要变形的词 + new_word:猜想的变形 + 输出: + suggest_word:推荐的正确变形 + ''' + suggest_word = None + word_stem = 
Stemmer().stem(word) + suggest_ = new_word + + suggest_list = suggest(suggest_) + + if len(word) 0.95):# or word_[1] > 0.95 : + suggest_word = word_[0] + break + if word_[1] < 0.001: + break + stem_list = [] + for stemmer in stemmers: + suggest_stem = stemmer.stem(word_[0]) + if flag == 1 and suggest_stem[:-1] in word_stem and word_stem[:3] in suggest_stem[:3]: #一般是去后缀 + suggest_word = word_[0] + break + elif flag == 0 and word_stem in suggest_stem and word_[0][-1:] in suggest_[-1:]: #一般是加后缀,后缀一定要一样 + suggest_word = word_[0] + break + + if suggest_word != None: + break + return suggest_word + + +# In[1475]: + + +stemmers=[] +stemmers.append(LancasterStemmer()) +stemmers.append(SnowballStemmer("english")) +stemmers.append(PorterStemmer()) +lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) +def word_convert(word,new_word,Stemmer): + ''' + 说明; + 与上面的区别是使用的拼写改错算法不同,上面那个平均速度慢,但更符合我的要求,这个平均速度更快 + 功能: + 根据提供的word和可能的变形new_word,得到正确的变形,例如给出basic,basicly得到basically + 输入: + word:需要变形的词 + new_word:猜想的变形 + Stemmer:词根提取器 + 输出: + suggest_word:推荐的正确变形 + ''' + if d.check(new_word)==True: #如果发现new_word拼写正确,则直接返回 + return new_word + else: + suggest_word = None + word_stem = Stemmer().stem(word) + suggest_ = new_word + suggest_list = d.suggest(suggest_) #可能的正确单词列表 + + if len(word)death,success->succeed无能为力''' + + +# In[1477]: + + + +def adj_to_adv(word): + suggest_word = None + if(word == "good"): + return "well" + else: + suggest_ = word + 'ly' + suggest_word = word_convert(word,suggest_,PorterStemmer) + return suggest_word +#如果形容词副词同形,那么他会返回none,但是不影响计算,因为形容词副词同形啊 + + +def adv_to_adj(word): + suggest_word = None + if(word == "well"): + return "good" + elif word[-2:] == 'ly': + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + return suggest_word + + + +# In[1550]: + + +def adj_to_anything(word):#形容词变成其他词性 + suggest_word = None + suggest_list = [] + if word[-1:] == 'y': #举例 healthy->health + suggest_ = word[:-1] + suggest_word = 
word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ful':#举例 successful->success + suggest_ = word[:-3] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ive': #举例 active -> act + suggest_ = word[:-4] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-2:] == 'ed': #举例 interested->interest->interesting + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + suggest_ = suggest_ + 'ing' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + + elif word[-3:] == 'ing':#举例 interesting->interest->interested + suggest_ = word[:-3] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + suggest_ = suggest_ + 'ed' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + + elif word[-4:] == 'less': #举例 careless -> care + suggest_ = word[:-4] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-2:] == 'ly': #举例: friendly -> friend , lovely -> love + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + + elif word[-1:] == 't': #举例 different -> different + suggest_ = word[:-1] + suggest_ = suggest_ + 'ce' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ous': #举例 dangerous -> danger + suggest_ = word[:-3] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word 
!= None: + suggest_list.append(suggest_word) + elif word[-2:] == 'al': #举例 original -> origin + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-4:] == 'able': + suggest_ = word[:-4] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-2:] == 'en': #举例 woolen -> wool + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-2:] == 'ic': + suggest_ = word + 'al' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + suggest_ = word[:-2] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ish': + suggest_ = word[:-3] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word == None: + suggest_ = word[:-3] + suggest_ = suggest_ + 'and' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ese': + suggest_ = word[:-3] + suggest_ = suggest_ + 'a' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-3:] == 'ian': + suggest_ = word[:-1] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word == None: + suggest_ = word[:-3] + suggest_ = suggest_ + 'y' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + if suggest_word == None: + HouZhui_list = ['ment','ness','tion','ture','sion','ty','y','tive','sive'] + for HouZhui in HouZhui_list: + suggest_ = word + HouZhui + new_word = word_convert(word,suggest_,PorterStemmer) + if new_word != None: + suggest_word = 
new_word + suggest_list.append(suggest_word) + suggest_list = list(set(suggest_list)) + return suggest_list + + + + +# In[1551]: + + +def N_to_anything(word):#名词变成其他词性 + suggest_list = [] + list_HouZhui = ['y','ful','tive','sive','ed','ing','less','ly','ous','al','able','en','tic','ish','ance','er','or'] + list_QianZhui = ['a'] + if word[-4:] in ['ment','ness','tion','ture','sion','tive','sive']: + suggest_ = word[:-4] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + else: + for HouZhui in list_HouZhui: + suggest_ = word + HouZhui + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + for QianZhui in list_QianZhui: + suggest_ = QianZhui + word + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + if word[-2:] == 'ce': + suggest_ = word[:-2] + suggest_ = suggest_ + 't' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + elif word[-4:] == 'land': + suggest_ = word[:-4] + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word == None: + suggest_ = suggest_ + 'lish' + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + #print(suggest_list) + suggest_list = list(set(suggest_list)) + return suggest_list + + +# In[1552]: + + +def V_to_anything(word):#动词变成其他词性 + suggest_word = None + suggest_list = [] + + HouZhui_list = ['ful','tive','sive','ed','less','ly','ous','al','able','en','tic','ish','ance','tion','sion','ment','er','or','ee'] + for HouZhui in HouZhui_list: + suggest_ = word + HouZhui + suggest_word = word_convert(word,suggest_,PorterStemmer) + if suggest_word != None: + suggest_list.append(suggest_word) + suggest_list = list(set(suggest_list)) + return suggest_list + + +# In[1553]: + + +''' + 
功能: + 生成形容词,副词关联词表 + 输入: + word:形容词/副词 + 输出: + list_word:为没有添加词的其他形式,包括三音节以下词的比较级最高级 + list_word2:为三音节及以上的词的比较级最高级,如果输入形容词比较级最高级没有more/most,该列表为空 + 说明: + 由于三音节形容词/副词的比较级,最高级为more/most+原形容词/副词,所以特别把形容词/副词和其他词性变形区分出来 +''' + +def build_like_word_adj(word): #创建类似形容词列表 + list_word = [] + list_word2 = [] #把比较级最高级带more的放在这里 + lemmas = lemmatizer(word, u'adj') + #print(lemmas) + for i in lemmas: + list_word.append(i) + word_er = comparative(i) + if "more" in word_er: #把比较级带more,most的词放在另一个列表list_word2 + list_word2.append(word_er) + else: + list_word.append(word_er) + word_est = superlative(i) + if "most" in word_est: + list_word2.append(word_est) + else: + list_word.append(word_est) + word_adv = adj_to_adv(i) + if word_adv != None: + list_word.append(word_adv) + list_N = adj_to_anything(word) + for N in list_N: + list_word.append(N) + + list_word = list(set(list_word)) + return list_word,list_word2 + +def build_like_word_adv(word): #创建类似形容词列表 + list_word = [] + list_word2 = [] + list_special = ['however','seldom','often','never','otherwise'] + if word in list_special: + list_word = [word] + list_word2 = [] + else: + lemmas = lemmatizer(word, u'adj') + #print(lemmas) + for i in lemmas: + list_word.append(i) + word_er = comparative(i) + if "more" in word_er: + list_word2.append(word_er) + else: + list_word.append(word_er) + word_est = superlative(i) + if "most" in word_est: + list_word2.append(word_est) + else: + list_word.append(word_est) + word_adv = adv_to_adj(i) + if word_adv != None: + list_word.append(word_adv) + list_word = list(set(list_word)) + return list_word,list_word2 + + +# In[1554]: + + +''' + 功能: + 根据检查的位置整理出放入BERT模型的input_ids,input_type_ids以及检查位置在input_ids中的下标位置 + pre_training_input_in_sentence得到检查位置所在句子的信息 + pre_training_input_entire得到检查位置在完整text中的信息 + 输入: + index:在完整text中的位置 + 输出: + word:该下标下的单词 + input_ids:tokens的对应字典id列表 + input_type_ids:零列表 + id_in_sen:检查位置在句子中的下标(pre_training_input_in_sentence的返回) + index:检查位置在完整text中的下标,其实就是输入的下标 +''' +def 
pre_training_input_in_sentence(index): + sentence_id = in_sentence[index][0] + id_in_sen = in_sentence[index][1] + word = input_ids_sen[sentence_id][id_in_sen] + word = tokenizer.ids_to_tokens[word] + input_ids = copy.deepcopy(input_ids_sen[sentence_id]) + input_type_ids = copy.deepcopy(input_type_ids_sen[sentence_id]) + + return word,input_ids,input_type_ids,id_in_sen + +def pre_training_input_entire(index): + word = entire_ids[index] + word = tokenizer.ids_to_tokens[word] + input_ids = copy.deepcopy(entire_ids) + input_type_ids = copy.deepcopy(entire_type_ids) + + return word,input_ids,input_type_ids,index + +#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 102] +#[101, 1045, 2572, 3153, 2006, 1996, 2754, 1012, 1045, 2018, 1037, 2200, 2204, 2835, 1012, 1996, 2377, 2001, 2200, 5875, 1012, 102] + + +# In[1555]: + + +import math +from pattern import en +from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE + + +''' + 功能: + 1.judge_and_suggestion系列函数,这个系列函数是在analyse之前做的一个预先判断处理,判断的是该位置原来词的相关词中有没有可以代替它的词 + 2.当相关词中有词的可能性和原词的可能性的差距大于阈值,则认为原词是错的,可以用相关词替换 + 3.替换词的gap还要经过后续的检查才能决定他是不是最好的推荐,这一步骤放在了show_abnormals里 + 输入: + prob:该位置可能性列表 + original:该位置原先的词 + list_word:该位置相关词表 + threhold:门槛,也就是阈值 + 输出: + judge:判断原来的词是否正确,0表示需要换词,1表示不需要换词或者说相关词里面没一个合适的 + suggestion:相关词中最好的推荐 + gap_with_totally_top:备选词中概率最高的和所有词中概率最高的之间的gap,可以换的词也有可能因为gap太大而遭到拒绝 +''' +def judge_and_suggestion(prob,original,list_word,threhold): + top_prob = 0 + list_word = list_word + [original] + original_prob = prob[tokenizer.vocab[original]] + best = None + suggestion = None + for word in list_word: + try: + word_id = tokenizer.vocab[word] + prob_word = prob[word_id] + if prob_word > top_prob: + top_prob = prob_word + best_word = word + except KeyError:#有的词enchant认为是正确的拼写,bert的词典里却没有,比如tiring,这种情况暂时没法解决,但是实际上bert不认的词会自动分词 + pass + + totally_top = prob.max().item() #最高的概率(不需要知道概率最大的词是哪一个) + gap_with_origin = math.log(top_prob) - math.log(original_prob) 
#备选词中最大概率和原来的词的概率的差 + gap_with_totally_top = math.log(totally_top) - math.log(top_prob) #所有词中最高的概率和备选词中最高的概率的差 + + if gap_with_origin > threhold: + suggestion = best_word + return 0,suggestion,gap_with_totally_top + else: + return 1,suggestion,gap_with_totally_top + + + +# In[1556]: + + +'''分析各种词性系列函数 + 功能:对第一遍检查得出的有问题的位置的单词,根据不同的词性进行不同步骤的分析 + 输入: + index:在原文中的错误位置 + prob:该位置可能性列表 + gap:原文该位置的词和概率最高的词之间的gap + top_word:概率最高的词 + threshold:免检查门槛 + threshold2:免修正门槛(勉强不算错) + threshold3:用推荐词替换的最低要求,大于该阈值才可以替换 + 输出: + suggestion:给出的修改建议,修改建议不局限于错误位置 + 说明: + 不仅局限于错误位置的分析是通过预添加或者去掉一个token,多进行一次model计算 +''' + + +# In[1557]: + + +import copy +import nltk +from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE + +def analyse_V(index,prob,gap,top_word,threshold,threshold2,threshold3): +#这是一个处理动词语法问题的函数,输入为问题词在text的token中的下标index + if gap < threshold: + return None + #******************************top_word暗示我应该是不定式************************** + if top_word in ["to","for"]: + wordV,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab['to']) + input_type_ids.append(0) + list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1)] + suggestion,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5) + if need == 1: + return 'to ' + suggestion + + #*****************************判断是不是时态或者拼写错误,又或者是其他词性******** + wordV = get_word(index) + #这三种是不涉及位置变化的检查,根据生成词表的速度从快到慢依次检查,之后也不需要再生成词表 + + list_V = lexeme(wordV) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_V,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_others = V_to_anything(conjugate(verb=wordV,tense=PRESENT,person = 1)) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_others,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_spell_correct = 
d.suggest(wordV) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordV,list_spell_correct,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + + front_word = get_word(index - 1) + behind_word = get_word(index + 1) + #**************************************判断是不是缺介词*************************** + list_IN = ["to","at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of"] + if behind_word not in list_IN: + print("检查点") + wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids.insert(id_in_sen + 1,tokenizer.vocab['at'])#就随便插入一个东西,占位子 + input_type_ids.append(0) + suggestion_IN,need_IN,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_IN,2) + if need_IN == 1: + input_ids[id_in_sen + 1] = tokenizer.vocab[suggestion_IN] + list_word = list_V + suggestion_V,need,_,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,5) + if need == 1: + suggestion = suggestion_V + ' ' + suggestion_IN + return suggestion + + need_to_will = need_be = 0 + + #**************************************判断是不是不定式或者将来时*************************** + if front_word not in ["to","will"]: + wordV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids.insert(id_in_sen,tokenizer.vocab['to'])#就随便插入一个东西,占位子 + input_type_ids.append(0) + try: + input_ids[id_in_sen + 1] = tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,person = 1)] + suggestion_to_will,need_to_will,prob0,_ = give_suggestion(input_ids,input_type_ids,id_in_sen,["to","will"],1) + except KeyError: + need_to_will = 0 + #**********************************判断是不是被动语态或者进行时******************* + list_be = lexeme('be') + list_be = lexeme('be')[:8] #把否定去掉 + 
#********************是不是被动语态**************** + + wordV,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab['be'])#就随便插入一个东西,占位子 + input_type_ids.append(0) + try: + input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PAST,aspect=PROGRESSIVE)] + suggestion1,need_be1,prob1,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1) + except KeyError: + need_be1 = 0 + + #********************是不是现在分词**************** + try: + input_ids[index + 1]=tokenizer.vocab[conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)] + suggestion2,need_be2,prob2,_ = give_suggestion(input_ids,input_type_ids,index,list_be,1) + #print(tokenizer.convert_ids_to_tokens(input_ids)) + except KeyError: + need_be2 = 0 + + #***************************选择是不定式还是被动语态还是进行时**************************** + prob_max = 0 + if need_to_will == 1: + prob_max = max(prob_max,prob0) + if need_be1 == 1: + prob_max = max(prob_max,prob1) + if need_be2 == 1: + prob_max = max(prob_max,prob2) + + if need_to_will == 1 and prob_max == prob0: + need_be = 0 + if need_be1 == 1 and prob_max == prob1: + need_to_will = 0 + need_be = 1 + be_ = suggestion1 + if need_be2 == 1 and prob_max == prob2: + need_to_will = 0 + need_be = 1 + be_ = suggestion2 + #*************************************************处理各种语法****************************************************************** + if need_to_will == 1: + wordV,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab[suggestion_to_will]) + input_type_ids.append(0) + list_word = [conjugate(verb=wordV,tense=PRESENT,person = 1),conjugate(verb=wordV,tense=PRESENT,aspect=PROGRESSIVE)] + suggestion,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5) + if need == 1: + return 'to ' + suggestion + else: + return top_word + + elif need_be == 1: + #********************************被动语态或者进行时***************** + wordV,input_ids,input_type_ids,index = 
pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab[be_]) + input_type_ids.append(0) + list_word = lexeme(wordV) + suggestion,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5) + if need == 1: + return be_ + ' '+ suggestion + else: + return top_word + else: + return top_word + + return suggestion + + + +# In[1558]: + + +def analyse_adj(index,prob,gap,top_word,threshold,threshold2,threshold3): + if gap < threshold: + return None + wordADJ = get_word(index) + #*****************************判断是不是时态或者拼写错误,又或者是其他词性******** + + list_word,list_word2 = build_like_word_adj(wordADJ) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordADJ,list_word,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_spell_correct = d.suggest(wordADJ) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordADJ,list_spell_correct,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + #list_word = list_word + list_spell_correct + front_word = get_word(index - 1) + behind_word = get_word(index + 1) + if front_word in ['more','most'] and len(list_word2) == 0: + #判断是不是比较级使用错误,如果该形容词比较级/最高级不需要加more/most,但是前面有more/most + wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + del input_ids[id_in_sen - 1] + del input_type_ids[0] + suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,min(threshold2, gap - threshold3)) + return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3 + + elif behind_word in ['##er','##r'] and len(list_word2) != 0: + #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est + wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids[id_in_sen] = tokenizer.vocab['more'] + suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_word,min(threshold2, gap - threshold3)) + return '去掉后面 '+ 
get_word(index + 1) + ' 原位置改成 '+ 'more' + ' ' + suggestion5 + + elif behind_word in ['##est','##st'] and len(list_word2) != 0: + #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est + wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids[id_in_sen] = tokenizer.vocab['most'] + suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen + 1,list_word,min(threshold2, gap - threshold3)) + return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ 'most' + ' ' + suggestion5 + + + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + + if front_word not in ['this','that','these','those','more','most']:#检查形容词前面是否需要加冠词或者是需要more,most的比较级,最高级抑或是be动词 + wordADJ,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"]) + input_type_ids.append(0) + list_front = ['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] + suggestion,need_front,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_front,2) + if need_front == 1: + wordADJ,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab[suggestion]) + input_type_ids.append(0) + suggestion2,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,min(threshold2, gap - threshold3)) + if need == 1: + return suggestion + ' ' + suggestion2 + else: + return top_word + + return top_word + + +# In[1600]: + + +def analyse_adv(index,prob,gap,top_word,threshold,threshold2,threshold3): + if gap < threshold: + return None + + wordADV = get_word(index) + if wordADV in ['not']: + return None + #*****************************判断是不是时态或者拼写错误,又或者是其他词性******** + + list_word,list_word2 = build_like_word_adv(wordADV) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordADV,list_word,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + 
list_spell_correct = d.suggest(wordADV) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordADV,list_spell_correct,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + + #list_word = list_word + list_spell_correct + if get_word(index - 1) in ['more','most'] and len(list_word2) == 0: + #判断是不是比较级使用错误,这个if语句处理:该形容词比较级/最高级不需要加more/most,但是前面有more/most + wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + del input_ids[id_in_sen - 1] + del input_type_ids[0] + suggestion3,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen - 1,list_word,5) + return '去掉前面 ' + get_word(index - 1)+ ' 原位置改成 ' + suggestion3 + + elif get_word(index + 1) in ['##er','##r'] and len(list_word2) != 0: + #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est + wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids[id_in_sen] = tokenizer.vocab['more'] + suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,5) + return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ 'more' + ' ' + suggestion5 + + elif get_word(index + 1) in ['##est','##st'] and len(list_word2) != 0: + #判断是不是比较级使用错误,如果该形容词比较级/最高级需要more/most,但是错写成形容词+er/est + wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids[id_in_sen] = tokenizer.vocab['most'] + suggestion5,need_adj,prob,_ = give_suggestion(input_ids,input_type_ids,id_in_sen+1,list_word,5) + return '去掉后面 '+ get_word(index + 1) + ' 原位置改成 '+ 'most' + ' ' + suggestion5 + + else: + #检查形容词前面是否需要加冠词或者是需要more,most的比较级,最高级,be动词 + wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"]) + input_type_ids.append(0) + list_front = 
['the','a','an','this','that','these','those','some','any','all','more','most','am','is','are','was','were'] + suggestion,need_front,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_front,2) + if need_front == 1: + wordADV,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab[suggestion]) + input_type_ids.append(0) + #print(tokenizer.convert_ids_to_tokens(input_ids)) + suggestion2,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_word,5) + if need == 1: + return suggestion + ' ' + suggestion2 + else: + return top_word + else: + wordADV,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + input_ids.insert(id_in_sen + 1,tokenizer.vocab[","]) + input_type_ids.append(0) + suggestion3,need_douhao,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_word,2) + if need_douhao == 1: + return suggestion3 + ' ,' + else: + return top_word + + +# In[1536]: + + +from pattern.en import article,referenced,pluralize, singularize +import nltk +def analyse_N(index,prob,gap,top_word,threshold,threshold2,threshold3): + #这是一个处理名词语法问题的函数,输入为问题词在text的token中的下标index + if gap < threshold: + return None + + wordN = get_word(index) + #*****************************判断是不是时态或者拼写错误,又或者是其他词性******** + word_tag = nltk.pos_tag([wordN]) + if word_tag[0][1] == "NN": + N_ = wordN + N_s= pluralize(wordN) + else: + N_ = singularize(wordN) + N_s= wordN + list_N = [N_,N_s] + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_N,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_others = N_to_anything(N_) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_others,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_spell_correct = d.suggest(wordN) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordN,list_spell_correct,threshold3) + if judge==0 
and gap_with_totally_top < threshold2: + return suggestion + + #*********************************************************************************************************************************** + need_DT = 0 #表示是否需要在前面加冠词 + wordN,input_ids,input_type_ids,id_in_sen = pre_training_input_in_sentence(index) + + #*****************************************判断是否需要冠词或介词************************************************************************ + list_DT = ['the','a','an'] + front_word = get_word(index - 1) + if front_word in list_DT:#如果前一个词就是冠词,那么一定不需要再往前面加介词或冠词 + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + else: + return top_word + + input_ids.insert(id_in_sen,tokenizer.vocab["[MASK]"]) + input_type_ids.append(0) + list_IN = ["of",'to',"at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above"] + list_DT_IN = list_DT + list_IN + suggestion,need_DT_IN,_,_= give_suggestion(input_ids,input_type_ids,id_in_sen,list_DT_IN,2) + if need_DT_IN == 0:#不需要冠词或介词 + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + else: + return top_word + + elif need_DT_IN == 1:#需要冠词或介词 + wordN,input_ids,input_type_ids,index = pre_training_input_entire(index) + input_ids.insert(index,tokenizer.vocab[suggestion]) + input_type_ids.append(0) + suggestion2,need,_,_= give_suggestion(input_ids,input_type_ids,index + 1,list_N ,min(9.5,gap - threshold3)) + if need == 1: + return suggestion + ' ' + suggestion2 + + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + else: + return top_word + + +# In[1537]: + + +''' + 这是一个相关代词的词典,容易混淆的词放在一个列表中 + +''' +like_he = ['he','his','him','himself','who', 'whom', 'whose'] +like_she = ['she','her','herself','hers','who', 'whom', 'whose'] +like_it = ['it','its','itself','who', 'whom', 'whose'] +like_i = ['i','me','my','myself','mine'] +like_you = 
def analyse_pronoun(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Propose a replacement for the pronoun at token position ``index``.

    Args:
        index: position of the suspicious word among the text's tokens.
        prob: vocabulary probability vector for this position; passed
            straight to ``judge_and_suggestion``.
        gap: log-probability gap between the top prediction and the word
            actually written.
        top_word: the model's top prediction (unused here; kept so all
            ``analyse_*`` functions share one signature).
        threshold: below this gap the word is accepted without any check.
        threshold2: upper bound on ``gap_with_totally_top`` for a candidate
            to be accepted; also the gap below which the original word is
            tolerated once its own family offers nothing.
        threshold3: forwarded to ``judge_and_suggestion``.

    Returns:
        The suggested pronoun, or ``None`` when no change is proposed.
    """
    if gap < threshold:
        return None

    wordPROP = get_word(index)

    # First try the words that are easily confused with this pronoun.
    # A word missing from the table simply has no confusable family.
    list_PROP = pronoun_dictionary.get(wordPROP, [])
    # NOTE(review): judge == 0 is treated as "usable candidate found";
    # confirm against judge_and_suggestion's definition.
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordPROP, list_PROP, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # No family member fits and the original word is barely acceptable.
    if gap < threshold2:
        return None

    # Otherwise widen the search to every known pronoun.
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordPROP, pronoun_list, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion
    return None
['some','any',"every",'per','each'] + elif wordDT in ['this','that','these','those']: + list_word = ['this','that','these','those'] + elif wordDT in ['the','a','an']: + list_word = ['the','a','an','some','any'] + elif wordDT in ['another','other']: + list_word = ['another','other'] + elif wordDT in ['all','both']: + list_word = ['all','both'] + else: + list_word = [wordDT] + + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordDT,list_word,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + if gap < threshold2:#没有可以替换的词,而且原本该位置的词就勉强符合要求 + return None + + elif top_word in ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of",'to']: + return top_word + ' ' + wordDT + else: + if top_word in ['some','any','this','that','these','those','the','a','an']: + return top_word + elif wordDT in ['another','other','all','both']: + return None + else: + return "去掉 " + wordDT +# In[1614]: + + +def analyse_IN(index,prob,gap,top_word,threshold,threshold2,threshold3): + #检查介词,确认需不需要删掉或者换介词 + if gap < threshold: + return None + + wordIN = get_word(index) + if wordIN in ['before',"after","above","below","underneath","beneath","without"]:#有实际意义,不做修改 + return None + + list_word = ["at","in","on","by","for","from","with","about","against","along","among","around","as","before","behind","below","beside","between","during","besides","into","near","over","through","under","without","after","above","of",'to'] + + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordIN,list_word,threshold3) + if judge==0 and gap_with_totally_top < threshold2: + return suggestion + + list_spell_correct = d.suggest(wordIN) + judge,suggestion,gap_with_totally_top = judge_and_suggestion(prob,wordIN,list_spell_correct,threshold3) + if judge==0 and gap_with_totally_top < 
def analyse_CC(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Propose a replacement for the conjunction at token position ``index``.

    Only another conjunction (or a comma) from a fixed list may be
    suggested. Returns the suggestion, or ``None`` when no change is
    proposed. ``top_word`` is unused; it is kept so all ``analyse_*``
    functions share one signature.
    """
    if gap < threshold:
        return None

    wordCC = get_word(index)
    list_CC = ["but","because","yet","still","however","although","so","thus","and","or","too","either","or","neither","nor","when","while","as","whenever","since","until","till",","]
    # NOTE(review): judge == 0 is treated as "usable candidate found";
    # confirm against judge_and_suggestion's definition.
    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordCC, list_CC, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # No acceptable conjunction was found; the original returned None on
    # both sides of a `gap < threshold2` test, so that test was dead code.
    return None


def analyse_MD(index, prob, gap, top_word, threshold, threshold2, threshold3):
    """Propose a replacement for the modal verb at token position ``index``.

    A modal may only be swapped with its partner form (can/could,
    may/might, ...); a modal outside the table is compared only with
    itself. Returns the suggestion, or ``None`` when no change is proposed.
    ``top_word`` is unused; it is kept for signature parity.
    """
    if gap < threshold:
        return None

    wordMD = get_word(index)
    modal_pairs = (
        ('can', 'could'),
        ('may', 'might'),
        ('shall', 'should'),
        ('will', 'would'),
        ('dare', 'dared'),
    )
    list_MD = [wordMD]
    for pair in modal_pairs:
        if wordMD in pair:
            list_MD = list(pair)
            break

    judge, suggestion, gap_with_totally_top = judge_and_suggestion(
        prob, wordMD, list_MD, threshold3)
    if judge == 0 and gap_with_totally_top < threshold2:
        return suggestion

    # Same dead `gap < threshold2` conditional as in analyse_CC, removed.
    return None
# Chinese / full-width punctuation and the ASCII character each one becomes.
# Written with escapes so the full-width forms cannot be silently folded to
# ASCII by an editor or a text-normalisation step.
_C_PUN = (
    "\uff0c\u3002\uff01\uff1f"   # full-width comma, ideographic stop, ! ?
    "\u3010\u3011\uff08\uff09"   # black lenticular brackets, full-width ( )
    "\u300a\u300b"               # double angle brackets
    "\u201c\u2018\u201d\u2019"   # curly double/single quotes
    "\uff0e\uff1a\uff1b"         # full-width . : ;
)
_E_PUN = ",.!?[]()<>\"'\"'.:;"
_C_TO_E_TABLE = str.maketrans(_C_PUN, _E_PUN)

# Characters that process_biaodian splits off as stand-alone tokens.
_SEPARATED_PUNCTUATION = frozenset(",.!?[]()<>\"':-;%" + _C_PUN)


def C_trans_to_E(string):
    """Return ``string`` with Chinese punctuation replaced by ASCII.

    Characters without an entry in the table (letters, ASCII punctuation,
    CJK text) are returned unchanged. The table is built once at import
    time instead of on every call.
    """
    return string.translate(_C_TO_E_TABLE)


def process_biaodian(text):
    """Surround punctuation with spaces so ``str.split`` isolates it.

    Args:
        text: a one-element list holding the raw text.

    Returns:
        A one-element list holding the text in which every punctuation
        mark (converted to its ASCII form where one exists) has a space
        on each side.
    """
    pieces = []
    for character in text[0]:
        if character in _SEPARATED_PUNCTUATION:
            pieces.append(' ' + C_trans_to_E(character) + ' ')
        else:
            pieces.append(character)
    # One join instead of repeated string concatenation.
    return [''.join(pieces)]
token_Align(tokens,text): + #tokens是拼写修正之后的文本的分词结果 + #text是原本可能带有拼写错误的文本 + #返回的是text的分词结果 + original_tokens = tokenizer.tokenize(text) + original_tokens = ['[CLS]'] + original_tokens + ['[SEP]'] + print(original_tokens) + length = len(tokens) + i = 0 + while(i < min(length - 1,len(original_tokens) - 1)): + tokens_length = min(length - 1,len(original_tokens) - 1) + if original_tokens[i] == tokens[i] or (i+1 threshold1) or (token == '\r\n' and count_tokens > threshold2): + texts.append([text]) + text = '' + count_tokens = 0 + if count_tokens > 0: + texts.append([text]) + return texts + +# In[1619]: + + +import nltk +from pattern.en import conjugate, lemma, lexeme,PRESENT,SG +''' + 这是一个输出BERT模型训练结果的函数,方便查看调试 +''' +def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): #输出结果的函数,要最高概率topk个输出 + def print_pair(token, prob, end_str='', hit_mark=' '): + if i < firstk: + # token = token.replace('', '').replace('\n', '/n') + print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str) + + ret = None + for i in range(len(tokens)): + ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]] + prob_ = probs[i][ind_].item() #这个probs是该字符串第i个位置上填上词典上各个词的概率,prob_是词典上原来天的这个词的概率 + print_pair(tokens[i], prob_, end_str='\t') + values, indices = probs[i].topk(topk) + #print(values, indices) + #print("****************************************************************************************************") + top_pairs = [] + for j in range(topk): + ind, prob = indices[j].item(), values[j].item() + hit_mark = '*' if ind == ind_ else ' ' + token = tokenizer.ids_to_tokens[ind] + print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\n') + top_pairs.append((token, prob)) + if tokens[i] == "[MASK]": + ret = top_pairs + return ret + + +# In[1621]: + + +def analyse_prob(prob,token): + ind_ = tokenizer.vocab[token] + prob_ = prob[ind_].item() + top_prob = prob.max().item() + top_ind = prob.argmax().item() + top_word = 
def analyse_词性(token, tag):
    """Re-check a part-of-speech tag for a single token.

    The incoming tag is mapped to a WordNet category (verb, adjective,
    adverb, noun). If WordNet has no base form for ``token`` in that
    category, the token is re-tagged on its own with spaCy and that tag
    is returned instead.

    Args:
        token: the word to check.
        tag: the tag assigned to the token by the caller's tagger.

    Returns:
        ``tag`` unchanged when it is not a verb/adjective/adverb/noun tag
        or when WordNet recognises the token in that category; otherwise
        the tag spaCy assigns to the token in isolation.
    """
    if 'VB' in tag:      # any verb form
        tag0 = "v"
    elif "JJ" in tag:    # adjective
        tag0 = "a"
    elif "RB" in tag:    # adverb
        tag0 = "r"
    elif "NN" in tag:    # noun
        tag0 = "n"
    else:
        return tag

    if wn.morphy(token, tag0) is None:
        # Reuse the module-level spaCy pipeline `nlp` rather than calling
        # spacy.load('en') again on every WordNet miss, which re-read the
        # model from disk for each such token.
        doc = nlp(token)
        tag = doc[0].tag_
    return tag
+ tokens_tag = nltk.pos_tag(tokens) #给整个text做词性标注 + for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP] + if tokens[i]=='[UNK]': + continue + top_word,gap = analyse_prob(probs[i],tokens[i]) + print() + print("*******************************************************************************************************************") + print(i) + print(gap) + avg_gap += gap + suggestion = None + #doc = nlp(tokens[i]) #用spacy标记 + #tag = doc[0].tag_ + #tag = nltk.pos_tag([tokens[i]])[0][1] #直接对token标记 + tag = tokens_tag[i][1]#当前tokens的词性,上面是用不同的方法标注词性 + tag = analyse_词性(tokens[i],tag) + print(tag) + + if 'VB' in tag: #如果是动词的各种时态 + suggestion = analyse_V(i,probs[i],gap,top_word,2.5 ,7.9 ,1.8) + + elif "DT" == tag: #如果是冠词(冠词原则上不改变词性) + suggestion = analyse_DT(i,probs[i],gap,top_word,3 ,4 ,1) + + elif "JJ" in tag : #形容词 + suggestion = analyse_adj(i,probs[i],gap,top_word,5 ,8 ,2) + + elif "RB" in tag: #副词 + suggestion = analyse_adv(i,probs[i],gap,top_word,5 ,8 ,2) + + elif "PRP" in tag: #代词 + suggestion = analyse_pronoun(i,probs[i],gap,top_word,3 ,5 ,1.5) + + elif "NN" in tag: #名词 + suggestion = analyse_N(i,probs[i],gap,top_word,4 ,10 ,2.2) + + elif "CC" in tag: #连词 + suggestion = analyse_CC(i,probs[i],gap,top_word,2 ,2.5 ,1.5) + + elif "IN" == tag or 'TO' == tag: #介词 + suggestion = analyse_IN(i,probs[i],gap,top_word,3.5 ,4 ,1.5) + + elif 'MD' in tag: #情态动词 + suggestion = analyse_MD(i,probs[i],gap,top_word,3 ,4 ,1.5) + + elif "CD" in tag: #数词直接pass + pass + + elif "WDT" == tag and gap > 3.5: #who,which,that那些 + suggestion = top_word #推荐的词一般比较准 + + elif tokens[i] in u',.!?[]()<>"\':,。!?【】()《》“‘”’.': + suggestion = analyse_biaodian(i,probs[i],gap,top_word,1.3 ,2 ,1) + + elif gap > 5: + suggestion = top_word + + if (suggestion != None and suggestion.lower() != tokens[i] and suggestion.lower() != original_tokens[i]): #修改存在并且是另外一个词 + suggestions.update({i:suggestion}) + mode = 2 + elif suggestions.__contains__(i)==True: #这是因为之前在拼写检查时已经修改了该位置的单词 + if 
original_tokens[i] == tokens[i]: + del suggestions[i] + mode = 1 + else: + mode = 2 + suggestion = suggestions[i] + else: + if original_tokens[i] != tokens[i]: + mode = 2 + suggestions[i] = tokens[i] + suggestion = tokens[i] + else: + mode = 1 + + print_token(original_tokens[i], suggestion, gap, mode) + print() + print(original_tokens[i],tokens[i],suggestion,mode) + avg_gap /= (len(tokens) - 2) + print() + print('平均gap:'+ str(avg_gap)) + return avg_gap + +def analyze_part_text(text, masked_tokens=None, show_suggestions=True, show_firstk_probs=500): + print("原始文本") + print(text) + step = 15 #用于训练加速的步长,每15个token被mask一个位置 + global input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids,suggestions,original_tokens + suggestions = {}#清空全局变量 + text = process_biaodian(text) + print("标点处理后") + print(text) + text0 = text #保存有拼写错误的文本 + text = correct_spelling(text[0]) #拼写修正过得文本 + print("拼写修正后********************************") + print(text) + print("********************************") + #黄金搭档token_Align放在show_abnormals里面了 + input_ids_sen,input_type_ids_sen,in_sentence,sentences,entire_ids,entire_type_ids = process_text(text[0]) + + examples = convert_text_to_examples(text) + features = convert_examples_to_features(examples, tokenizer, print_info=False) + given_mask = "[MASK]" in features[0].tokens + if not given_mask or masked_tokens is not None: + assert len(features) == 1 + features, batches = copy_and_mask_feature(features[0],step, masked_tokens=masked_tokens) + #print(len(features)) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) #把input_ids增加了一个维度,变成[n_features,sequence_len] + #这里的n_features实际上是句子有多少批训练 + + input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) #把input_type_ids增加了一个维度,其实每一行都一样 + input_ids = input_ids.to(device) + input_type_ids = input_type_ids.to(device) + + mlm_logits = model(input_ids) + mlm_probs = F.softmax(mlm_logits, dim=-1) + tokens = features[0].tokens 
def modify_suggestions(index, direction):
    """Shift the positions stored in the global ``suggestions`` dict.

    When a token is inserted into or removed from the text, every
    suggestion at or after that point must move with it.

    Args:
        index: first position that moves; keys below it are kept as is.
        direction: ``1`` moves keys one place right, ``-1`` one place
            left. Any other value (including ``0``) leaves
            ``suggestions`` untouched; previously such a call replaced it
            with an empty dict.
    """
    global suggestions
    if direction not in (1, -1):
        return
    suggestions = {
        (key + direction if key >= index else key): value
        for key, value in suggestions.items()
    }


def modify_text(index, text):
    """Apply the suggestion stored for position ``index``.

    Updates the global ``original_tokens`` and ``suggestions`` in place.
    A suggestion is either plain replacement text (one word, or two words
    meaning "insert the first, replace with the second") or a command
    built by the ``analyse_*`` functions:

    * ``'去掉前面 W 原位置改成 X'``: delete the previous token, put X here;
    * ``'去掉后面 W 原位置改成 X [Y]'``: delete the next token, put X [Y] here;
    * ``'去掉 W'``: delete this token.

    Args:
        index: position of the token whose suggestion is applied.
        text: the text before the change. Unused; kept so existing
            callers keep working.

    Returns:
        ``([new_text], tokens, suggestions)``: the text rebuilt from the
        edited tokens, the edited token list, and the remaining
        suggestions with their positions adjusted. The first element was
        previously the unmodified input ``text``, although the rebuilt
        text had been computed.

    Raises:
        KeyError: if no suggestion is stored for ``index``.
    """
    global suggestions, original_tokens
    tokens = original_tokens
    suggestion = suggestions.pop(index)
    suggestion_tokens = suggestion.split(" ")

    command = suggestion_tokens[0]
    if command == '去掉前面':
        del tokens[index - 1]
        del suggestion_tokens[:2]          # drop the command and the removed word
        modify_suggestions(index, -1)
        index = index - 1                  # the current token moved left
    elif command == '去掉后面':
        del tokens[index + 1]
        del suggestion_tokens[:2]
        modify_suggestions(index + 2, -1)
    elif command == '去掉':
        del tokens[index]
        del suggestion_tokens[:2]
        modify_suggestions(index + 1, -1)
    if '原位置改成' in suggestion_tokens:
        del suggestion_tokens[0]           # drop the "replace in place with" marker

    if len(suggestion_tokens) == 1:
        tokens[index] = suggestion_tokens[0]
    elif len(suggestion_tokens) == 2:
        # Two words: insert the first before the token, replace the token
        # itself with the second, and shift later suggestions right.
        tokens.insert(index, suggestion_tokens[0])
        tokens[index + 1] = suggestion_tokens[1]
        modify_suggestions(index + 1, 1)

    # Rebuild the text, skipping the first and last token ([CLS]/[SEP])
    # and gluing '##' word-piece continuations onto the previous piece.
    pieces = []
    for word in tokens[1:-1]:
        if word.startswith("##"):
            pieces.append(word[2:])
        else:
            pieces.append(' ' + word)
    new_text = ''.join(pieces)

    original_tokens = tokens
    return [new_text], tokens, suggestions
\u001b[0;36m8\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mtokenized_text\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'[CLS]'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'who'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'was'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'jim'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'henson'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'?'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'[SEP]'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'jim'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'[MASK]'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'was'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'a'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'puppet'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'##eer'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'[SEP]'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# Convert token to vocabulary indices\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "import torch\n", + "from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM\n", + "\n", + "# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows\n", + "import logging\n", + "\n", + "logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',\n", + " datefmt = '%m/%d/%Y %H:%M:%S',\n", + " level = logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "# Load pre-trained model tokenizer (vocabulary)\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "\n", + "# Tokenized input\n", + "text = \"draem\"\n", + "tokenized_text = tokenizer.tokenize(text)\n", + "print(tokenized_text)\n", + "# Mask a token that we will try to predict back with `BertForMaskedLM`\n", 
+ "masked_index = 8\n", + "assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']\n", + "\n", + "# Convert token to vocabulary indices\n", + "indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)\n", + "# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)\n", + "segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]\n", + "\n", + "# Convert inputs to PyTorch tensors\n", + "tokens_tensor = torch.tensor([indexed_tokens])\n", + "segments_tensors = torch.tensor([segments_ids])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('tall', 'NN')]\n" + ] + } + ], + "source": [ + "import nltk\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG\n", + "words = nltk.word_tokenize(\"I don't like the flower.\")\n", + "word_tag = nltk.pos_tag(['tall'])\n", + "print(word_tag)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "nltk.word_tokenize(text):对指定的句子进行分词,返回单词列表\n", + "\n", + "nltk.pos_tag(words):对指定的单词列表进行词性标记,返回标记列表\n", + "\n", + "CC coordinating conjunction\n", + "CD cardinal digit\n", + "DT determiner\n", + "EX existential there (like: \"there is\" ... 
think of it like \"there exists\")\n", + "FW foreign word\n", + "IN preposition/subordinating conjunction\n", + "JJ adjective 'big'\n", + "JJR adjective, comparative 'bigger'\n", + "JJS adjective, superlative 'biggest'\n", + "LS list marker 1)\n", + "MD modal could, will\n", + "NN noun, singular 'desk'\n", + "NNS noun plural 'desks'\n", + "NNP proper noun, singular 'Harrison'\n", + "NNPS proper noun, plural 'Americans'\n", + "PDT predeterminer 'all the kids'\n", + "POS possessive ending parent's\n", + "PRP personal pronoun I, he, she\n", + "PRP$ possessive pronoun my, his, hers\n", + "RB adverb very, silently,\n", + "RBR adverb, comparative better\n", + "RBS adverb, superlative best\n", + "RP particle give up\n", + "TO to go 'to' the store.\n", + "UH interjection errrrrrrrm\n", + "VB verb, base form take\n", + "VBD verb, past tense took\n", + "VBG verb, gerund/present participle taking\n", + "VBN verb, past participle taken\n", + "VBP verb, sing. present, non-3d take\n", + "VBZ verb, 3rd person sing. 
present takes\n", + "WDT wh-determiner which\n", + "WP wh-pronoun who, what\n", + "WP$ possessive wh-pronoun whose\n", + "WRB wh-abverb where, when" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 248, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "my\n", + "cactus\n", + "good\n", + "rock\n", + "python\n", + "friendly\n", + "best\n", + "run\n", + "run\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /home/xd/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + } + ], + "source": [ + "import nltk\n", + "from nltk.stem import WordNetLemmatizer\n", + "nltk.download('wordnet')\n", + "lemmatizer = WordNetLemmatizer()\n", + "\n", + "print(lemmatizer.lemmatize(\"my\"))\n", + "print(lemmatizer.lemmatize(\"cacti\"))\n", + "print(lemmatizer.lemmatize(\"better\",pos=\"a\"))#pos只能是a,v,r,n\n", + "print(lemmatizer.lemmatize(\"rocks\"))\n", + "print(lemmatizer.lemmatize(\"python\"))\n", + "print(lemmatizer.lemmatize(\"friendly\", pos=\"n\"))\n", + "print(lemmatizer.lemmatize(\"best\", pos=\"a\"))\n", + "print(lemmatizer.lemmatize(\"run\"))\n", + "print(lemmatizer.lemmatize(\"run\",'a'))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " taller | JJR \n" + ] + } + ], + "source": [ + "import spacy\n", + "nlp = spacy.load('en')\n", + "doc = nlp(\"taller\")\n", + "for i in range(0,len(doc)):\n", + " print('{: >10} | {: <10}'.format(doc[i].text, doc[i].tag_,))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['apple', 'apples', 'appling', 'appled']\n", + 
"putts\n" + ] + } + ], + "source": [ + "from pattern import en\n", + "from pattern.en import conjugate, lemma, lexeme,PRESENT,SG,INFINITIVE, PRESENT, PAST, FUTURE, PROGRESSIVE \n", + "#print (lemma('better','a'))\n", + "list0 = lexeme('apples')\n", + "\n", + "print(list0)\n", + "#print (lexeme('had'))\n", + "word = \"give\"\n", + "#print( conjugate('purred', '3sg'))\n", + "print (conjugate(verb='putting',tense=PRESENT,person = 3))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Verb conjugation\n", + "The pattern.en module has a lexicon of 8,500 common English verbs and their conjugated forms (infinitive, 3rd singular present, present participle, past and past participle – verbs such as be may have more forms). Some verbs can also be negated, including be, can, do, will, must, have, may, need, dare, ought.\n", + "\n", + "conjugate(verb, \n", + " tense = PRESENT, # INFINITIVE, PRESENT, PAST, FUTURE\n", + " person = 3, # 1, 2, 3 or None\n", + " number = SINGULAR, # SG, PL\n", + " mood = INDICATIVE, # INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE\n", + " aspect = IMPERFECTIVE, # IMPERFECTIVE, PERFECTIVE, PROGRESSIVE \n", + " negated = False, # True or False\n", + " parse = True)\n", + "lemma(verb) # Base form, e.g., are => be.\n", + "lexeme(verb) # List of possible forms: be => is, was, ...\n", + "tenses(verb) # List of possible tenses of the given form.\n", + "The conjugate() function takes the following optional parameters:\n", + "\n", + "Tense\tPerson\tNumber\tMood\tAspect\tAlias\tTag\tExample\n", + "INFINITIVE\tNone\tNone\tNone\tNone\t\"inf\"\tVB\tbe\n", + "PRESENT\t1\tSG\tINDICATIVE\tIMPERFECTIVE\t\"1sg\"\tVBP\tI am\n", + "PRESENT\t2\tSG\tINDICATIVE\tIMPERFECTIVE\t\"2sg\"\t ·\tyou are\n", + "PRESENT\t3\tSG\tINDICATIVE\tIMPERFECTIVE\t\"3sg\"\tVBZ\the is\n", + "PRESENT\tNone\tPL\tINDICATIVE\tIMPERFECTIVE\t\"pl\"\t ·\tare\n", + "PRESENT\tNone\tNone\tINDICATIVE\tPROGRESSIVE\t\"part\"\tVBG\tbeing\n", + " \n", + 
"PAST\tNone\tNone\tNone\tNone\t\"p\"\tVBD\twere\n", + "PAST\t1\tPL\tINDICATIVE\tIMPERFECTIVE\t\"1sgp\"\t ·\tI was\n", + "PAST\t2\tPL\tINDICATIVE\tIMPERFECTIVE\t\"2sgp\"\t ·\tyou were\n", + "PAST\t3\tPL\tINDICATIVE\tIMPERFECTIVE\t\"3gp\"\t ·\the was\n", + "PAST\tNone\tPL\tINDICATIVE\tIMPERFECTIVE\t\"ppl\"\t ·\twere\n", + "PAST\tNone\tNone\tINDICATIVE\tPROGRESSIVE\t\"ppart\"\tVBN\tbeen" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 2, 3], [4, 5, 6]]\n" + ] + } + ], + "source": [ + "import torch\n", + "a = [[1,2,3],[4,5,6]]\n", + "\n", + "torch.tensor(a)\n", + "print(a)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "article(word, function=INDEFINITE) # DEFINITE | INDEFINITE,限定性冠词the或者非限定性冠词a/an\n", + "referenced(word, article=INDEFINITE) # Returns article + word. 返回冠词 + word\n", + "pluralize(word, pos=NOUN, custom={}, classical=True)\n", + "singularize(word, pos=NOUN, custom={})" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a university\n", + "an\n", + "suppers\n", + "supper\n" + ] + } + ], + "source": [ + "from pattern.en import article,referenced,pluralize, singularize\n", + "print(referenced('university'))\n", + "print(article('hour'))\n", + "\n", + "print(pluralize('supper'))\n", + "print(singularize('supper'))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pattern.en' has no attribute 'adjective'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m 
\u001b[0;32mfrom\u001b[0m \u001b[0mpattern\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0men\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m#print( en.is_number(12))\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0men\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madjective\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_emotion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"anxious\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mboolean\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: module 'pattern.en' has no attribute 'adjective'" + ] + } + ], + "source": [ + "from pattern import en\n", + "#print( en.is_number(12))\n", + "print(en.adjective.is_emotion(\"anxious\", boolean=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 382, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'path_similarity' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwordnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msynsets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'basil'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mt\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mpath_similarity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'basic'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'base'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m \u001b[0mwordnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msimilarity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'path_similarity' is not defined" + ] + } + ], + "source": [ + "from pattern.en import wordnet\n", + "\n", + "a = wordnet.synsets('basement')[0]\n", + "b = wordnet.synsets('base')[0]\n", + "c = wordnet.synsets('basil')[0]\n", + "\n", + "t = path_similarity('basic','base')\n", + "print(t)\n", + "print( wordnet.similarity(a, a)) \n", + "print (wordnet.similarity(a, b))\n", + "print( wordnet.similarity(a, c)) " + ] + }, + { + "cell_type": "code", + "execution_count": 263, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loudlier\n" + ] + } + ], + "source": [ + "from pattern.en import comparative, superlative,grade\n", + " \n", + "print (comparative('loudly'))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " if univ_pos in (NOUN, 'NOUN', 'noun'):\n", + " univ_pos = 'noun'\n", + " elif univ_pos in (VERB, 'VERB', 'verb'):\n", + " univ_pos = 'verb'\n", + " elif univ_pos in (ADJ, 'ADJ', 'adj'):\n", + " univ_pos = 'adj'\n", + " elif univ_pos in (PUNCT, 'PUNCT', 'punct'):\n", + " univ_pos = 'punct'" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'Tagger'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mspacy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpipeline\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mTagger\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtagger\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnlp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdoc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnlp\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mu\"I went to countryside with my family.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprocessed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtagger\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'Tagger'" + ] + } + ], + "source": [ + "from spacy.pipeline import Tagger\n", + "tagger = Tagger(nlp.vocab)\n", + "doc = nlp(u\"I went to countryside with my family.\")\n", + "processed = tagger(doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "I i\n", + "went go\n", + "to to\n", + "countryside countryside\n", + "with with\n", + "my my\n", + "family family\n", + ". 
.\n", + "hurrily hurrily\n" + ] + } + ], + "source": [ + "import spacy\n", + "\n", + "print(nlp(u''))\n", + "for tok in nlp(u'I went to countryside with my family.'):\n", + " print (tok, tok.lemma_)\n", + " \n", + "for tok in nlp(u'He tried his best to run hurrily'):\n", + " if tok.text == 'hurrily':\n", + " print (tok, tok.lemma_) " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "better 0 \n", + ". 0 \n" + ] + } + ], + "source": [ + "import spacy\n", + "\n", + "doc = nlp(u\"better.\")\n", + "\n", + "for token in doc:\n", + " print(token, token.lemma, token.lemma_)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ducks'}\n" + ] + } + ], + "source": [ + "from spacy.lemmatizer import Lemmatizer\n", + "from spacy.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "lemmas = lemmatizer(u'ducks', u'NOUN')\n", + "print(lemmas)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('summarise', 0.6666666666666666), ('summarises', 0.3333333333333333)]\n" + ] + } + ], + "source": [ + "from pattern.en import suggest\n", + "word = 'darkment'\n", + "print (suggest('summeries'))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "这里包含中文字符.!?\n" + ] + } + ], + "source": [ + "def C_trans_to_E(string):\n", + " E_pun = u',.!?[]()<>\"\\''\n", + " C_pun = u',。!?【】()《》“‘'\n", + " table= {ord(f):ord(t) for f,t in zip(C_pun,E_pun)}\n", + " return string.translate(table)\n", + "\n", + "s1 = '这里包含中文字符。!?'\n", + "s2 = C_trans_to_E(s1)\n", + "print(s2)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "beauty\n", + "beauti\n", + "beauti\n" + ] + } + ], + "source": [ + "from nltk.stem.lancaster import LancasterStemmer\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import SnowballStemmer\n", + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "for stemmer in stemmers:\n", + " word = stemmer.stem(\"beautiful\")\n", + " print(word)\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n", + "useful\n" + ] + } + ], + "source": [ + "from pattern.en import suggest\n", + "from nltk.stem.lancaster import LancasterStemmer\n", + "from nltk.stem.porter import PorterStemmer\n", + "from nltk.stem import SnowballStemmer\n", + "stemmers=[]\n", + "stemmers.append(LancasterStemmer()) \n", + "stemmers.append(SnowballStemmer(\"english\"))\n", + "stemmers.append(PorterStemmer())\n", + "\n", + "def adj_to_adv(word):\n", + " suggest_word = None\n", + " if(word == \"good\"):\n", + " return \"well\"\n", + " else:\n", + " word_stem = LancasterStemmer().stem(word)\n", + " #print(word_stem)\n", + " suggest_ = word + 'ly'\n", + " #print(suggest_)\n", + " suggest_list = suggest(suggest_)\n", + " #print(suggest_list)\n", + " for word_ in suggest_list:\n", + " stem_list = []\n", + " #print(word_[0])\n", + " for stemmer in stemmers:\n", + " stem_list.append(stemmer.stem(word_[0]))\n", + " #print(stem_list)\n", + " if word_stem in stem_list and word != word_[0]:\n", + " suggest_word = word_[0]\n", + " break\n", + " return suggest_word\n", + "\n", + "def adv_to_adj(word):\n", + " suggest_word = None\n", + " if(word == \"well\"):\n", + " return \"good\" \n", + " else:\n", + " word_stem = 
PorterStemmer().stem(word)\n", + " #print(\"词根\" + word_stem)\n", + " suggest_ = word[:-2]\n", + " #print(word)\n", + " suggest_list = suggest(suggest_)\n", + " #print(suggest_list)\n", + " for word_ in suggest_list:\n", + " stem_list = []\n", + " #print(word_[0])\n", + " for stemmer in stemmers:\n", + " stem_list.append(stemmer.stem(word_[0]))\n", + " #print(stem_list)\n", + " if word_stem in stem_list and word != word_[0]:\n", + " suggest_word = word_[0]\n", + " break\n", + " return suggest_word\n", + "\n", + "print(adj_to_adv(\"difficult\"))\n", + "print(adv_to_adj(\"usefully\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'comparative' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 52\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlist_word\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlist_word2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 54\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuild_like_word_adj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"angry\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 55\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuild_like_word_adj2\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"angry\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuild_like_word_adv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"however\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m\u001b[0m in \u001b[0;36mbuild_like_word_adj\u001b[0;34m(word)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlemmas\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0mlist_word\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m \u001b[0mlist_word\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcomparative\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0mlist_word\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msuperlative\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mword_adv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0madj_to_adv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'comparative' is not defined" + ] + } + ], + "source": [ + "from spacy.lemmatizer import Lemmatizer\n", + "from spacy.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES\n", + "from pattern.en import suggest\n", + "lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)\n", + "#lemmas = lemmatizer(u'best', u'adj')\n", + "\n", + "def build_like_word_adj(word): #创建类似形容词列表\n", + " list_word = []\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " list_word.append(comparative(i))\n", + " list_word.append(superlative(i))\n", + " word_adv = adj_to_adv(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " return list_word\n", + "\n", + "def 
build_like_word_adv(word): #创建类似形容词列表\n", + " list_word = []\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " list_word.append(comparative(i))\n", + " list_word.append(superlative(i))\n", + " word_adj = adv_to_adj(i)\n", + " if word_adj != None:\n", + " list_word.append(word_adj)\n", + " return list_word\n", + "def build_like_word_adj2(word): #创建类似形容词列表\n", + " list_word = []\n", + " list_word2 = [] #把比较级最高级带more的放在这里\n", + " lemmas = lemmatizer(word, u'adj')\n", + " #print(lemmas)\n", + " for i in lemmas:\n", + " list_word.append(i)\n", + " word_er = comparative(i)\n", + " if \"more\" in word_er:\n", + " list_word2.append(word_er)\n", + " else:\n", + " list_word.append(word_er)\n", + " word_est = superlative(i)\n", + " if \"most\" in word_est:\n", + " list_word2.append(word_est)\n", + " else:\n", + " list_word.append(word_est)\n", + " word_adv = adj_to_adv(i)\n", + " if word_adv != None:\n", + " list_word.append(word_adv)\n", + " return list_word,list_word2\n", + "\n", + "print(build_like_word_adj(\"angry\"))\n", + "print(build_like_word_adj2(\"angry\"))\n", + "print(build_like_word_adv(\"however\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 287, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "19 xiaofang -13\n", + "18 xiaofang -13\n" + ] + } + ], + "source": [ + "name = 'Tim' #全局变量\n", + "ids = 130\n", + "def f1():\n", + " age = 18 #局部变量\n", + " print(age,name,ids)\n", + "\n", + " \n", + "def f2():\n", + " age=19 #局部变量\n", + " global name,ids\n", + " name = 'xiaofang'\n", + " ids = -13\n", + " print(age,name,ids)\n", + " f1()\n", + "\n", + "\n", + "f2()" + ] + }, + { + "cell_type": "code", + "execution_count": 302, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/12/2019 16:58:49 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file 
https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/xd/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n" + ] + } + ], + "source": [ + "import os\n", + "import json\n", + "\n", + "import numpy as np\n", + "import math\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "from pylab import rcParams\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig\n", + "from examples.extract_features import *\n", + "\n", + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')#do_lower_case:在标记化时将文本转换为小写。默认= True\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'tokenizer' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mword\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlist_word\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mlist_word_id\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvocab\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mword\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mexcept\u001b[0m 
\u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mword\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'是错误的key'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'tokenizer' is not defined" + ] + } + ], + "source": [ + "list_word_id = []\n", + "list_word = ['angry', 'angrier', 'angriest', 'angrily']\n", + "for word in list_word:\n", + " try:\n", + " list_word_id.append(tokenizer.vocab[word])\n", + " except KeyError:\n", + " print(word + '是错误的key')\n", + "print(list_word_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 304, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2]\n" + ] + } + ], + "source": [ + "a = 1\n", + "b = [2, 3]\n", + "\n", + "def func():\n", + " del b[1]\n", + "\n", + "func()\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['The', 'man', 'is', 'a', 'Chinese', '.', 'He', 'is', \"n't\", 'a', 'bitch', '.']\n" + ] + } + ], + "source": [ + "tokenize = nltk.word_tokenize\n", + "text = tokenize(\"The man is a Chinese . He isn't a bitch.\")\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{1: 'xiaofang', 13: 'xiaoheimao'}\n" + ] + } + ], + "source": [ + "dictionary = {}\n", + "dictionary.update({1:\"xiaofang\"})\n", + "dictionary.update({13:\"xiaoheimao\"})\n", + "\n", + "print(dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "3、词干化\n", + "you are best. it is lemmatize test for spacy. 
I love these books\n", + "you -PRON- 757862\n", + "are be 536\n", + "best best 902\n", + ". . 453\n", + "it it 519\n", + "is is 513\n", + "lemmatize lemmatize 1138934\n", + "test test 1877\n", + "for for 531\n", + "spacy spacy 857539\n", + ". . 453\n", + "I -PRON- 757862\n", + "love love 949\n", + "these these 742\n", + "books book 1300\n" + ] + } + ], + "source": [ + "print(\"\\n3、词干化\")\n", + "test_doc = nlp(u\"you are best. it is lemmatize test for spacy. I love these books\")\n", + "print(test_doc)\n", + "for token in test_doc:\n", + " print(token, token.lemma_, token.lemma)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "basic basic 0.9999999284398302\n", + "basic as 0.3952775975639635\n", + "basic base 0.44361194055627906\n", + "as basic 0.3952775975639635\n", + "as as 0.9999999975007859\n", + "as base 0.3801193503113012\n", + "base basic 0.44361194055627906\n", + "base as 0.3801193503113012\n", + "base base 0.9999999536640364\n" + ] + } + ], + "source": [ + "import spacy\n", + "import en_core_web_md\n", + "nlp = en_core_web_md.load() # make sure to use larger model!\n", + "tokens = nlp(u'basic as base')\n", + "\n", + "for token1 in tokens:\n", + " for token2 in tokens:\n", + " print(token1.text, token2.text, token1.similarity(token2))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "[('increasing', 1.0)]\n" + ] + } + ], + "source": [ + "import enchant\n", + "from pattern.en import suggest\n", + "d = enchant.Dict(\"en_US\")\n", + "\n", + "print(d.check(\"cream\"))\n", + "print(suggest(\"increasing\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('cream', 0.5384615384615384), ('crew', 0.4230769230769231), 
('cret', 0.038461538461538464)]\n", + "pattern的suggest time cost 0.0010383129119873047 s\n", + "['rem', 'creme', 'cream', 'cram', 'chem', 'crew', 'c rem', 'Cree', 'Rem', 'crime', 'crimp']\n", + "enchant的suggest time cost 0.019581079483032227 s\n" + ] + } + ], + "source": [ + "from pattern.en import suggest\n", + "import time\n", + "import enchant\n", + "d = enchant.Dict(\"en_US\")\n", + "time_start=time.time()\n", + "print(suggest(\"crem\"))\n", + "time_end=time.time()\n", + "print('pattern的suggest time cost',time_end-time_start,'s')\n", + "\n", + "time_start=time.time()\n", + "print(d.suggest(\"crem\"))\n", + "time_end=time.time()\n", + "print('enchant的suggest time cost',time_end-time_start,'s')" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('beauteous', 0.5), ('dishy', 0.5)]\n" + ] + } + ], + "source": [ + "from convert_pos import convert\n", + " \n", + "print(convert(\"beauty\", 'n', 'a'))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Synset('happy.a.01'), Synset('felicitous.s.02'), Synset('glad.s.02'), Synset('happy.s.04')]\n" + ] + } + ], + "source": [ + "word = 'happy'\n", + "from_pos = 'a'\n", + "synsets = wn.synsets(word, pos=from_pos)\n", + "print(synsets)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['it', 'was', 'monday', 'morning', ',', 'and', 'the', 'writing', 'class', 'had', 'just', 'begin', '.', 'everyone', 'was', 'silent', ',', 'wait', 'to', 'see', 'who', 'would', 'be', 'called', 'upon', 'to', 'read', 'his', 'and', 'her', 'paragraph', 'aloud', '.', 'some', 'of', 'us', 'were', 'confident', 'and', 'eager', 'take', 'part', 'in', 'the', 'class', 'activity', ',', 'others', 'were', 'nervous', 
'and', 'anxious', '.', 'i', 'had', 'done', 'myself', 'homework', 'but', 'i', 'was', 'shy', '.', 'i', 'was', 'afraid', 'that', 'to', 'speak', 'in', 'front', 'of', 'a', 'larger', 'group', 'of', 'people', '.', 'at', 'that', 'moment', ',', 'i', 'remembered', 'that', 'my', 'father', 'once', 'said', ',', '``', 'the', 'classroom', 'is', 'a', 'place', 'for', 'learning', 'and', 'that', 'include', 'leaning', 'from', 'textbooks', ',', 'and', 'mistake', 'as', 'well', '.', \"''\", 'immediate', ',', 'i', 'raised', 'my', 'huuuand', '.']\n" + ] + } + ], + "source": [ + "text = nltk.word_tokenize('It was Monday morning, and the writing class had just begin. Everyone was silent, wait to see who would be called upon to read his and her paragraph aloud. Some of us were confident and eager take part in the class activity, others were nervous and anxious. I had done myself homework but I was shy. I was afraid that to speak in front of a larger group of people. At that moment, I remembered that my father once said, \"The classroom is a place for learning and that include leaning from textbooks, and mistake as well.\" Immediate, I raised my huuuand.'.lower())\n", + "print(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook likunlin_草稿.ipynb to python\n", + "[NbConvertApp] Writing 14963 bytes to likunlin_草稿.py\n" + ] + } + ], + "source": [ + "try:\n", + " !jupyter nbconvert --to python likunlin_草稿.ipynb\n", + "except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"\n" + ] + } + ], + "source": [ + "def C_trans_to_E(string): #标点符号转换函数\n", + " E_pun = u',.!?[]()<>\"\\''\n", + " C_pun = u',。!?【】()《》“‘'\n", + " table= {ord(f):ord(t) for f,t in zip(C_pun,E_pun)}\n", + " return string.translate(table)\n", + "\n", + 
"print(C_trans_to_E(\"“\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "from nltk.corpus import wordnet as wn\n", + "\n", + "print(wn.morphy('taller', \"n\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "def analyse_词性(token,tag):\n", + " if 'VB' in tag: #如果是动词的各种时态\n", + " tag0 = \"v\"\n", + " elif \"JJ\" in tag : #形容词\n", + " tag0 = \"a\"\n", + " elif \"RB\" in tag: #副词\n", + " tag0 = \"r\"\n", + " elif \"NN\" in tag: #名词\n", + " tag0 = \"n\"\n", + " else:\n", + " return tag\n", + " if wn.morphy(token, tag0)==None:\n", + " tag = nltk.pos_tag([token])[0][1]\n", + " return tag" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('countryside', 'NN')]\n" + ] + } + ], + "source": [ + "tokens = ['I','went','to','countryside','.']\n", + "tokens = ['countryside']\n", + "print(nltk.pos_tag(tokens))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lkl.tar b/lkl.tar new file mode 100644 index 00000000000000..310b50123edd39 Binary files /dev/null and b/lkl.tar differ diff --git a/mnist/processed/test.pt b/mnist/processed/test.pt new file mode 100644 index 00000000000000..eb8ca2281f459d Binary files /dev/null and b/mnist/processed/test.pt differ diff --git 
a/mnist/processed/training.pt b/mnist/processed/training.pt new file mode 100644 index 00000000000000..ccd5b1c99cdc24 Binary files /dev/null and b/mnist/processed/training.pt differ diff --git a/mnist/raw/t10k-images-idx3-ubyte b/mnist/raw/t10k-images-idx3-ubyte new file mode 100644 index 00000000000000..1170b2cae98de7 Binary files /dev/null and b/mnist/raw/t10k-images-idx3-ubyte differ diff --git a/mnist/raw/t10k-labels-idx1-ubyte b/mnist/raw/t10k-labels-idx1-ubyte new file mode 100644 index 00000000000000..d1c3a970612bbd Binary files /dev/null and b/mnist/raw/t10k-labels-idx1-ubyte differ diff --git a/mnist/raw/train-images-idx3-ubyte b/mnist/raw/train-images-idx3-ubyte new file mode 100644 index 00000000000000..bbce27659e0fc2 Binary files /dev/null and b/mnist/raw/train-images-idx3-ubyte differ diff --git a/mnist/raw/train-labels-idx1-ubyte b/mnist/raw/train-labels-idx1-ubyte new file mode 100644 index 00000000000000..d6b4c5db3b5206 Binary files /dev/null and b/mnist/raw/train-labels-idx1-ubyte differ diff --git a/pattern_develop b/pattern_develop new file mode 160000 index 00000000000000..53245196139c6e --- /dev/null +++ b/pattern_develop @@ -0,0 +1 @@ +Subproject commit 53245196139c6ef26dc9c34873dda8a16f236d23 diff --git a/probe_pretrained_model.py b/probe_pretrained_model.py new file mode 100644 index 00000000000000..be03d451adb729 --- /dev/null +++ b/probe_pretrained_model.py @@ -0,0 +1,274 @@ +import os + +import numpy as np +import math +import matplotlib +import matplotlib.pyplot as plt +from pylab import rcParams + +import torch +import torch.nn.functional as F +from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig +from examples.extract_features import * + +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + +CONFIG_NAME = 'bert_config.json' +BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/' +config_file = os.path.join(BERT_DIR, CONFIG_NAME) 
+config = BertConfig.from_json_file(config_file) + +model = BertForPreTraining.from_pretrained(BERT_DIR) +model.eval() + +vis_attn_topk = 3 + +def has_chinese_label(labels): + labels = [label.split('->')[0].strip() for label in labels] + r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1) + return 0 < r < 0.5 # r == 0 means empty query labels used in self attention + +def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'): + assert len(query_labels) == attn.size(0) + assert len(key_labels) == attn.size(1) + + ax1.set_xlim([-1, 1]) + ax1.set_xticks([]) + ax2 = ax1.twinx() + nlabels = max(len(key_labels), len(query_labels)) + pos = range(nlabels) + + if 'self' in attn_name and col < ncols - 1: + query_labels = ['' for _ in query_labels] + + for ax, labels in [(ax1, key_labels), (ax2, query_labels)]: + ax.set_yticks(pos) + if has_chinese_label(labels): + ax.set_yticklabels(labels, fontproperties=zhfont) + else: + ax.set_yticklabels(labels) + ax.set_ylim([nlabels - 1, 0]) + ax.tick_params(width=0, labelsize='xx-large') + + for spine in ax.spines.values(): + spine.set_visible(False) + +# mask, attn = filter_attn(attn) + for qi in range(attn.size(0)): +# if not mask[qi]: +# continue +# for ki in range(attn.size(1)): + for ki in attn[qi].topk(vis_attn_topk)[1]: + a = attn[qi, ki] + ax1.plot((-1, 1), (ki, qi), color, alpha=a) +# print(attn.mean(dim=0).topk(5)[0]) +# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy()) + +def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None): + hypo, nheads, labels_dict = result_tuple + key_labels, query_labels = labels_dict[attn_name] + if heads is None: + heads = range(nheads) + else: + nheads = len(heads) + + stride = 2 if attn_name == 'dec_enc_attns' else 1 + nlabels = max(len(key_labels), len(query_labels)) + rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0)) + + rows = nheads // ncols * stride + fig, axes 
= plt.subplots(rows, ncols) + + # for head in range(nheads): + for head_i, head in enumerate(heads): + row, col = head_i * stride // ncols, head_i * stride % ncols + ax1 = axes[row, col] + attn = hypo[attn_name][layer][head] + _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col) + if attn_name == 'dec_enc_attns': + col = col + 1 + axes[row, col].axis('off') # next subfig acts as blank place holder + # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20) + plt.show() + +ncols = 4 + +import re +def convert_text_to_examples(text): + examples = [] + unique_id = 0 + if True: + for line in text: + line = tokenization.convert_to_unicode(line) + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + +def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False): + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + tokens = [] + input_type_ids = [] + if append_special_tokens: + tokens.append("[CLS]") + input_type_ids.append(0) + for token in tokens_a: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(0) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(1) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + if ex_index < 5 and 
print_info: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (example.unique_id)) + logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) + + features.append( + InputFeatures( + unique_id=example.unique_id, + tokens=tokens, + input_ids=input_ids, + input_mask=input_mask, + input_type_ids=input_type_ids)) + return features + +def copy_and_mask_features(features): + import copy + masked_feature_copies = [] + for feature in features: + for masked_pos in range(len(feature.tokens)): + feature_copy = copy.deepcopy(feature) + feature_copy.input_ids[masked_pos] = tokenizer.vocab["[MASK]"] + masked_feature_copies.append(feature_copy) + return masked_feature_copies + +def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): + def print_pair(token, prob, end_str='', hit_mark=' '): + # token = token.replace('', '').replace('\n', '/n') + print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str) + + for i in range(len(tokens)): + if i >= firstk: + break + ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]] + prob_ = probs[i][ind_].item() + print_pair(tokens[i], prob_, end_str='\t') + values, indices = probs[i].topk(topk) + for j in range(topk): + ind, prob = indices[j].item(), values[j].item() + hit_mark = '*' if ind == ind_ else ' ' + print_pair(tokenizer.ids_to_tokens[ind], prob, hit_mark=hit_mark) + print() + +import colored +from colored import stylize + +def show_abnormals(tokens, probs, show_suggestions=False): + def gap2color(gap): + if gap <= 5: + return 'yellow' + elif gap <= 10: + return 'orange_1' + else: + return 'red_1' + + def print_token(token, suggestion, gap): + if gap == 0: + print(stylize(token + ' ', colored.fg('white') + colored.bg('black')), 
end='') + else: + print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='') + if show_suggestions and gap > 5: + print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='') + else: + print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='') + # print('/' + suggestion, end=' ') + # print('%.2f' % gap, end=' ') + + avg_gap = 0. + for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP] + ind_ = tokenizer.vocab[tokens[i]] + prob_ = probs[i][ind_].item() + top_prob = probs[i].max().item() + top_ind = probs[i].argmax().item() + gap = math.log(top_prob) - math.log(prob_) + suggestion = tokenizer.ids_to_tokens[top_ind] + print_token(tokens[i], suggestion, gap) + avg_gap += gap + avg_gap /= (len(tokens) - 2) + print() + print(avg_gap) + +analyzed_cache = {} + +def analyze_text(text, show_suggestions=False, show_firstk_probs=20): + if text[0] in analyzed_cache: + features, mlm_probs = analyzed_cache[text[0]] + given_mask = "[MASK]" in features[0].tokens + else: + examples = convert_text_to_examples(text) + features = convert_examples_to_features(examples, tokenizer, print_info=False) + given_mask = "[MASK]" in features[0].tokens + if not given_mask: + features = copy_and_mask_features(features) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) + + mlm_logits, _ = model(input_ids, input_type_ids) + mlm_probs = F.softmax(mlm_logits, dim=-1) + + if not given_mask: + seq_len, _, vocab_size = mlm_probs.size() + reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size) + for i in range(seq_len): + reduced_mlm_probs[0, i] = mlm_probs[i, i] + mlm_probs = reduced_mlm_probs + + analyzed_cache[text[0]] = (features, mlm_probs) + + show_lm_probs(features[0].tokens, None, mlm_probs[0], firstk=show_firstk_probs) + if not given_mask: + 
show_abnormals(features[0].tokens, mlm_probs[0], show_suggestions=show_suggestions) + +text = ["Who was Jim Henson? Jim Henson was a puppeteer."] +text = ["I went to school by bus. I was very tired."] +text = ["Last week I went to the theatre. I had a very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention.In the end, I couldn't bear it. I turned round again. 'I can't hear a word!' I said angrily. 'It's none of your business,' the young man said rudely. 'This is a private conversation!'"] +# text = ["Last week I went to the theatre. I had a very good seat. The play was very interesting. But I didn't enjoy it. A young man and a young woman were sitting behind me. They were talking loudly. I got very angry. I couldn't hear a word. I turned round. I looked at the man angrily. They didn't pay any attention."] +# text = ["After the outbreak of the disease, the Ministry of Agriculture and rural areas immediately sent a supervision team to the local. Local Emergency Response Mechanism has been activated in accordance with the requirements, to take blockade, culling, harmless treatment, disinfection and other treatment measures to all disease and culling of pigs for harmless treatment. At the same time, all live pigs and their products are prohibited from transferring out of the blockade area, and live pigs are not allowed to be transported into the blockade area. 
At present, all the above measures have been implemented."] + +# analyze_text(text) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index 7850fa5555e5a4..0ef8263748150b 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -1,6 +1,8 @@ +__version__ = "0.4.0" from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .modeling import (BertConfig, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, - BertForSequenceClassification, BertForQuestionAnswering) + BertForSequenceClassification, BertForMultipleChoice, + BertForTokenClassification, BertForQuestionAnswering) from .optimization import BertAdam from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE diff --git a/pytorch_pretrained_bert/__main__.py b/pytorch_pretrained_bert/__main__.py index 73f1909b43a264..79ad8429323221 100644 --- a/pytorch_pretrained_bert/__main__.py +++ b/pytorch_pretrained_bert/__main__.py @@ -1,5 +1,5 @@ # coding: utf8 -if __name__ == '__main__': +def main(): import sys try: from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch @@ -17,3 +17,6 @@ TF_CONFIG = sys.argv.pop() TF_CHECKPOINT = sys.argv.pop() convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + +if __name__ == '__main__': + main() diff --git a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py index 20fdd8c0d6e856..1ff6c073e32909 100755 --- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py +++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py @@ -50,7 +50,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor name = name.split('/') # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if name[-1] in ["adam_v", "adam_m"]: + if any(n in 
["adam_v", "adam_m", "global_step"] for n in name): print("Skipping {}".format("/".join(name))) continue pointer = model @@ -59,9 +59,9 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel': + if l[0] == 'kernel' or l[0] == 'gamma': pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias': + elif l[0] == 'output_bias' or l[0] == 'beta': pointer = getattr(pointer, 'bias') elif l[0] == 'output_weights': pointer = getattr(pointer, 'weight') diff --git a/pytorch_pretrained_bert/file_utils.py b/pytorch_pretrained_bert/file_utils.py index f734b7e22b114b..43fa8ca87e20ee 100644 --- a/pytorch_pretrained_bert/file_utils.py +++ b/pytorch_pretrained_bert/file_utils.py @@ -45,13 +45,15 @@ def url_to_filename(url: str, etag: str = None) -> str: return filename -def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]: +def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[str, str]: """ Return the url and etag (which may be ``None``) stored for `filename`. Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. """ if cache_dir is None: cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) cache_path = os.path.join(cache_dir, filename) if not os.path.exists(cache_path): @@ -69,7 +71,7 @@ def filename_to_url(filename: str, cache_dir: str = None) -> Tuple[str, str]: return url, etag -def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str: +def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str: """ Given something that might be a URL (or might be a local path), determine which. 
If it's a URL, download the file and cache it, and @@ -80,6 +82,8 @@ def cached_path(url_or_filename: Union[str, Path], cache_dir: str = None) -> str cache_dir = PYTORCH_PRETRAINED_BERT_CACHE if isinstance(url_or_filename, Path): url_or_filename = str(url_or_filename) + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) parsed = urlparse(url_or_filename) @@ -158,13 +162,15 @@ def http_get(url: str, temp_file: IO) -> None: progress.close() -def get_from_cache(url: str, cache_dir: str = None) -> str: +def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. """ if cache_dir is None: cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) os.makedirs(cache_dir, exist_ok=True) @@ -221,7 +227,7 @@ def read_set_from_file(filename: str) -> Set[str]: Expected file format is one item per line. ''' collection = set() - with open(filename, 'r') as file_: + with open(filename, 'r', encoding='utf-8') as file_: for line in file_: collection.add(line.rstrip()) return collection diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 2d6dfa531dc5fd..0826531badaaca 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -1,5 +1,6 @@ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,16 +34,15 @@ from .file_utils import cached_path -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } CONFIG_NAME = 'bert_config.json' @@ -103,7 +103,7 @@ def __init__(self, initializing all weight matrices. 
""" if isinstance(vocab_size_or_config_json_file, str): - with open(vocab_size_or_config_json_file, "r") as reader: + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value @@ -134,7 +134,7 @@ def from_dict(cls, json_object): @classmethod def from_json_file(cls, json_file): """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r") as reader: + with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) @@ -150,22 +150,24 @@ def to_json_string(self): """Serializes this instance to a JSON string.""" return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - -class BertLayerNorm(nn.Module): - def __init__(self, config, variance_epsilon=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). - """ - super(BertLayerNorm, self).__init__() - self.gamma = nn.Parameter(torch.ones(config.hidden_size)) - self.beta = nn.Parameter(torch.zeros(config.hidden_size)) - self.variance_epsilon = variance_epsilon - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u).pow(2).mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.gamma * x + self.beta - +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") + class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). 
+ """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. @@ -178,7 +180,7 @@ def __init__(self, config): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -253,7 +255,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -292,7 +294,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): @@ -320,7 +322,7 @@ class BertEncoder(nn.Module): def __init__(self, config): super(BertEncoder, self).__init__() layer = BertLayer(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) def forward(self, 
hidden_states, attention_mask, output_all_encoded_layers=True): all_encoder_layers = [] @@ -354,7 +356,7 @@ def __init__(self, config): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.transform_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act - self.LayerNorm = BertLayerNorm(config) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -437,28 +439,32 @@ def init_bert_weights(self, module): # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) elif isinstance(module, BertLayerNorm): - module.beta.data.normal_(mean=0.0, std=self.config.initializer_range) - module.gamma.data.normal_(mean=0.0, std=self.config.initializer_range) + module.bias.data.zero_() + module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() @classmethod - def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): """ - Instantiate a PreTrainedBertModel from a pre-trained model file. + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. - + Params: pretrained_model_name: either: - a str with the name of a pre-trained model to load selected in the list of: . `bert-base-uncased` . `bert-large-uncased` . `bert-base-cased` - . `bert-base-multilingual` + . `bert-large-cased` + . `bert-base-multilingual-uncased` + . `bert-base-multilingual-cased` . `bert-base-chinese` - a path or url to a pretrained model archive containing: . `bert_config.json` a configuration file for the model . 
`pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ @@ -476,7 +482,7 @@ def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwarg "associated to this path or url.".format( pretrained_model_name, ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), - pretrained_model_name)) + archive_file)) return None if resolved_archive_file == archive_file: logger.info("loading archive file {}".format(archive_file)) @@ -497,11 +503,26 @@ def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwarg # Load config config_file = os.path.join(serialization_dir, CONFIG_NAME) config = BertConfig.from_json_file(config_file) - logger.info("Model config {}".format(config)) + # logger.info("Model config {}".format(config)) # XD # Instantiate model. model = cls(config, *inputs, **kwargs) - weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load(weights_path) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) missing_keys = [] unexpected_keys = [] @@ -557,7 +578,7 @@ class BertModel(PreTrainedBertModel): of each attention block (i.e.
12 full sequences for BERT-base, 24 for BERT-large), each encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block, + to the last attention block of shape [batch_size, sequence_length, hidden_size], `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a classifier pretrained on top of the hidden state associated to the first character of the input (`CLF`) to train on the Next-Sentence task (see BERT's paper). @@ -567,10 +588,10 @@ class BertModel(PreTrainedBertModel): # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = modeling.BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = modeling.BertModel(config=config) all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) @@ -648,18 +669,18 @@ class BertForPreTraining(PreTrainedBertModel): sentence classification loss. if `masked_lm_labels` or `next_sentence_label` is `None`: Outputs a tuple comprising - - the masked language modeling logits, and - - the next sentence classification logits. + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
Example usage: ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForPreTraining(config) masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) @@ -678,7 +699,7 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss return total_loss @@ -709,20 +730,20 @@ class BertForMaskedLM(PreTrainedBertModel): is only computed for the labels set in [0, ..., vocab_size] Outputs: - if `masked_lm_labels` is `None`: + if `masked_lm_labels` is not `None`: Outputs the masked language modeling loss. if `masked_lm_labels` is `None`: - Outputs the masked language modeling logits. + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
Example usage: ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForMaskedLM(config) masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) @@ -774,7 +795,7 @@ class BertForNextSentencePrediction(PreTrainedBertModel): Outputs the total_loss which is the sum of the masked language modeling loss and the next sentence classification loss. if `next_sentence_label` is `None`: - Outputs the next sentence classification logits. + Outputs the next sentence classification logits of shape [batch_size, 2]. Example usage: ```python @@ -783,8 +804,8 @@ class BertForNextSentencePrediction(PreTrainedBertModel): input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForNextSentencePrediction(config) seq_relationship_logits = model(input_ids, token_type_ids, input_mask) @@ -836,17 +857,17 @@ class BertForSequenceClassification(PreTrainedBertModel): if `labels` is not `None`: Outputs the CrossEntropy classification loss of the output with the labels. if `labels` is `None`: - Outputs the classification logits. + Outputs the classification logits of shape [batch_size, num_labels]. 
Example usage: ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) num_labels = 2 @@ -870,7 +891,142 @@ def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=No if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return loss, logits + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. 
It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_choices]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + def __init__(self, config, num_choices=2): + super(BertForMultipleChoice, self).__init__(config) + self.num_choices = num_choices + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) + _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits
self.classifier(pooled_output) + reshaped_logits = logits.view(-1, self.num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(PreTrainedBertModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss else: return logits @@ -881,15 +1037,7 @@ class BertForQuestionAnswering(PreTrainedBertModel): the sequence output that computes start_logits and end_logits Params: - `config`: either - - a BertConfig class instance with the configuration to build a new model, or - - a str with the name of a pre-trained model to load selected in the list of: - . `bert-base-uncased` - . `bert-large-uncased` - . `bert-base-cased` - . `bert-base-multilingual` - . `bert-base-chinese` - The pre-trained model will be downloaded and cached if needed. + `config`: a BertConfig class instance with the configuration to build a new model. 
Inputs: `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] @@ -914,17 +1062,17 @@ class BertForQuestionAnswering(PreTrainedBertModel): Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. if `start_positions` or `end_positions` is `None`: Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end - position tokens. + position tokens of shape [batch_size, sequence_length]. Example usage: ```python # Already been converted into WordPiece token ids input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) model = BertForQuestionAnswering(config) start_logits, end_logits = model(input_ids, token_type_ids, input_mask) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 4266a8f83ba699..f3d1de0d37b8b6 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -17,6 +17,7 @@ import math import torch from torch.optim import Optimizer +from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ def warmup_cosine(x, warmup=0.002): @@ -52,13 +53,13 @@ class BertAdam(Optimizer): b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 - weight_decay_rate: Weight decay. Default: 0.01 + weight_decay: Weight decay. Default: 0.01 max_grad_norm: Maximum norm for the gradients (-1 means no clipping). 
Default: 1.0 """ - def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, + def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0): - if not lr >= 0.0: + if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) @@ -71,7 +72,7 @@ def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, - b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -139,8 +140,8 @@ def step(self, closure=None): # Instead we want to decay the weights in a manner that doesn't interact # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. 
- if group['weight_decay_rate'] > 0.0: - update += group['weight_decay_rate'] * p.data + if group['weight_decay'] > 0.0: + update += group['weight_decay'] * p.data if group['t_total'] != -1: schedule_fct = SCHEDULES[group['schedule']] diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index c37a7e3b9ee32b..595eb8fdaa92a8 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -25,18 +25,27 @@ from .file_utils import cached_path -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) logger = logging.getLogger(__name__) PRETRAINED_VOCAB_ARCHIVE_MAP = { 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-base-multilingual': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", } +PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { + 'bert-base-uncased': 512, + 'bert-large-uncased': 512, + 'bert-base-cased': 512, + 'bert-large-cased': 512, + 'bert-base-multilingual-uncased': 512, + 'bert-base-multilingual-cased': 512, + 'bert-base-chinese': 512, +} +VOCAB_NAME = 'vocab.txt' def load_vocab(vocab_file): @@ -65,7 +74,9 @@ def 
whitespace_tokenize(text): class BertTokenizer(object): """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - def __init__(self, vocab_file, do_lower_case=True): + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): if not os.path.isfile(vocab_file): raise ValueError( "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " @@ -73,8 +84,10 @@ def __init__(self, vocab_file, do_lower_case=True): self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict( [(ids, tok) for tok, ids in self.vocab.items()]) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) def tokenize(self, text): split_tokens = [] @@ -88,6 +101,12 @@ def convert_tokens_to_ids(self, tokens): ids = [] for token in tokens: ids.append(self.vocab[token]) + if len(ids) > self.max_len: + raise ValueError( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) return ids def convert_ids_to_tokens(self, ids): @@ -98,7 +117,7 @@ def convert_ids_to_tokens(self, ids): return tokens @classmethod - def from_pretrained(cls, pretrained_model_name, do_lower_case=True): + def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): """ Instantiate a PreTrainedBertModel from a pre-trained model file. Download and cache the pre-trained model file if needed. 
@@ -107,16 +126,11 @@ def from_pretrained(cls, pretrained_model_name, do_lower_case=True): vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] else: vocab_file = pretrained_model_name + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) # redirect to the cache, if necessary try: - resolved_vocab_file = cached_path(vocab_file) - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, do_lower_case) + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) except FileNotFoundError: logger.error( "Model name '{}' was not found in model name list ({}). " @@ -124,21 +138,36 @@ def from_pretrained(cls, pretrained_model_name, do_lower_case=True): "associated to this path or url.".format( pretrained_model_name, ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - pretrained_model_name)) - tokenizer = None + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: + # if we're using a pretrained model, ensure the tokenizer wont index sequences longer + # than the number of positional embeddings + max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name] + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. 
+ tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) return tokenizer class BasicTokenizer(object): """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - def __init__(self, do_lower_case=True): + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): """Constructs a BasicTokenizer. Args: do_lower_case: Whether to lower case the input. """ self.do_lower_case = do_lower_case + self.never_split = never_split def tokenize(self, text): """Tokenizes a piece of text.""" @@ -153,7 +182,7 @@ def tokenize(self, text): orig_tokens = whitespace_tokenize(text) split_tokens = [] for token in orig_tokens: - if self.do_lower_case: + if self.do_lower_case and token not in self.never_split: token = token.lower() token = self._run_strip_accents(token) split_tokens.extend(self._run_split_on_punc(token)) @@ -174,6 +203,8 @@ def _run_strip_accents(self, text): def _run_split_on_punc(self, text): """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] chars = list(text) i = 0 start_new_word = True @@ -191,7 +222,7 @@ def _run_split_on_punc(self, text): i += 1 return ["".join(x) for x in output] - + def _tokenize_chinese_chars(self, text): """Adds whitespace around any CJK character.""" output = [] @@ -216,17 +247,17 @@ def _is_chinese_char(self, cp): # space-separated words, so they are not treated specially and handled # like the all of the other languages. 
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # return True - + return False - + def _clean_text(self, text): """Performs invalid character removal and whitespace cleanup on text.""" output = [] @@ -261,7 +292,7 @@ def tokenize(self, text): Args: text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer. + already been passed through `BasicTokenizer`. Returns: A list of wordpiece tokens. diff --git a/requirements.txt b/requirements.txt index e9a3640a9b3a63..f37f11cc540bb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ -# This installs Pytorch for CUDA 8 only. If you are using a newer version, -# please visit http://pytorch.org/ and install the relevant version. -torch>=0.4.1,<0.5.0 +# PyTorch +torch>=0.4.1 # progress bars in model download and training scripts tqdm # Accessing files from S3 directly. 
diff --git a/run_child_finetuning.py b/run_child_finetuning.py new file mode 100644 index 00000000000000..3fd4a66f3f9de4 --- /dev/null +++ b/run_child_finetuning.py @@ -0,0 +1,531 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import logging +import argparse +from tqdm import tqdm, trange +import math + +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForPreTraining +from pytorch_pretrained_bert.optimization import BertAdam + +# from child_generator import make_sentences +# from child_frames import frames + +from torch.utils.data import Dataset, TensorDataset +import random + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO) +logger = logging.getLogger(__name__) + + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x/warmup + return 1.0 - x + + +def rejoin_masked_tokens(tokens): + out = [] + while len(tokens) > 0: + token = tokens.pop(0) + if token not in ['[', ']']: + out.append(token) + else: + assert token == '[' + next_token = tokens.pop(0) # the maksed word + next_next_token = tokens.pop(0) # "]" symbol + out.append(token + next_token + next_next_token) + return out + + +class CHILDDataset(Dataset): + def __init__(self, tokenizer, all_lines, one_sent=False, seq_len=None, dev_percent=0.2): + self.tokenizer = tokenizer + self.one_sent = one_sent + self.seq_len = seq_len + + self.all_lines = all_lines +# self.all_lines = [] +# for frame in frames: +# self.all_lines += make_sentences(**frame) + + random.shuffle(self.all_lines) + + self.examples = [] + cur_id = 0 + for line in self.all_lines: + t1, t2, is_next_label = self.split_sent(line) + + tokens_a = 
self.tokenizer.tokenize(t1) + tokens_a = rejoin_masked_tokens(tokens_a) + + if t2 is None: + tokens_b = None + else: + tokens_b = self.tokenizer.tokenize(t2) + tokens_b = rejoin_masked_tokens(tokens_b) + + example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label) + self.examples.append(example) + cur_id += 1 + + if self.seq_len is None: + # self.seq_len = max([len(example.tokens_a) + 3 for example in self.examples]) + # if example.tokens_b is not None: + # self.seq_len += len(example.tokens_b) + self.seq_len = max([len(example.tokens_a) + len(example.tokens_b) + 3 + if example.tokens_b is not None else len(example.tokens_a) + 2 + for example in self.examples]) + + self.features = [convert_example_to_features(example, self.seq_len, self.tokenizer) for example in self.examples] + + self.n_examples = len(self.all_lines) + self.n_dev = int(self.n_examples * dev_percent) + self.n_train = self.n_examples - self.n_dev + + def get_train_examples(self): + return self.examples[:self.n_train] + + def get_dev_examples(self): + return self.examples[self.n_train:] + + def get_train_features(self): + return self.features[:self.n_train] + + def get_dev_features(self): + return self.features[self.n_train:] + + def build_dataset(self, features): + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_lm_label_ids = torch.tensor([f.lm_label_ids for f in features], dtype=torch.long) + all_is_next = torch.tensor([f.is_next for f in features], dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lm_label_ids, all_is_next) + return dataset + + def __len__(self): + return len(self.all_lines) + + def split_sent(self, line): + label = 0 + if "|||" in line: + t1, t2 = [t.strip() for t in line.split("|||")] + 
assert len(t1) > 0 and len(t2) > 0, "%d %d" % (len(t1), len(t2)) + if self.one_sent: + t1 = t1 + " " + t2 + t2 = None + else: + # assert self.one_sent + t1, t2 = line.strip(), None + return t1, t2, label + + +class InputExample(object): + def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None): + self.guid = guid + self.tokens_a = tokens_a + self.tokens_b = tokens_b + self.is_next = is_next # nextSentence + self.lm_labels = lm_labels # masked words for language model + + +class InputFeatures(object): + def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.is_next = is_next + self.lm_label_ids = lm_label_ids + + +def mask_word(tokens, tokenizer): + output_label = [] + + for i, token in enumerate(tokens): + if token.startswith("[") and token.endswith("]"): # masked word + token = token[1:-1] + tokens[i] = "[MASK]" + output_label.append(tokenizer.vocab[token]) + else: + output_label.append(-1) + + return tokens, output_label + + +def convert_example_to_features(example, max_seq_length, tokenizer): + tokens_a = example.tokens_a + tokens_b = example.tokens_b + + t1_random, t1_label = mask_word(tokens_a, tokenizer) + lm_label_ids = [-1] + t1_label + [-1] + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b is not None and len(tokens_b) > 0: + t2_random, t2_label = mask_word(tokens_b, tokenizer) + lm_label_ids += (t2_label + [-1]) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. 
+ input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + lm_label_ids.append(-1) + + assert len(input_ids) == max_seq_length, '%d != %d' % (len(input_ids), max_seq_length) + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(lm_label_ids) == max_seq_length + + if example.guid < -5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("tokens: %s" % " ".join( + [str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logger.info("LM label: %s " % (lm_label_ids)) + logger.info("Is next sentence label: %s " % (example.is_next)) + + features = InputFeatures(input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + lm_label_ids=lm_label_ids, + is_next=example.is_next) + return features + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + + ## Other parameters + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
\n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--do_train", + action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--learning_rate", + default=3e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumualte before performing a backward/update pass.") + parser.add_argument('--fp16', + action='store_true', + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--loss_scale', + type = float, default = 0, + help = "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" + "0 (default value): dynamic loss scaling.\n" + "Positive power of 2: static loss scaling value.\n") + + args = parser.parse_args() + + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + n_gpu = torch.cuda.device_count() + else: + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + n_gpu = 1 + # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.distributed.init_process_group(backend='nccl') + logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + device, n_gpu, bool(args.local_rank != -1), args.fp16)) + + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + if not args.do_train and not args.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + if os.path.exists(args.output_dir) and os.listdir(args.output_dir): + raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + os.makedirs(args.output_dir, exist_ok=True) + + BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/' + tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_DIR, 'vocab.txt'), do_lower_case=args.do_lower_case) + + #train_examples = None + num_train_steps = None + if args.do_train: + print("Loading Train Dataset", args.train_file) + train_features = CHILDDataset(tokenizer).get_train_features() + num_train_steps = int( + len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * 
args.num_train_epochs) + + # Prepare model + model = BertForMaskedLM.from_pretrained(BERT_DIR) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer + from apex.optimizers import FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + bias_correction=False, + max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_steps) + + global_step = 0 + if args.do_train: + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_steps) + + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = 
torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + all_lm_label_ids = torch.tensor([f.lm_label_id for f in train_features], dtype=torch.long) + all_is_next = torch.tensor([f.is_next for f in train_features], dtype=torch.long) + train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lm_label_ids, all_is_next) + + if args.local_rank == -1: + train_sampler = RandomSampler(train_dataset) + else: + #TODO: check if this works with current data generator from disk that relies on file.__next__ + # (it doesn't return item back by index) + train_sampler = DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) + + if args.do_eval: + eval_features = CHILDDataset(tokenizer).get_dev_features() + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + all_lm_label_ids = torch.tensor([f.lm_label_id for f in eval_features], dtype=torch.long) + all_is_next = torch.tensor([f.is_next for f in eval_features], dtype=torch.long) + eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_lm_label_ids, all_is_next) + + eval_sampler = SequentialSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) + + logger.info("Epoch 0") + logger.info("Evaluating on train set...") + validate(model, train_dataloader) + logger.info("Evaluating on valid set...") + validate(model, eval_dataloader) + + for epoch in trange(int(args.num_train_epochs), desc="Epoch"): + model.train() + tr_loss = 0 + nb_tr_examples, nb_tr_steps = 0, 0 + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, 
segment_ids, lm_label_ids, is_next = batch + loss = model(input_ids, segment_ids, input_mask, lm_label_ids) + if n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu. + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + if args.fp16: + optimizer.backward(loss) + else: + loss.backward() + tr_loss += loss.item() + nb_tr_examples += input_ids.size(0) + nb_tr_steps += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + # modify learning rate with special warm up BERT uses + lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + + if args.do_eval: + logger.info("Epoch %d" % epoch + 1) + logger.info("Evaluating on train set...") + validate(model, train_dataloader) + logger.info("Evaluating on valid set...") + validate(model, eval_dataloader) + + # Save a trained model + logger.info("** ** * Saving fine - tuned model ** ** * ") + model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self + output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") + if args.do_train: + torch.save(model_to_save.state_dict(), output_model_file) + + +def validate(model, dataset, device, batch_size=128, randomized=False): + model.eval() + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + +# for input_ids, input_mask, segment_ids, label_ids, is_next in tqdm(eval_dataloader, desc="Evaluating"): + for i, batch_idx in enumerate(get_batch_index(len(dataset), batch_size, randomized=randomized)): + batch = tuple(t[batch_idx] for t in dataset.tensors) + batch = tuple(t.to(device) for t in batch) + input_ids, input_mask, segment_ids, label_ids, is_next = batch + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + 
label_ids = label_ids.to(device) + + with torch.no_grad(): + tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) + logits = model(input_ids, segment_ids, input_mask) + + logits = logits.detach().cpu().numpy() + label_ids = label_ids.to('cpu').numpy() + tmp_eval_accuracy = accuracy(logits, label_ids) + + eval_loss += tmp_eval_loss.mean().item() + eval_accuracy += tmp_eval_accuracy + + nb_eval_examples += input_ids.size(0) + nb_eval_steps += 1 + + eval_loss = eval_loss / nb_eval_steps + eval_accuracy = eval_accuracy / nb_eval_examples +# loss = tr_loss/nb_tr_steps if args.do_train else None + result = {'eval_loss': eval_loss, + 'eval_accuracy': eval_accuracy,} +# 'global_step': global_step, +# 'loss': loss} + + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + + +def get_batch_index(dataset_size, batch_size, randomized=False): + import math + idx_list = list(range(dataset_size)) + if randomized: + random.shuffle(idx_list) + n_batches = math.ceil(len(idx_list) / batch_size) + return [idx_list[i * batch_size: (i + 1) * batch_size] for i in range(n_batches)] + + +def accuracy(out, labels): + outputs = np.argmax(out, axis=-1) +# return int(np.all((outputs == labels)[labels != -1])) + return int(np.sum((outputs == labels)[labels != -1])) + + +if __name__ == "__main__": + main() diff --git a/score.py b/score.py new file mode 100644 index 00000000000000..f1c3224c0b9d4a --- /dev/null +++ b/score.py @@ -0,0 +1,92 @@ +from collections import OrderedDict +import re +import json +import os +from pprint import pprint + + +def score(): + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'WSC_child_problem.json') + with open(path, 'r') as f: + data_l = json.load(f) + f.close() + + result = [] + s_order = ['sentence', 'answer1', 'answer0', 'correct_answer', 'adjacent_ref', 'predict_answer', 'score'] + data_order = ['index', 'sentences'] + for data in data_l: + if 
data['sentences'] != []: + for i in range(len(data['sentences'])): + s = data['sentences'][i] + score = 0 + if s['predict_answer'] != []: + predict_answer = s['predict_answer'][0] + if any(answer.lower() == predict_answer[0] for answer in s['correct_answer']): + score = 1 + s['score'] = score + s = OrderedDict(sorted(s.items(), key=lambda i:s_order.index(i[0]))) + data['sentences'][i] = s + data = OrderedDict(sorted(data.items(), key=lambda i:data_order.index(i[0]))) + result.append(data) + + print('Save the score in WSC_child_problem.json\n') + with open(path, 'w') as f: + json.dump(result, f, indent=4, separators=(',', ': '), ensure_ascii=False) + f.close() + + total_score = 0 + total_valid_problems = 0 + l = {} + for r in result: + for s in r['sentences']: + if 'score' in s: + total_valid_problems += 1 + score = s['score'] + total_score += score + if r['index'] not in l.keys(): + l[r['index']] = [0, 1] + else: + l[r['index']][1] += 1 + if score == 1: + l[r['index']][0] += 1 + print('Correct problems:') + pprint(l) + print() + + print('Score each valid problems:') + description = ' Total valid problems: {0}\n Correct answers: {1}\n Accuracy: {2}' + print(description.format(total_valid_problems, total_score, float(total_score/total_valid_problems))) + + print() + result_dict = {} + for r in result: + for s in r['sentences']: + if 'score' in s: + index = r['index'] + if index < 252: + if index % 2 == 1: + index -= 1 + elif index in [252, 253, 254]: + index = 252 + else: + if index % 2 == 0: + index -= 1 + if index in result_dict.keys(): + result_dict[index].append(s) + else: + result_dict[index] = [s] + + total_score = 0 + for key in result_dict.keys(): + score = 1 + for s in result_dict[key]: + if s['score'] == 0: + score = 0 + total_score += score + print('Score each valid problem groups:') + description = ' Total valid problems: {0}\n Correct answers: {1}\n Accuracy: {2}' + print(description.format(len(result_dict), total_score, 
float(total_score/len(result_dict)))) + + +if __name__ == '__main__': + score() diff --git a/setup.py b/setup.py index 9b2a678832005e..e9b5c077c4914e 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,47 @@ +""" +Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py + +To create the package for pypi. + +1. Change the version in __init__.py and setup.py. + +2. Commit these changes with the message: "Release: VERSION" + +3. Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " + Push the tag to git: git push --tags origin master + +4. Build both the sources and the wheel. Do not change anything in setup.py between + creating the wheel and the source distribution (obviously). + + For the wheel, run: "python setup.py bdist_wheel" in the top level allennlp directory. + (this will build a wheel for the python version you use to build it - make sure you use python 3.x). + + For the sources, run: "python setup.py sdist" + You should now have a /dist directory with both .whl and .tar.gz source versions of allennlp. + +5. Check that everything looks correct by uploading the package to the pypi test server: + + twine upload dist/* -r pypitest + (pypi suggest using twine as other methods upload files via plaintext.) + + Check that you can install it in a virtualenv by running: + pip install -i https://testpypi.python.org/pypi allennlp + +6. Upload the final version to actual pypi: + twine upload dist/* -r pypi + +7. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory. 
+ +""" from setuptools import find_packages, setup setup( name="pytorch_pretrained_bert", - version="0.2.0", + version="0.4.0", author="Thomas Wolf, Victor Sanh, Tim Rault, Google AI Language Team Authors", author_email="thomas@huggingface.co", description="PyTorch version of Google AI BERT model with script to load Google pre-trained models", - long_description=open("README.md", "r").read(), + long_description=open("README.md", "r", encoding='utf-8').read(), long_description_content_type="text/markdown", keywords='BERT NLP deep learning google', license='Apache', @@ -18,7 +53,11 @@ 'boto3', 'requests', 'tqdm'], - scripts=["bin/pytorch_pretrained_bert"], + entry_points={ + 'console_scripts': [ + "pytorch_pretrained_bert=pytorch_pretrained_bert.__main__:main" + ] + }, python_requires='>=3.5.0', tests_require=['pytest'], classifiers=[ diff --git a/test_WSC_child_problem.py b/test_WSC_child_problem.py new file mode 100644 index 00000000000000..7c731feb81712b --- /dev/null +++ b/test_WSC_child_problem.py @@ -0,0 +1,345 @@ +from IPython.core.interactiveshell import InteractiveShell +InteractiveShell.ast_node_interactivity = 'all' + +import os +import json + +import numpy as np +import math +import matplotlib +import matplotlib.pyplot as plt +from pylab import rcParams + +import torch +import torch.nn.functional as F +from pytorch_pretrained_bert import tokenization, BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, BertConfig +from examples.extract_features import * + +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + +CONFIG_NAME = 'bert_config.json' +BERT_DIR = '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/' +config_file = os.path.join(BERT_DIR, CONFIG_NAME) +config = BertConfig.from_json_file(config_file) +model = BertForPreTraining.from_pretrained(BERT_DIR) +model.eval() +class Args: + def __init__(self): + pass + +args = Args() +args.no_cuda = False + +device = torch.device("cuda" if torch.cuda.is_available() and not 
args.no_cuda else "cpu") +model.to(device) + +vis_attn_topk = 3 + + +def has_chinese_label(labels): + labels = [label.split('->')[0].strip() for label in labels] + r = sum([len(label) > 1 for label in labels if label not in ['BOS', 'EOS']]) * 1. / (len(labels) - 1) + return 0 < r < 0.5 # r == 0 means empty query labels used in self attention + +def _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col, color='b'): + assert len(query_labels) == attn.size(0) + assert len(key_labels) == attn.size(1) + + ax1.set_xlim([-1, 1]) + ax1.set_xticks([]) + ax2 = ax1.twinx() + nlabels = max(len(key_labels), len(query_labels)) + pos = range(nlabels) + + if 'self' in attn_name and col < ncols - 1: + query_labels = ['' for _ in query_labels] + + for ax, labels in [(ax1, key_labels), (ax2, query_labels)]: + ax.set_yticks(pos) + if has_chinese_label(labels): + ax.set_yticklabels(labels, fontproperties=zhfont) + else: + ax.set_yticklabels(labels) + ax.set_ylim([nlabels - 1, 0]) + ax.tick_params(width=0, labelsize='xx-large') + + for spine in ax.spines.values(): + spine.set_visible(False) + +# mask, attn = filter_attn(attn) + for qi in range(attn.size(0)): +# if not mask[qi]: +# continue +# for ki in range(attn.size(1)): + for ki in attn[qi].topk(vis_attn_topk)[1]: + a = attn[qi, ki] + ax1.plot((-1, 1), (ki, qi), color, alpha=a) +# print(attn.mean(dim=0).topk(5)[0]) +# ax1.barh(pos, attn.mean(dim=0).data.cpu().numpy()) + +def plot_layer_attn(result_tuple, attn_name='dec_self_attns', layer=0, heads=None): + hypo, nheads, labels_dict = result_tuple + key_labels, query_labels = labels_dict[attn_name] + if heads is None: + heads = range(nheads) + else: + nheads = len(heads) + + stride = 2 if attn_name == 'dec_enc_attns' else 1 + nlabels = max(len(key_labels), len(query_labels)) + rcParams['figure.figsize'] = 20, int(round(nlabels * stride * nheads / 8 * 1.0)) + + rows = nheads // ncols * stride + fig, axes = plt.subplots(rows, ncols) + + # for head in range(nheads): + for head_i, 
head in enumerate(heads): + row, col = head_i * stride // ncols, head_i * stride % ncols + ax1 = axes[row, col] + attn = hypo[attn_name][layer][head] + _plot_attn(ax1, attn_name, attn, key_labels, query_labels, col) + if attn_name == 'dec_enc_attns': + col = col + 1 + axes[row, col].axis('off') # next subfig acts as blank place holder + # plt.suptitle('%s with %d heads, Layer %d' % (attn_name, nheads, layer), fontsize=20) + plt.show() + +ncols = 4 +import re +def convert_text_to_examples(text): + examples = [] + unique_id = 0 + if True: + for line in text: + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + +def convert_examples_to_features(examples, tokenizer, append_special_tokens=True, replace_mask=True, print_info=False): + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + tokens = [] + input_type_ids = [] + if append_special_tokens: + tokens.append("[CLS]") + input_type_ids.append(0) + for token in tokens_a: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(0) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + if replace_mask and token == '_': # XD + token = "[MASK]" + tokens.append(token) + input_type_ids.append(1) + if append_special_tokens: + tokens.append("[SEP]") + input_type_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + input_mask = [1] * len(input_ids) + + if ex_index < 5 and print_info: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (example.unique_id)) + logger.info("tokens: %s" % " 
".join([str(x) for x in tokens])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) + + features.append( + InputFeatures( + unique_id=example.unique_id, + tokens=tokens, + input_ids=input_ids, + input_mask=input_mask, + input_type_ids=input_type_ids)) + return features + +def copy_and_mask_features(features): + import copy + masked_feature_copies = [] + for feature in features: + for masked_pos in range(len(feature.tokens)): + feature_copy = copy.deepcopy(feature) + feature_copy.input_ids[masked_pos] = tokenizer.vocab["[MASK]"] + masked_feature_copies.append(feature_copy) + return masked_feature_copies + +def show_lm_probs(tokens, input_ids, probs, topk=5, firstk=20): + def print_pair(token, prob, end_str='', hit_mark=' '): + if i < firstk: + # token = token.replace('', '').replace('\n', '/n') + print('{}{: >3} | {: <12}'.format(hit_mark, int(round(prob*100)), token), end=end_str) + + ret = None + for i in range(len(tokens)): + ind_ = input_ids[i].item() if input_ids is not None else tokenizer.vocab[tokens[i]] + prob_ = probs[i][ind_].item() + print_pair(tokens[i], prob_, end_str='\t') + values, indices = probs[i].topk(topk) + top_pairs = [] + for j in range(topk): + ind, prob = indices[j].item(), values[j].item() + hit_mark = '*' if ind == ind_ else ' ' + token = tokenizer.ids_to_tokens[ind] + print_pair(token, prob, hit_mark=hit_mark, end_str='' if j < topk - 1 else '\n') + top_pairs.append((token, prob)) + if tokens[i] == "[MASK]": + ret = top_pairs + return ret + +import colored +from colored import stylize + +def show_abnormals(tokens, probs, show_suggestions=False): + def gap2color(gap): + if gap <= 5: + return 'yellow_1' + elif gap <= 10: + return 'orange_1' + else: + return 'red_1' + + def print_token(token, suggestion, gap): + if gap == 0: + print(stylize(token + ' ', 
colored.fg('white') + colored.bg('black')), end='') + else: + print(stylize(token, colored.fg(gap2color(gap)) + colored.bg('black')), end='') + if show_suggestions and gap > 5: + print(stylize('/' + suggestion + ' ', colored.fg('green' if gap > 10 else 'cyan') + colored.bg('black')), end='') + else: + print(stylize(' ', colored.fg(gap2color(gap)) + colored.bg('black')), end='') + # print('/' + suggestion, end=' ') + # print('%.2f' % gap, end=' ') + + avg_gap = 0. + for i in range(1, len(tokens) - 1): # skip first [CLS] and last [SEP] + ind_ = tokenizer.vocab[tokens[i]] + prob_ = probs[i][ind_].item() + top_prob = probs[i].max().item() + top_ind = probs[i].argmax().item() + gap = math.log(top_prob) - math.log(prob_) + suggestion = tokenizer.ids_to_tokens[top_ind] + print_token(tokens[i], suggestion, gap) + avg_gap += gap + avg_gap /= (len(tokens) - 2) + print() + print(avg_gap) + +analyzed_cache = {} + +def analyze_text(text, show_suggestions=False, show_firstk_probs=20): + if text[0] in analyzed_cache: + features, mlm_probs = analyzed_cache[text[0]] + given_mask = "[MASK]" in features[0].tokens + else: + examples = convert_text_to_examples(text) + features = convert_examples_to_features(examples, tokenizer, print_info=False) + given_mask = "[MASK]" in features[0].tokens + if not given_mask: + features = copy_and_mask_features(features) + + input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long) + input_ids = input_ids.to(device) + input_type_ids = input_type_ids.to(device) + + mlm_logits, _ = model(input_ids, input_type_ids) + mlm_probs = F.softmax(mlm_logits, dim=-1) + + if not given_mask: + seq_len, _, vocab_size = mlm_probs.size() + reduced_mlm_probs = torch.Tensor(1, seq_len, vocab_size) + for i in range(seq_len): + reduced_mlm_probs[0, i] = mlm_probs[i, i] + mlm_probs = reduced_mlm_probs + + analyzed_cache[text[0]] = (features, mlm_probs) + + top_pairs = 
show_lm_probs(features[0].tokens, None, mlm_probs[0], firstk=show_firstk_probs) + if not given_mask: + show_abnormals(features[0].tokens, mlm_probs[0], show_suggestions=show_suggestions) + return top_pairs + + +def detect_vocabulary(): + import json + import os + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'WSC_child_problem.json') + with open(path, 'r') as f: + data_l = json.load(f) + f.close() + + print('Detect whether the vocabulary of WSC_child_problem.json in the tokenizer or not ...') + for data in data_l: + for s in data['sentences']: + for a in s['answer0'] + s['answer1']: + a = a.lower() + if a not in tokenizer.vocab: + print(a, 'not in vocab!!!') + print('Done.') + + +def test_by_WSC_child_problem(): + from collections import OrderedDict + import json + import os + import re + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'WSC_child_problem.json') + with open(path, 'r') as f: + data_l = json.load(f) + f.close() + + result = [] + s_order = ['sentence', 'answer1', 'answer0', 'correct_answer', 'adjacent_ref', 'predict_answer', 'score'] + data_order = ['index', 'sentences'] + for data in data_l: + if data['sentences'] != []: + for i in range(len(data['sentences'])): + s = data['sentences'][i] + s['predict_answer'] = [] + res = analyze_text([s['sentence']], show_firstk_probs=-1) + answer = s['answer1'] + s['answer0'] + print(data['index']) + print(res) + for r in res: + if any(a.lower() == r[0] for a in answer): + s['predict_answer'].append(list(r)) + s = OrderedDict(sorted(s.items(), key=lambda i:s_order.index(i[0]))) + data['sentences'][i] = s + print(s['predict_answer']) + data = OrderedDict(sorted(data.items(), key=lambda i:data_order.index(i[0]))) + result.append(data) + print('Save the predict_answer in WSC_child_problem.json') + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'WSC_child_problem.json') + with open(path, 'w') as f: + json.dump(result, f, indent=4, separators=(',', ': '), 
ensure_ascii=False) + f.close() + print('Done.') + + +test_by_WSC_child_problem() +#detect_vocabulary() diff --git a/tests/modeling_test.py b/tests/modeling_test.py index 48d56826f8e914..b5665121397d9b 100644 --- a/tests/modeling_test.py +++ b/tests/modeling_test.py @@ -22,7 +22,10 @@ import torch -from pytorch_pretrained_bert import BertConfig, BertModel +from pytorch_pretrained_bert import (BertConfig, BertModel, BertForMaskedLM, + BertForNextSentencePrediction, BertForPreTraining, + BertForQuestionAnswering, BertForSequenceClassification, + BertForTokenClassification) class BertModelTest(unittest.TestCase): @@ -35,6 +38,7 @@ def __init__(self, is_training=True, use_input_mask=True, use_token_type_ids=True, + use_labels=True, vocab_size=99, hidden_size=32, num_hidden_layers=5, @@ -45,7 +49,9 @@ def __init__(self, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, + type_sequence_label_size=2, initializer_range=0.02, + num_labels=3, scope=None): self.parent = parent self.batch_size = batch_size @@ -53,6 +59,7 @@ def __init__(self, self.is_training = is_training self.use_input_mask = use_input_mask self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -63,10 +70,12 @@ def __init__(self, self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range + self.num_labels = num_labels self.scope = scope - def create_model(self): + def prepare_config_and_inputs(self): input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_mask = None @@ -77,6 +86,12 @@ def create_model(self): if self.use_token_type_ids: token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], 
self.type_vocab_size) + sequence_labels = None + token_labels = None + if self.use_labels: + sequence_labels = BertModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.num_labels) + config = BertConfig( vocab_size_or_config_json_file=self.vocab_size, hidden_size=self.hidden_size, @@ -90,10 +105,16 @@ def create_model(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range) - model = BertModel(config=config) + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + def create_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) outputs = { "sequence_output": all_encoder_layers[-1], "pooled_output": pooled_output, @@ -101,13 +122,119 @@ def create_model(self): } return outputs - def check_output(self, result): + def check_bert_model_output(self, result): + self.parent.assertListEqual( + [size for layer in result["all_encoder_layers"] for size in layer.size()], + [self.batch_size, self.seq_length, self.hidden_size] * self.num_hidden_layers) self.parent.assertListEqual( list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]) - self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size]) + + def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForMaskedLM(config=config) + loss = model(input_ids, token_type_ids, input_mask, token_labels) + prediction_scores = model(input_ids, token_type_ids, input_mask) + outputs = 
{ + "loss": loss, + "prediction_scores": prediction_scores, + } + return outputs + + def check_bert_for_masked_lm_output(self, result): + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + + def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForNextSentencePrediction(config=config) + loss = model(input_ids, token_type_ids, input_mask, sequence_labels) + seq_relationship_score = model(input_ids, token_type_ids, input_mask) + outputs = { + "loss": loss, + "seq_relationship_score": seq_relationship_score, + } + return outputs + + def check_bert_for_next_sequence_prediction_output(self, result): + self.parent.assertListEqual( + list(result["seq_relationship_score"].size()), + [self.batch_size, 2]) + + + def create_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForPreTraining(config=config) + loss = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels) + prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask) + outputs = { + "loss": loss, + "prediction_scores": prediction_scores, + "seq_relationship_score": seq_relationship_score, + } + return outputs + + def check_bert_for_pretraining_output(self, result): + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(result["seq_relationship_score"].size()), + [self.batch_size, 2]) + + + def create_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForQuestionAnswering(config=config) + loss = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + 
outputs = { + "loss": loss, + "start_logits": start_logits, + "end_logits": end_logits, + } + return outputs + + def check_bert_for_question_answering_output(self, result): + self.parent.assertListEqual( + list(result["start_logits"].size()), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].size()), + [self.batch_size, self.seq_length]) + + + def create_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForSequenceClassification(config=config, num_labels=self.num_labels) + loss = model(input_ids, token_type_ids, input_mask, sequence_labels) + logits = model(input_ids, token_type_ids, input_mask) + outputs = { + "loss": loss, + "logits": logits, + } + return outputs + + def check_bert_for_sequence_classification_output(self, result): + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.num_labels]) + + + def create_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels): + model = BertForTokenClassification(config=config, num_labels=self.num_labels) + loss = model(input_ids, token_type_ids, input_mask, token_labels) + logits = model(input_ids, token_type_ids, input_mask) + outputs = { + "loss": loss, + "logits": logits, + } + return outputs + + def check_bert_for_token_classification_output(self, result): + self.parent.assertListEqual( + list(result["logits"].size()), + [self.batch_size, self.seq_length, self.num_labels]) + + def test_default(self): self.run_tester(BertModelTest.BertModelTester(self)) @@ -118,8 +245,33 @@ def test_config_to_json_string(self): self.assertEqual(obj["hidden_size"], 37) def run_tester(self, tester): - output_result = tester.create_model() - tester.check_output(output_result) + config_and_inputs = tester.prepare_config_and_inputs() + output_result = tester.create_bert_model(*config_and_inputs) + 
tester.check_bert_model_output(output_result) + + output_result = tester.create_bert_for_masked_lm(*config_and_inputs) + tester.check_bert_for_masked_lm_output(output_result) + tester.check_loss_output(output_result) + + output_result = tester.create_bert_for_next_sequence_prediction(*config_and_inputs) + tester.check_bert_for_next_sequence_prediction_output(output_result) + tester.check_loss_output(output_result) + + output_result = tester.create_bert_for_pretraining(*config_and_inputs) + tester.check_bert_for_pretraining_output(output_result) + tester.check_loss_output(output_result) + + output_result = tester.create_bert_for_question_answering(*config_and_inputs) + tester.check_bert_for_question_answering_output(output_result) + tester.check_loss_output(output_result) + + output_result = tester.create_bert_for_sequence_classification(*config_and_inputs) + tester.check_bert_for_sequence_classification_output(output_result) + tester.check_loss_output(output_result) + + output_result = tester.create_bert_for_token_classification(*config_and_inputs) + tester.check_bert_for_token_classification_output(output_result) + tester.check_loss_output(output_result) @classmethod def ids_tensor(cls, shape, vocab_size, rng=None, name=None): diff --git a/tests/optimization_test.py b/tests/optimization_test.py index 1c010750ae1f0f..848b9d1cf5c2f1 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -32,10 +32,10 @@ def assertListAlmostEqual(self, list1, list2, tol): def test_adam(self): w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) target = torch.tensor([0.4, 0.2, -0.5]) - criterion = torch.nn.MSELoss(reduction='elementwise_mean') + criterion = torch.nn.MSELoss() # No warmup, constant schedule, no gradient clipping optimizer = BertAdam(params=[w], lr=2e-1, - weight_decay_rate=0.0, + weight_decay=0.0, max_grad_norm=-1) for _ in range(100): loss = criterion(w, target) diff --git a/tests/tokenization_test.py b/tests/tokenization_test.py index 
f541a620e8320b..e1474e938bbcb9 100644 --- a/tests/tokenization_test.py +++ b/tests/tokenization_test.py @@ -44,12 +44,30 @@ def test_full_tokenizer(self): self.assertListEqual( tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + def test_full_tokenizer_raises_error_for_long_sequences(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + vocab_file = vocab_writer.name + + tokenizer = BertTokenizer(vocab_file, max_len=10) + os.remove(vocab_file) + tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time") + indices = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual(indices, [0 for _ in range(10)]) + + tokens = tokenizer.tokenize(u"the cat sat on the mat in the summer time .") + self.assertRaises(ValueError, tokenizer.convert_tokens_to_ids, tokens) + def test_chinese(self): tokenizer = BasicTokenizer() - + self.assertListEqual( tokenizer.tokenize(u"ah\u535A\u63A8zz"), - [u"ah", u"\u535A", u"\u63A8", u"zz"]) + [u"ah", u"\u535A", u"\u63A8", u"zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) diff --git a/train_child.py b/train_child.py new file mode 100644 index 00000000000000..a9d104e1753e55 --- /dev/null +++ b/train_child.py @@ -0,0 +1,177 @@ +import argparse +import os +import json +import itertools +from itertools import product, permutations +from random import sample + +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler + +from pytorch_pretrained_bert.tokenization import BertTokenizer +from pytorch_pretrained_bert.modeling import BertForPreTraining, BertForMaskedLM, BertConfig +from pytorch_pretrained_bert.optimization import BertAdam +from run_child_finetuning import * +#from child_frames import frames +#from child_wsc_generator import make_sentences 
from child_generator import make_sentences

# Fine-tunes a pretrained BERT masked-language model on generated CHILD
# sentences and evaluates it on the train and dev splits after every epoch.

BERT_DIR = '/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('/nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt')


# ---------------------------------------------------------------------------
# Command-line options
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()

parser.add_argument("--max_seq_length",
                    default=128,
                    type=int,
                    help="The maximum total input sequence length after WordPiece tokenization. \n"
                         "Sequences longer than this will be truncated, and sequences shorter \n"
                         "than this will be padded.")
parser.add_argument("--do_train",
                    action='store_true',
                    help="Whether to run training.")
parser.add_argument("--do_eval",
                    action='store_true',
                    help="Whether to run eval on the dev set.")
parser.add_argument("--train_batch_size",
                    default=32,
                    type=int,
                    help="Total batch size for training.")
parser.add_argument("--eval_batch_size",
                    default=32,
                    type=int,
                    help="Total batch size for eval.")
parser.add_argument("--learning_rate",
                    default=3e-5,
                    type=float,
                    help="The initial learning rate for Adam.")
parser.add_argument("--num_train_epochs",
                    default=3.0,
                    type=float,
                    help="Total number of training epochs to perform.")
parser.add_argument("--warmup_proportion",
                    default=0.1,
                    type=float,
                    help="Proportion of training to perform linear learning rate warmup for. "
                         "E.g., 0.1 = 10%% of training.")
parser.add_argument("--no_cuda",
                    action='store_true',
                    help="Whether not to use CUDA when available")
parser.add_argument("--do_lower_case",
                    action='store_true',
                    help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument('--seed',
                    type=int,
                    default=42,
                    help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
                    type=int,
                    default=1,
                    help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--dev_percent",
                    default=0.5,
                    type=float)

args = parser.parse_args()
# Experiment-specific overrides: these values always win over whatever was
# passed on the command line.
args.do_lower_case = True
args.do_train = True
args.do_eval = True
args.eval_batch_size = 128
print(args)

# Reject values that would otherwise surface later as a ZeroDivisionError or
# as a zero/negative per-step batch size.
if args.gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
        args.gradient_accumulation_steps))

# ---------------------------------------------------------------------------
# Device and random seeds
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
# Only count GPUs when we are actually going to use them; otherwise --no_cuda
# on a multi-GPU host would still wrap the CPU model in DataParallel below.
n_gpu = torch.cuda.device_count() if device.type == "cuda" else 0
logger.info("device: %s n_gpu: %d", device, n_gpu)

# Seed before the data is generated and split, so that any randomness used
# there (the data pipeline is not visible from this script) is reproducible
# together with the training run.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
sentences = make_sentences(maybe=False, structured=False)
logger.info('num_sent = %d', len(sentences))
child_dataset = CHILDDataset(tokenizer, sentences, dev_percent=args.dev_percent)
train_features = child_dataset.get_train_features()
logger.info('num_train_examples = %d', len(train_features))
# Number of optimizer updates over the whole run. This must be computed with
# the *total* batch size, i.e. before it is divided by the accumulation steps.
num_train_steps = int(
    len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
logger.info('num_train_steps = %d', num_train_steps)
eval_features = child_dataset.get_dev_features()

train_dataset = child_dataset.build_dataset(train_features)
eval_dataset = child_dataset.build_dataset(eval_features)

# From here on train_batch_size is the size of one forward/backward pass.
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model = BertForMaskedLM.from_pretrained(BERT_DIR)
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# ---------------------------------------------------------------------------
# Optimizer: no weight decay on biases and LayerNorm parameters
# ---------------------------------------------------------------------------
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_steps)

# NOTE: the epoch-0 baseline evaluation is currently disabled (no validate()
# call is made here); only the log lines remain.
logger.info("Epoch 0")
logger.info("Evaluating on train set...")
logger.info("Evaluating on valid set...")

# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
global_step = 0
for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch_idx in enumerate(get_batch_index(len(train_dataset), args.train_batch_size, randomized=True)):
        batch = tuple(t[batch_idx].to(device) for t in train_dataset.tensors)
        input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
        loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        if args.gradient_accumulation_steps > 1:
            # Scale so the accumulated gradient matches one full-size batch.
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if global_step % 1000 == 0:
                # Reporting only. BertAdam was constructed with `warmup` and
                # `t_total`, so it applies the warmup/decay schedule itself;
                # writing this value back into param_group['lr'] (as this
                # script used to) made the schedule apply twice.
                # NOTE(review): confirm against the BertAdam version used by
                # this fork.
                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion)
                print('global_step %d, lr = %f' % (global_step, lr_this_step))
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

    if args.do_eval:
        logger.info("Epoch %d", epoch + 1)
        logger.info("Evaluating on train set...")
        validate(model, train_dataset, device)
        logger.info("Evaluating on valid set...")
        validate(model, eval_dataset, device)

# NOTE(review): the text below is not part of this script. It is the patch
# header and the opening log lines of the next file in this dump, which shared
# a physical line with the end of the script. It is kept verbatim (commented
# out) so that nothing is lost.
# diff --git a/train_child_wh+yesno_maybe0_structured0_devpercent.1_noiselen30_bert.out b/train_child_wh+yesno_maybe0_structured0_devpercent.1_noiselen30_bert.out new file mode 100644 index 00000000000000..d883d97aef83c6 --- /dev/null +++ b/train_child_wh+yesno_maybe0_structured0_devpercent.1_noiselen30_bert.out @@ -0,0 +1,220 @@
# +06/09/2019 19:28:36 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt
# +06/09/2019 19:28:36 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt
# +06/09/2019 19:28:36 - INFO - run_child_finetuning - num_sent = 46080
# +06/09/2019 19:29:02 - INFO - run_child_finetuning - num_train_steps = 7776
# +06/09/2019 19:29:03 - INFO - run_child_finetuning - device: cuda n_gpu: 1
# +06/09/2019 19:29:03 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased
# +06/09/2019 19:29:03 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob":
0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 19:29:06 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 19:29:08 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 19:29:08 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 19:30:06 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:30:06 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:30:06 - INFO - run_child_finetuning - num_sent = 46080 +06/09/2019 19:30:32 - INFO - run_child_finetuning - num_train_steps = 7776 +06/09/2019 19:30:37 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 19:30:37 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 19:30:37 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 19:30:40 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 19:30:45 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 19:30:45 - INFO - run_child_finetuning - Evaluating on train 
set... +06/09/2019 19:30:45 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/6 [00:00 + validate(model, train_dataset, device) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 486, in validate + tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 761, in forward + prediction_scores = self.cls(sequence_output) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 393, in forward + prediction_scores = self.predictions(sequence_output) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 383, in forward + hidden_states = self.decoder(hidden_states) + self.bias +RuntimeError: CUDA error: out of memory + +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 
+Namespace(dev_percent=0.1, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 46080 -> 46080 +global_step 0, lr = 0.000000 +global_step 1000, lr = 0.000026 +global_step 2000, lr = 0.000022 +06/09/2019 20:21:50 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 20:21:50 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 20:21:50 - INFO - run_child_finetuning - num_sent = 46080 +06/09/2019 20:22:16 - INFO - run_child_finetuning - num_train_steps = 7776 +06/09/2019 20:22:21 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 20:22:21 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 20:22:21 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 20:22:24 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 20:22:26 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 20:22:26 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 20:22:26 - INFO - run_child_finetuning - Evaluating on valid set... 
+ Epoch: 0%| | 0/6 [00:00 + validate(model, train_dataset, device) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 486, in validate + tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 761, in forward + prediction_scores = self.cls(sequence_output) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 393, in forward + prediction_scores = self.predictions(sequence_output) + File "/home/qsj/miniconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 477, in __call__ + result = self.forward(*input, **kwargs) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/modeling.py", line 383, in forward + hidden_states = self.decoder(hidden_states) + self.bias +RuntimeError: CUDA error: out of memory + +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 
+Namespace(dev_percent=0.1, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 46080 -> 46080 +global_step 0, lr = 0.000000 +global_step 1000, lr = 0.000026 +global_step 2000, lr = 0.000022 +06/09/2019 21:10:16 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 21:10:16 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 21:10:16 - INFO - run_child_finetuning - num_sent = 46080 +06/09/2019 21:10:38 - INFO - run_child_finetuning - num_train_steps = 7776 +06/09/2019 21:10:43 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 21:10:43 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 21:10:43 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 21:10:46 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 21:10:48 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 21:10:48 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 21:10:48 - INFO - run_child_finetuning - Evaluating on valid set... 
+ Epoch: 0%| | 0/6 [00:00 + child_dataset = CHILDDataset(tokenizer, sentences, dev_percent=args.dev_percent) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 68, in __init__ + t1, t2, is_next_label = self.split_sent(line) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 121, in split_sent + assert self.one_sent +AssertionError +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.05, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 92160 -> 92160 +06/09/2019 19:12:36 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:12:36 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:12:36 - INFO - run_child_finetuning - num_sent = 92160 +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 86, in + child_dataset = CHILDDataset(tokenizer, sentences, 
dev_percent=args.dev_percent) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 71, in __init__ + tokens_b = self.tokenizer.tokenize(t2) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/tokenization.py", line 94, in tokenize + for token in self.basic_tokenizer.tokenize(text): + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/tokenization.py", line 174, in tokenize + text = self._clean_text(text) + File "/home/xd/projects/pytorch-pretrained-BERT/pytorch_pretrained_bert/tokenization.py", line 264, in _clean_text + for char in text: +TypeError: 'NoneType' object is not iterable +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 
+Namespace(dev_percent=0.05, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 92160 -> 92159 +06/09/2019 19:21:08 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:21:08 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:21:09 - INFO - run_child_finetuning - num_sent = 92160 +06/09/2019 19:22:02 - INFO - run_child_finetuning - num_train_steps = 16416 +06/09/2019 19:22:03 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 19:22:03 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 19:22:03 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 19:22:07 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 19:22:09 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 19:22:09 - INFO - run_child_finetuning - Evaluating on train set... 
+06/09/2019 19:25:45 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:25:45 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 19:25:46 - INFO - run_child_finetuning - num_sent = 46080 +06/09/2019 19:26:12 - INFO - run_child_finetuning - num_train_steps = 8208 +06/09/2019 19:26:13 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 19:26:13 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 19:26:13 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 19:26:16 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 19:26:18 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 19:26:18 - INFO - run_child_finetuning - Evaluating on train set... 
diff --git a/train_child_wh+yesno_maybe1_structured0_devpercent.1_bert.out b/train_child_wh+yesno_maybe1_structured0_devpercent.1_bert.out new file mode 100644 index 00000000000000..ce94b8158c2bf4 --- /dev/null +++ b/train_child_wh+yesno_maybe1_structured0_devpercent.1_bert.out @@ -0,0 +1,90 @@ +06/09/2019 18:18:33 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 18:18:33 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 18:19:03 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 18:19:03 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 18:19:03 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 18:19:06 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 18:19:08 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 18:19:08 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 18:19:08 - INFO - run_child_finetuning - Evaluating on valid set... 
+ Epoch: 0%| | 0/6 [00:00 3840 +num_train_steps = 6000 +global_step 0, lr = 0.000000 +global_step 1000, lr = 0.000417 +global_step 2000, lr = 0.000333 +global_step 3000, lr = 0.000250 +global_step 4000, lr = 0.000167 +global_step 5000, lr = 0.000083 diff --git a/train_child_whonly_maybe0_bert.out b/train_child_whonly_maybe0_bert.out new file mode 100644 index 00000000000000..23245ed9774704 --- /dev/null +++ b/train_child_whonly_maybe0_bert.out @@ -0,0 +1,60 @@ +06/09/2019 16:43:11 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 16:43:11 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 16:43:13 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 16:43:13 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 16:43:13 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 16:43:16 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 16:43:18 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 16:43:18 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 16:43:18 - INFO - run_child_finetuning - Evaluating on valid set... 
+ Epoch: 0%| | 0/3 [00:00 3840 +num_train_steps = 180 +global_step 0, lr = 0.000000 diff --git a/train_child_whonly_maybe1.out b/train_child_whonly_maybe1.out new file mode 100644 index 00000000000000..1ca64917d28b87 --- /dev/null +++ b/train_child_whonly_maybe1.out @@ -0,0 +1,927 @@ +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +06/09/2019 15:39:53 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 15:39:53 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +Namespace(dev_percent=0.5, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=0.0005, max_seq_length=128, no_cuda=False, num_train_epochs=100, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 7680 -> 5760 +num_train_steps = 12000 +06/09/2019 15:39:59 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 15:40:05 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 15:40:05 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 15:40:05 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/100 [00:00 5760 +num_train_steps = 12000 +06/09/2019 16:27:29 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 16:27:34 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 16:27:34 - INFO - run_child_finetuning - Evaluating on train set... 
+06/09/2019 16:27:34 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/100 [00:00 5760 +num_train_steps = 360 +global_step 0, lr = 0.000000 diff --git a/train_child_whonly_maybeonly.out b/train_child_whonly_maybeonly.out new file mode 100644 index 00000000000000..87a640ee24b5c9 --- /dev/null +++ b/train_child_whonly_maybeonly.out @@ -0,0 +1,506 @@ +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +06/09/2019 16:16:25 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 16:16:25 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +Namespace(dev_percent=0.5, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=0.0005, max_seq_length=128, no_cuda=False, num_train_epochs=100, seed=42, train_batch_size=32, warmup_proportion=0.1) +num_sent = 3840 -> 1920 +num_train_steps = 6000 +06/09/2019 16:16:33 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 16:16:36 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 16:16:36 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 16:16:36 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/100 [00:00 + self.all_lines = [] +NameError: name 'self' is not defined +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. 
+Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +06/09/2019 22:16:32 - ERROR - pytorch_pretrained_bert.tokenization - Model name '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese). We assumed '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt' was a path or url but couldn't find any file associated to this path or url. +06/09/2019 22:16:32 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 88, in + sentences += make_sentences(**frame) + File "/home/xd/projects/pytorch-pretrained-BERT/child_generator.py", line 53, in make_sentences + assert entities[0].lower() in tokenizer.vocab , entities[0] +AttributeError: 'NoneType' object has no attribute 'vocab' +Warning: apex was installed without --cpp_ext. 
Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +06/09/2019 22:17:49 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:17:49 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:17:49 - INFO - run_child_finetuning - num_sent = 120 +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 
+Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +[[("John couldn't see the stage behind Susan ||| because [John] is short.", "John couldn't catch sight of the stage behind Susan ||| because [John] is short."), ("John couldn't see the stage behind Susan ||| because [Susan] is tall.", "John couldn't catch sight of the stage behind Susan ||| because [Susan] is tall."), ("John couldn't see the stage behind Susan ||| because [John] isn't tall.", "John couldn't catch sight of the stage behind Susan ||| because [John] isn't tall."), ("John couldn't see the stage behind Susan ||| because [Susan] isn't short.", "John couldn't catch sight of the stage behind Susan ||| because [Susan] isn't short."), ('Susan blocked the view of John ||| because [John] is short.', 'Susan obstructed the sight of John ||| because [John] is short.'), ('Susan blocked the view of John ||| because [Susan] is tall.', 'Susan obstructed the sight of John ||| because [Susan] is tall.'), ("Susan blocked the view of John ||| because [John] isn't tall.", "Susan obstructed the sight of John ||| because [John] isn't tall."), ("Susan blocked the view of John ||| because [Susan] isn't short.", "Susan obstructed the sight of John ||| because [Susan] isn't short."), ("John could see the stage behind Susan ||| because [John] isn't short.", "John could catch sight of the stage behind Susan ||| because [John] isn't short."), ("John could see the stage behind Susan ||| because [Susan] isn't tall.", "John could catch sight of the stage behind Susan ||| because [Susan] isn't tall."), ('John could see the stage behind Susan ||| because [John] is tall.', 'John could catch sight of the stage behind Susan ||| because [John] is tall.'), ('John could see the stage behind Susan ||| because [Susan] is short.', 'John 
could catch sight of the stage behind Susan ||| because [Susan] is short.'), ("Susan didn't block the view of John ||| because [John] isn't short.", "Susan didn't obstruct the sight of John ||| because [John] isn't short."), ("Susan didn't block the view of John ||| because [Susan] isn't tall.", "Susan didn't obstruct the sight of John ||| because [Susan] isn't tall."), ("Susan didn't block the view of John ||| because [John] is tall.", "Susan didn't obstruct the sight of John ||| because [John] is tall."), ("Susan didn't block the view of John ||| because [Susan] is short.", "Susan didn't obstruct the sight of John ||| because [Susan] is short."), ("John couldn't see the stage behind Susan ||| although [John] isn't short.", "John couldn't catch sight of the stage behind Susan ||| although [John] isn't short."), ("John couldn't see the stage behind Susan ||| although [Susan] isn't tall.", "John couldn't catch sight of the stage behind Susan ||| although [Susan] isn't tall."), ("John couldn't see the stage behind Susan ||| although [John] is tall.", "John couldn't catch sight of the stage behind Susan ||| although [John] is tall."), ("John couldn't see the stage behind Susan ||| although [Susan] is short.", "John couldn't catch sight of the stage behind Susan ||| although [Susan] is short."), ("Susan blocked the view of John ||| although [John] isn't short.", "Susan obstructed the sight of John ||| although [John] isn't short."), ("Susan blocked the view of John ||| although [Susan] isn't tall.", "Susan obstructed the sight of John ||| although [Susan] isn't tall."), ('Susan blocked the view of John ||| although [John] is tall.', 'Susan obstructed the sight of John ||| although [John] is tall.'), ('Susan blocked the view of John ||| although [Susan] is short.', 'Susan obstructed the sight of John ||| although [Susan] is short.'), ('John could see the stage behind Susan ||| although [John] is short.', 'John could catch sight of the stage behind Susan ||| although 
[John] is short.'), ('John could see the stage behind Susan ||| although [Susan] is tall.', 'John could catch sight of the stage behind Susan ||| although [Susan] is tall.'), ("John could see the stage behind Susan ||| although [John] isn't tall.", "John could catch sight of the stage behind Susan ||| although [John] isn't tall."), ("John could see the stage behind Susan ||| although [Susan] isn't short.", "John could catch sight of the stage behind Susan ||| although [Susan] isn't short."), ("Susan didn't block the view of John ||| although [John] is short.", "Susan didn't obstruct the sight of John ||| although [John] is short."), ("Susan didn't block the view of John ||| although [Susan] is tall.", "Susan didn't obstruct the sight of John ||| although [Susan] is tall."), ("Susan didn't block the view of John ||| although [John] isn't tall.", "Susan didn't obstruct the sight of John ||| although [John] isn't tall."), ("Susan didn't block the view of John ||| although [Susan] isn't short.", "Susan didn't obstruct the sight of John ||| although [Susan] isn't short.")], [('the newspapers could be placed on all the chairs ||| because there were many of the [newspapers].', 'the newspapers could be put on all the chairs ||| because there were many of the [newspapers].'), ('the newspapers could be placed on all the chairs ||| because there were few of the [chairs].', 'the newspapers could be put on all the chairs ||| because there were few of the [chairs].'), ('the newspapers could be placed on all the chairs ||| because there were not few of the [newspapers].', 'the newspapers could be put on all the chairs ||| because there were not few of the [newspapers].'), ('the newspapers could be placed on all the chairs ||| because there were not many of the [chairs].', 'the newspapers could be put on all the chairs ||| because there were not many of the [chairs].'), ('the chairs could all be covered by the newspapers ||| because there were many of the [newspapers].', 'the 
chairs could carry all the newspapers ||| because there were many of the [newspapers].'), ('the chairs could all be covered by the newspapers ||| because there were few of the [chairs].', 'the chairs could carry all the newspapers ||| because there were few of the [chairs].'), ('the chairs could all be covered by the newspapers ||| because there were not few of the [newspapers].', 'the chairs could carry all the newspapers ||| because there were not few of the [newspapers].'), ('the chairs could all be covered by the newspapers ||| because there were not many of the [chairs].', 'the chairs could carry all the newspapers ||| because there were not many of the [chairs].'), ("the newspapers couldn't be placed on all the chairs ||| because there were not many of the [newspapers].", "the newspapers couldn't be put on all the chairs ||| because there were not many of the [newspapers]."), ("the newspapers couldn't be placed on all the chairs ||| because there were not few of the [chairs].", "the newspapers couldn't be put on all the chairs ||| because there were not few of the [chairs]."), ("the newspapers couldn't be placed on all the chairs ||| because there were few of the [newspapers].", "the newspapers couldn't be put on all the chairs ||| because there were few of the [newspapers]."), ("the newspapers couldn't be placed on all the chairs ||| because there were many of the [chairs].", "the newspapers couldn't be put on all the chairs ||| because there were many of the [chairs]."), ("the chairs couldn't all be covered by the newspapers ||| because there were not many of the [newspapers].", "the chairs couldn't carry all the newspapers ||| because there were not many of the [newspapers]."), ("the chairs couldn't all be covered by the newspapers ||| because there were not few of the [chairs].", "the chairs couldn't carry all the newspapers ||| because there were not few of the [chairs]."), ("the chairs couldn't all be covered by the newspapers ||| because there were few 
of the [newspapers].", "the chairs couldn't carry all the newspapers ||| because there were few of the [newspapers]."), ("the chairs couldn't all be covered by the newspapers ||| because there were many of the [chairs].", "the chairs couldn't carry all the newspapers ||| because there were many of the [chairs]."), ('the newspapers could be placed on all the chairs ||| although there were not many of the [newspapers].', 'the newspapers could be put on all the chairs ||| although there were not many of the [newspapers].'), ('the newspapers could be placed on all the chairs ||| although there were not few of the [chairs].', 'the newspapers could be put on all the chairs ||| although there were not few of the [chairs].'), ('the newspapers could be placed on all the chairs ||| although there were few of the [newspapers].', 'the newspapers could be put on all the chairs ||| although there were few of the [newspapers].'), ('the newspapers could be placed on all the chairs ||| although there were many of the [chairs].', 'the newspapers could be put on all the chairs ||| although there were many of the [chairs].'), ('the chairs could all be covered by the newspapers ||| although there were not many of the [newspapers].', 'the chairs could carry all the newspapers ||| although there were not many of the [newspapers].'), ('the chairs could all be covered by the newspapers ||| although there were not few of the [chairs].', 'the chairs could carry all the newspapers ||| although there were not few of the [chairs].'), ('the chairs could all be covered by the newspapers ||| although there were few of the [newspapers].', 'the chairs could carry all the newspapers ||| although there were few of the [newspapers].'), ('the chairs could all be covered by the newspapers ||| although there were many of the [chairs].', 'the chairs could carry all the newspapers ||| although there were many of the [chairs].'), ("the newspapers couldn't be placed on all the chairs ||| although there were 
many of the [newspapers].", "the newspapers couldn't be put on all the chairs ||| although there were many of the [newspapers]."), ("the newspapers couldn't be placed on all the chairs ||| although there were few of the [chairs].", "the newspapers couldn't be put on all the chairs ||| although there were few of the [chairs]."), ("the newspapers couldn't be placed on all the chairs ||| although there were not few of the [newspapers].", "the newspapers couldn't be put on all the chairs ||| although there were not few of the [newspapers]."), ("the newspapers couldn't be placed on all the chairs ||| although there were not many of the [chairs].", "the newspapers couldn't be put on all the chairs ||| although there were not many of the [chairs]."), ("the chairs couldn't all be covered by the newspapers ||| although there were many of the [newspapers].", "the chairs couldn't carry all the newspapers ||| although there were many of the [newspapers]."), ("the chairs couldn't all be covered by the newspapers ||| although there were few of the [chairs].", "the chairs couldn't carry all the newspapers ||| although there were few of the [chairs]."), ("the chairs couldn't all be covered by the newspapers ||| although there were not few of the [newspapers].", "the chairs couldn't carry all the newspapers ||| although there were not few of the [newspapers]."), ("the chairs couldn't all be covered by the newspapers ||| although there were not many of the [chairs].", "the chairs couldn't carry all the newspapers ||| although there were not many of the [chairs].")], ["Anna did better than Andy on the test ||| although [Anna] hadn't studied hard.", "Anna did better than Andy on the test ||| although [Andy] wasn't lazy in doing homework.", 'Anna did better than Andy on the test ||| although [Anna] was lazy in doing homework.', 'Anna did better than Andy on the test ||| although [Andy] had studied hard.', "Andy did worse than Anna on the test ||| although [Anna] hadn't studied hard.", 
"Andy did worse than Anna on the test ||| although [Andy] wasn't lazy in doing homework.", 'Andy did worse than Anna on the test ||| although [Anna] was lazy in doing homework.', 'Andy did worse than Anna on the test ||| although [Andy] had studied hard.', "Anna didn't do better than Andy on the test ||| although [Anna] had studied hard.", "Anna didn't do better than Andy on the test ||| although [Andy] was lazy in doing homework.", "Anna didn't do better than Andy on the test ||| although [Anna] wasn't lazy in doing homework.", "Anna didn't do better than Andy on the test ||| although [Andy] hadn't studied hard.", "Andy didn't do worse than Anna on the test ||| although [Anna] had studied hard.", "Andy didn't do worse than Anna on the test ||| although [Andy] was lazy in doing homework.", "Andy didn't do worse than Anna on the test ||| although [Anna] wasn't lazy in doing homework.", "Andy didn't do worse than Anna on the test ||| although [Andy] hadn't studied hard."], ['Bill passed the half-empty plate to Amy ||| because [Bill] was full.', 'Bill passed the half-empty plate to Amy ||| because [Amy] was hungry.', "Bill passed the half-empty plate to Amy ||| because [Bill] wasn't hungry.", "Bill passed the half-empty plate to Amy ||| because [Amy] wasn't full.", 'Amy received the half-empty plate from Bill ||| because [Bill] was full.', 'Amy received the half-empty plate from Bill ||| because [Amy] was hungry.', "Amy received the half-empty plate from Bill ||| because [Bill] wasn't hungry.", "Amy received the half-empty plate from Bill ||| because [Amy] wasn't full.", "Bill didn't pass the half-empty plate to Amy ||| because [Bill] wasn't full.", "Bill didn't pass the half-empty plate to Amy ||| because [Amy] wasn't hungry.", "Bill didn't pass the half-empty plate to Amy ||| because [Bill] was hungry.", "Bill didn't pass the half-empty plate to Amy ||| because [Amy] was full.", "Amy didn't received the half-empty plate from Bill ||| because [Bill] wasn't full.", 
"Amy didn't received the half-empty plate from Bill ||| because [Amy] wasn't hungry.", "Amy didn't received the half-empty plate from Bill ||| because [Bill] was hungry.", "Amy didn't received the half-empty plate from Bill ||| because [Amy] was full."], ['Running at about the same speed, Tom beat Sue in the running race ||| because [Tom] had a good start.', 'Running at about the same speed, Tom beat Sue in the running race ||| because [Sue] had a bad start.', "Running at about the same speed, Tom beat Sue in the running race ||| because [Tom] didn't have a bad start.", "Running at about the same speed, Tom beat Sue in the running race ||| because [Sue] didn't have a good start.", 'Running at about the same speed, Sue lost to Tom in the running race ||| because [Tom] had a good start.', 'Running at about the same speed, Sue lost to Tom in the running race ||| because [Sue] had a bad start.', "Running at about the same speed, Sue lost to Tom in the running race ||| because [Tom] didn't have a bad start.", "Running at about the same speed, Sue lost to Tom in the running race ||| because [Sue] didn't have a good start.", "Running at about the same speed, Tom didn't beat Sue in the running race ||| because [Tom] didn't have a good start.", "Running at about the same speed, Tom didn't beat Sue in the running race ||| because [Sue] didn't have a bad start.", "Running at about the same speed, Tom didn't beat Sue in the running race ||| because [Tom] had a bad start.", "Running at about the same speed, Tom didn't beat Sue in the running race ||| because [Sue] had a good start.", "Running at about the same speed, Sue didn't lose to Tom in the running race ||| because [Tom] didn't have a good start.", "Running at about the same speed, Sue didn't lose to Tom in the running race ||| because [Sue] didn't have a bad start.", "Running at about the same speed, Sue didn't lose to Tom in the running race ||| because [Tom] had a bad start.", "Running at about the same speed, Sue 
didn't lose to Tom in the running race ||| because [Sue] had a good start."], [['Charles threw the schoolbag down to Emma ||| after [Charles] reached the top of the stairs.', 'Charles threw the schoolbag down to Linda ||| after [Charles] reached the top of the stairs.', 'Paul threw the schoolbag down to Emma ||| after [Paul] reached the top of the stairs.', 'Paul threw the schoolbag down to Linda ||| after [Paul] reached the top of the stairs.', 'Charles cast the schoolbag down to Emma ||| after [Charles] reached the top of the stairs.', 'Charles cast the schoolbag down to Linda ||| after [Charles] reached the top of the stairs.', 'Paul cast the schoolbag down to Emma ||| after [Paul] reached the top of the stairs.', 'Paul cast the schoolbag down to Linda ||| after [Paul] reached the top of the stairs.'], ['Charles threw the schoolbag down to Emma ||| after [Emma] reached the bottom of the stairs.', 'Charles threw the schoolbag down to Linda ||| after [Linda] reached the bottom of the stairs.', 'Paul threw the schoolbag down to Emma ||| after [Emma] reached the bottom of the stairs.', 'Paul threw the schoolbag down to Linda ||| after [Linda] reached the bottom of the stairs.', 'Charles cast the schoolbag down to Emma ||| after [Emma] reached the bottom of the stairs.', 'Charles cast the schoolbag down to Linda ||| after [Linda] reached the bottom of the stairs.', 'Paul cast the schoolbag down to Emma ||| after [Emma] reached the bottom of the stairs.', 'Paul cast the schoolbag down to Linda ||| after [Linda] reached the bottom of the stairs.'], ['Emma caught the schoolbag thrown down by Charles ||| after [Charles] reached the top of the stairs.', 'Linda caught the schoolbag thrown down by Charles ||| after [Charles] reached the top of the stairs.', 'Emma caught the schoolbag thrown down by Paul ||| after [Paul] reached the top of the stairs.', 'Linda caught the schoolbag thrown down by Paul ||| after [Paul] reached the top of the stairs.', 'Emma took the schoolbag 
thrown down by Charles ||| after [Charles] reached the top of the stairs.', 'Linda took the schoolbag thrown down by Charles ||| after [Charles] reached the top of the stairs.', 'Emma took the schoolbag thrown down by Paul ||| after [Paul] reached the top of the stairs.', 'Linda took the schoolbag thrown down by Paul ||| after [Paul] reached the top of the stairs.'], ['Emma caught the schoolbag thrown down by Charles ||| after [Emma] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown down by Charles ||| after [Linda] reached the bottom of the stairs.', 'Emma caught the schoolbag thrown down by Paul ||| after [Emma] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown down by Paul ||| after [Linda] reached the bottom of the stairs.', 'Emma took the schoolbag thrown down by Charles ||| after [Emma] reached the bottom of the stairs.', 'Linda took the schoolbag thrown down by Charles ||| after [Linda] reached the bottom of the stairs.', 'Emma took the schoolbag thrown down by Paul ||| after [Emma] reached the bottom of the stairs.', 'Linda took the schoolbag thrown down by Paul ||| after [Linda] reached the bottom of the stairs.'], ['Charles threw the schoolbag up to Emma ||| after [Charles] reached the bottom of the stairs.', 'Charles threw the schoolbag up to Linda ||| after [Charles] reached the bottom of the stairs.', 'Paul threw the schoolbag up to Emma ||| after [Paul] reached the bottom of the stairs.', 'Paul threw the schoolbag up to Linda ||| after [Paul] reached the bottom of the stairs.', 'Charles cast the schoolbag up to Emma ||| after [Charles] reached the bottom of the stairs.', 'Charles cast the schoolbag up to Linda ||| after [Charles] reached the bottom of the stairs.', 'Paul cast the schoolbag up to Emma ||| after [Paul] reached the bottom of the stairs.', 'Paul cast the schoolbag up to Linda ||| after [Paul] reached the bottom of the stairs.'], ['Charles threw the schoolbag up to Emma ||| after [Emma] reached the 
top of the stairs.', 'Charles threw the schoolbag up to Linda ||| after [Linda] reached the top of the stairs.', 'Paul threw the schoolbag up to Emma ||| after [Emma] reached the top of the stairs.', 'Paul threw the schoolbag up to Linda ||| after [Linda] reached the top of the stairs.', 'Charles cast the schoolbag up to Emma ||| after [Emma] reached the top of the stairs.', 'Charles cast the schoolbag up to Linda ||| after [Linda] reached the top of the stairs.', 'Paul cast the schoolbag up to Emma ||| after [Emma] reached the top of the stairs.', 'Paul cast the schoolbag up to Linda ||| after [Linda] reached the top of the stairs.'], ['Emma caught the schoolbag thrown up by Charles ||| after [Charles] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown up by Charles ||| after [Charles] reached the bottom of the stairs.', 'Emma caught the schoolbag thrown up by Paul ||| after [Paul] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown up by Paul ||| after [Paul] reached the bottom of the stairs.', 'Emma took the schoolbag thrown up by Charles ||| after [Charles] reached the bottom of the stairs.', 'Linda took the schoolbag thrown up by Charles ||| after [Charles] reached the bottom of the stairs.', 'Emma took the schoolbag thrown up by Paul ||| after [Paul] reached the bottom of the stairs.', 'Linda took the schoolbag thrown up by Paul ||| after [Paul] reached the bottom of the stairs.'], ['Emma caught the schoolbag thrown up by Charles ||| after [Emma] reached the top of the stairs.', 'Linda caught the schoolbag thrown up by Charles ||| after [Linda] reached the top of the stairs.', 'Emma caught the schoolbag thrown up by Paul ||| after [Emma] reached the top of the stairs.', 'Linda caught the schoolbag thrown up by Paul ||| after [Linda] reached the top of the stairs.', 'Emma took the schoolbag thrown up by Charles ||| after [Emma] reached the top of the stairs.', 'Linda took the schoolbag thrown up by Charles ||| after 
[Linda] reached the top of the stairs.', 'Emma took the schoolbag thrown up by Paul ||| after [Emma] reached the top of the stairs.', 'Linda took the schoolbag thrown up by Paul ||| after [Linda] reached the top of the stairs.'], ['Charles threw the schoolbag down to Emma ||| before [Charles] reached the bottom of the stairs.', 'Charles threw the schoolbag down to Linda ||| before [Charles] reached the bottom of the stairs.', 'Paul threw the schoolbag down to Emma ||| before [Paul] reached the bottom of the stairs.', 'Paul threw the schoolbag down to Linda ||| before [Paul] reached the bottom of the stairs.', 'Charles cast the schoolbag down to Emma ||| before [Charles] reached the bottom of the stairs.', 'Charles cast the schoolbag down to Linda ||| before [Charles] reached the bottom of the stairs.', 'Paul cast the schoolbag down to Emma ||| before [Paul] reached the bottom of the stairs.', 'Paul cast the schoolbag down to Linda ||| before [Paul] reached the bottom of the stairs.'], ['Charles threw the schoolbag down to Emma ||| before [Emma] reached the top of the stairs.', 'Charles threw the schoolbag down to Linda ||| before [Linda] reached the top of the stairs.', 'Paul threw the schoolbag down to Emma ||| before [Emma] reached the top of the stairs.', 'Paul threw the schoolbag down to Linda ||| before [Linda] reached the top of the stairs.', 'Charles cast the schoolbag down to Emma ||| before [Emma] reached the top of the stairs.', 'Charles cast the schoolbag down to Linda ||| before [Linda] reached the top of the stairs.', 'Paul cast the schoolbag down to Emma ||| before [Emma] reached the top of the stairs.', 'Paul cast the schoolbag down to Linda ||| before [Linda] reached the top of the stairs.'], ['Emma caught the schoolbag thrown down by Charles ||| before [Charles] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown down by Charles ||| before [Charles] reached the bottom of the stairs.', 'Emma caught the schoolbag thrown down by 
Paul ||| before [Paul] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown down by Paul ||| before [Paul] reached the bottom of the stairs.', 'Emma took the schoolbag thrown down by Charles ||| before [Charles] reached the bottom of the stairs.', 'Linda took the schoolbag thrown down by Charles ||| before [Charles] reached the bottom of the stairs.', 'Emma took the schoolbag thrown down by Paul ||| before [Paul] reached the bottom of the stairs.', 'Linda took the schoolbag thrown down by Paul ||| before [Paul] reached the bottom of the stairs.'], ['Emma caught the schoolbag thrown down by Charles ||| before [Emma] reached the top of the stairs.', 'Linda caught the schoolbag thrown down by Charles ||| before [Linda] reached the top of the stairs.', 'Emma caught the schoolbag thrown down by Paul ||| before [Emma] reached the top of the stairs.', 'Linda caught the schoolbag thrown down by Paul ||| before [Linda] reached the top of the stairs.', 'Emma took the schoolbag thrown down by Charles ||| before [Emma] reached the top of the stairs.', 'Linda took the schoolbag thrown down by Charles ||| before [Linda] reached the top of the stairs.', 'Emma took the schoolbag thrown down by Paul ||| before [Emma] reached the top of the stairs.', 'Linda took the schoolbag thrown down by Paul ||| before [Linda] reached the top of the stairs.'], ['Charles threw the schoolbag up to Emma ||| before [Charles] reached the top of the stairs.', 'Charles threw the schoolbag up to Linda ||| before [Charles] reached the top of the stairs.', 'Paul threw the schoolbag up to Emma ||| before [Paul] reached the top of the stairs.', 'Paul threw the schoolbag up to Linda ||| before [Paul] reached the top of the stairs.', 'Charles cast the schoolbag up to Emma ||| before [Charles] reached the top of the stairs.', 'Charles cast the schoolbag up to Linda ||| before [Charles] reached the top of the stairs.', 'Paul cast the schoolbag up to Emma ||| before [Paul] reached the top of 
the stairs.', 'Paul cast the schoolbag up to Linda ||| before [Paul] reached the top of the stairs.'], ['Charles threw the schoolbag up to Emma ||| before [Emma] reached the bottom of the stairs.', 'Charles threw the schoolbag up to Linda ||| before [Linda] reached the bottom of the stairs.', 'Paul threw the schoolbag up to Emma ||| before [Emma] reached the bottom of the stairs.', 'Paul threw the schoolbag up to Linda ||| before [Linda] reached the bottom of the stairs.', 'Charles cast the schoolbag up to Emma ||| before [Emma] reached the bottom of the stairs.', 'Charles cast the schoolbag up to Linda ||| before [Linda] reached the bottom of the stairs.', 'Paul cast the schoolbag up to Emma ||| before [Emma] reached the bottom of the stairs.', 'Paul cast the schoolbag up to Linda ||| before [Linda] reached the bottom of the stairs.'], ['Emma caught the schoolbag thrown up by Charles ||| before [Charles] reached the top of the stairs.', 'Linda caught the schoolbag thrown up by Charles ||| before [Charles] reached the top of the stairs.', 'Emma caught the schoolbag thrown up by Paul ||| before [Paul] reached the top of the stairs.', 'Linda caught the schoolbag thrown up by Paul ||| before [Paul] reached the top of the stairs.', 'Emma took the schoolbag thrown up by Charles ||| before [Charles] reached the top of the stairs.', 'Linda took the schoolbag thrown up by Charles ||| before [Charles] reached the top of the stairs.', 'Emma took the schoolbag thrown up by Paul ||| before [Paul] reached the top of the stairs.', 'Linda took the schoolbag thrown up by Paul ||| before [Paul] reached the top of the stairs.'], ['Emma caught the schoolbag thrown up by Charles ||| before [Emma] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown up by Charles ||| before [Linda] reached the bottom of the stairs.', 'Emma caught the schoolbag thrown up by Paul ||| before [Emma] reached the bottom of the stairs.', 'Linda caught the schoolbag thrown up by Paul ||| 
before [Linda] reached the bottom of the stairs.', 'Emma took the schoolbag thrown up by Charles ||| before [Emma] reached the bottom of the stairs.', 'Linda took the schoolbag thrown up by Charles ||| before [Linda] reached the bottom of the stairs.', 'Emma took the schoolbag thrown up by Paul ||| before [Emma] reached the bottom of the stairs.', 'Linda took the schoolbag thrown up by Paul ||| before [Linda] reached the bottom of the stairs.']], [["the ball doesn't fit into the bag ||| because the [ball] is large.", "the ball doesn't fit into the box ||| because the [ball] is large.", "the toy doesn't fit into the bag ||| because the [toy] is large.", "the toy doesn't fit into the box ||| because the [toy] is large.", "the ball can't be put into the bag ||| because the [ball] is large.", "the ball can't be put into the box ||| because the [ball] is large.", "the toy can't be put into the bag ||| because the [toy] is large.", "the toy can't be put into the box ||| because the [toy] is large."], ["the ball doesn't fit into the bag ||| because the [bag] is small.", "the ball doesn't fit into the box ||| because the [box] is small.", "the toy doesn't fit into the bag ||| because the [bag] is small.", "the toy doesn't fit into the box ||| because the [box] is small.", "the ball can't be put into the bag ||| because the [bag] is small.", "the ball can't be put into the box ||| because the [box] is small.", "the toy can't be put into the bag ||| because the [bag] is small.", "the toy can't be put into the box ||| because the [box] is small."], ["the ball doesn't fit into the bag ||| because the [ball] isn't small.", "the ball doesn't fit into the box ||| because the [ball] isn't small.", "the toy doesn't fit into the bag ||| because the [toy] isn't small.", "the toy doesn't fit into the box ||| because the [toy] isn't small.", "the ball can't be put into the bag ||| because the [ball] isn't small.", "the ball can't be put into the box ||| because the [ball] isn't 
small.", "the toy can't be put into the bag ||| because the [toy] isn't small.", "the toy can't be put into the box ||| because the [toy] isn't small."], ["the ball doesn't fit into the bag ||| because the [bag] isn't large.", "the ball doesn't fit into the box ||| because the [box] isn't large.", "the toy doesn't fit into the bag ||| because the [bag] isn't large.", "the toy doesn't fit into the box ||| because the [box] isn't large.", "the ball can't be put into the bag ||| because the [bag] isn't large.", "the ball can't be put into the box ||| because the [box] isn't large.", "the toy can't be put into the bag ||| because the [bag] isn't large.", "the toy can't be put into the box ||| because the [box] isn't large."], ["the bag doesn't hold the ball ||| because the [ball] is large.", "the box doesn't hold the ball ||| because the [ball] is large.", "the bag doesn't hold the toy ||| because the [toy] is large.", "the box doesn't hold the toy ||| because the [toy] is large.", "the bag doesn't have enough room for the ball ||| because the [ball] is large.", "the box doesn't have enough room for the ball ||| because the [ball] is large.", "the bag doesn't have enough room for the toy ||| because the [toy] is large.", "the box doesn't have enough room for the toy ||| because the [toy] is large."], ["the bag doesn't hold the ball ||| because the [bag] is small.", "the box doesn't hold the ball ||| because the [box] is small.", "the bag doesn't hold the toy ||| because the [bag] is small.", "the box doesn't hold the toy ||| because the [box] is small.", "the bag doesn't have enough room for the ball ||| because the [bag] is small.", "the box doesn't have enough room for the ball ||| because the [box] is small.", "the bag doesn't have enough room for the toy ||| because the [bag] is small.", "the box doesn't have enough room for the toy ||| because the [box] is small."], ["the bag doesn't hold the ball ||| because the [ball] isn't small.", "the box doesn't hold the 
ball ||| because the [ball] isn't small.", "the bag doesn't hold the toy ||| because the [toy] isn't small.", "the box doesn't hold the toy ||| because the [toy] isn't small.", "the bag doesn't have enough room for the ball ||| because the [ball] isn't small.", "the box doesn't have enough room for the ball ||| because the [ball] isn't small.", "the bag doesn't have enough room for the toy ||| because the [toy] isn't small.", "the box doesn't have enough room for the toy ||| because the [toy] isn't small."], ["the bag doesn't hold the ball ||| because the [bag] isn't large.", "the box doesn't hold the ball ||| because the [box] isn't large.", "the bag doesn't hold the toy ||| because the [bag] isn't large.", "the box doesn't hold the toy ||| because the [box] isn't large.", "the bag doesn't have enough room for the ball ||| because the [bag] isn't large.", "the box doesn't have enough room for the ball ||| because the [box] isn't large.", "the bag doesn't have enough room for the toy ||| because the [bag] isn't large.", "the box doesn't have enough room for the toy ||| because the [box] isn't large."], ["the ball can fit into the bag ||| because the [ball] isn't large.", "the ball can fit into the box ||| because the [ball] isn't large.", "the toy can fit into the bag ||| because the [toy] isn't large.", "the toy can fit into the box ||| because the [toy] isn't large.", "the ball can be put into the bag ||| because the [ball] isn't large.", "the ball can be put into the box ||| because the [ball] isn't large.", "the toy can be put into the bag ||| because the [toy] isn't large.", "the toy can be put into the box ||| because the [toy] isn't large."], ["the ball can fit into the bag ||| because the [bag] isn't small.", "the ball can fit into the box ||| because the [box] isn't small.", "the toy can fit into the bag ||| because the [bag] isn't small.", "the toy can fit into the box ||| because the [box] isn't small.", "the ball can be put into the bag ||| because the 
[bag] isn't small.", "the ball can be put into the box ||| because the [box] isn't small.", "the toy can be put into the bag ||| because the [bag] isn't small.", "the toy can be put into the box ||| because the [box] isn't small."], ['the ball can fit into the bag ||| because the [ball] is small.', 'the ball can fit into the box ||| because the [ball] is small.', 'the toy can fit into the bag ||| because the [toy] is small.', 'the toy can fit into the box ||| because the [toy] is small.', 'the ball can be put into the bag ||| because the [ball] is small.', 'the ball can be put into the box ||| because the [ball] is small.', 'the toy can be put into the bag ||| because the [toy] is small.', 'the toy can be put into the box ||| because the [toy] is small.'], ['the ball can fit into the bag ||| because the [bag] is large.', 'the ball can fit into the box ||| because the [box] is large.', 'the toy can fit into the bag ||| because the [bag] is large.', 'the toy can fit into the box ||| because the [box] is large.', 'the ball can be put into the bag ||| because the [bag] is large.', 'the ball can be put into the box ||| because the [box] is large.', 'the toy can be put into the bag ||| because the [bag] is large.', 'the toy can be put into the box ||| because the [box] is large.'], ["the bag can hold the ball ||| because the [ball] isn't large.", "the box can hold the ball ||| because the [ball] isn't large.", "the bag can hold the toy ||| because the [toy] isn't large.", "the box can hold the toy ||| because the [toy] isn't large.", "the bag has enough room for the ball ||| because the [ball] isn't large.", "the box has enough room for the ball ||| because the [ball] isn't large.", "the bag has enough room for the toy ||| because the [toy] isn't large.", "the box has enough room for the toy ||| because the [toy] isn't large."], ["the bag can hold the ball ||| because the [bag] isn't small.", "the box can hold the ball ||| because the [box] isn't small.", "the bag can 
hold the toy ||| because the [bag] isn't small.", "the box can hold the toy ||| because the [box] isn't small.", "the bag has enough room for the ball ||| because the [bag] isn't small.", "the box has enough room for the ball ||| because the [box] isn't small.", "the bag has enough room for the toy ||| because the [bag] isn't small.", "the box has enough room for the toy ||| because the [box] isn't small."], ['the bag can hold the ball ||| because the [ball] is small.', 'the box can hold the ball ||| because the [ball] is small.', 'the bag can hold the toy ||| because the [toy] is small.', 'the box can hold the toy ||| because the [toy] is small.', 'the bag has enough room for the ball ||| because the [ball] is small.', 'the box has enough room for the ball ||| because the [ball] is small.', 'the bag has enough room for the toy ||| because the [toy] is small.', 'the box has enough room for the toy ||| because the [toy] is small.'], ['the bag can hold the ball ||| because the [bag] is large.', 'the box can hold the ball ||| because the [box] is large.', 'the bag can hold the toy ||| because the [bag] is large.', 'the box can hold the toy ||| because the [box] is large.', 'the bag has enough room for the ball ||| because the [bag] is large.', 'the box has enough room for the ball ||| because the [box] is large.', 'the bag has enough room for the toy ||| because the [bag] is large.', 'the box has enough room for the toy ||| because the [box] is large.'], ["the ball doesn't fit into the bag ||| although the [ball] isn't large.", "the ball doesn't fit into the box ||| although the [ball] isn't large.", "the toy doesn't fit into the bag ||| although the [toy] isn't large.", "the toy doesn't fit into the box ||| although the [toy] isn't large.", "the ball can't be put into the bag ||| although the [ball] isn't large.", "the ball can't be put into the box ||| although the [ball] isn't large.", "the toy can't be put into the bag ||| although the [toy] isn't large.", "the 
toy can't be put into the box ||| although the [toy] isn't large."], ["the ball doesn't fit into the bag ||| although the [bag] isn't small.", "the ball doesn't fit into the box ||| although the [box] isn't small.", "the toy doesn't fit into the bag ||| although the [bag] isn't small.", "the toy doesn't fit into the box ||| although the [box] isn't small.", "the ball can't be put into the bag ||| although the [bag] isn't small.", "the ball can't be put into the box ||| although the [box] isn't small.", "the toy can't be put into the bag ||| although the [bag] isn't small.", "the toy can't be put into the box ||| although the [box] isn't small."], ["the ball doesn't fit into the bag ||| although the [ball] is small.", "the ball doesn't fit into the box ||| although the [ball] is small.", "the toy doesn't fit into the bag ||| although the [toy] is small.", "the toy doesn't fit into the box ||| although the [toy] is small.", "the ball can't be put into the bag ||| although the [ball] is small.", "the ball can't be put into the box ||| although the [ball] is small.", "the toy can't be put into the bag ||| although the [toy] is small.", "the toy can't be put into the box ||| although the [toy] is small."], ["the ball doesn't fit into the bag ||| although the [bag] is large.", "the ball doesn't fit into the box ||| although the [box] is large.", "the toy doesn't fit into the bag ||| although the [bag] is large.", "the toy doesn't fit into the box ||| although the [box] is large.", "the ball can't be put into the bag ||| although the [bag] is large.", "the ball can't be put into the box ||| although the [box] is large.", "the toy can't be put into the bag ||| although the [bag] is large.", "the toy can't be put into the box ||| although the [box] is large."], ["the bag doesn't hold the ball ||| although the [ball] isn't large.", "the box doesn't hold the ball ||| although the [ball] isn't large.", "the bag doesn't hold the toy ||| although the [toy] isn't large.", "the 
box doesn't hold the toy ||| although the [toy] isn't large.", "the bag doesn't have enough room for the ball ||| although the [ball] isn't large.", "the box doesn't have enough room for the ball ||| although the [ball] isn't large.", "the bag doesn't have enough room for the toy ||| although the [toy] isn't large.", "the box doesn't have enough room for the toy ||| although the [toy] isn't large."], ["the bag doesn't hold the ball ||| although the [bag] isn't small.", "the box doesn't hold the ball ||| although the [box] isn't small.", "the bag doesn't hold the toy ||| although the [bag] isn't small.", "the box doesn't hold the toy ||| although the [box] isn't small.", "the bag doesn't have enough room for the ball ||| although the [bag] isn't small.", "the box doesn't have enough room for the ball ||| although the [box] isn't small.", "the bag doesn't have enough room for the toy ||| although the [bag] isn't small.", "the box doesn't have enough room for the toy ||| although the [box] isn't small."], ["the bag doesn't hold the ball ||| although the [ball] is small.", "the box doesn't hold the ball ||| although the [ball] is small.", "the bag doesn't hold the toy ||| although the [toy] is small.", "the box doesn't hold the toy ||| although the [toy] is small.", "the bag doesn't have enough room for the ball ||| although the [ball] is small.", "the box doesn't have enough room for the ball ||| although the [ball] is small.", "the bag doesn't have enough room for the toy ||| although the [toy] is small.", "the box doesn't have enough room for the toy ||| although the [toy] is small."], ["the bag doesn't hold the ball ||| although the [bag] is large.", "the box doesn't hold the ball ||| although the [box] is large.", "the bag doesn't hold the toy ||| although the [bag] is large.", "the box doesn't hold the toy ||| although the [box] is large.", "the bag doesn't have enough room for the ball ||| although the [bag] is large.", "the box doesn't have enough room for the 
ball ||| although the [box] is large.", "the bag doesn't have enough room for the toy ||| although the [bag] is large.", "the box doesn't have enough room for the toy ||| although the [box] is large."], ['the ball can fit into the bag ||| although the [ball] is large.', 'the ball can fit into the box ||| although the [ball] is large.', 'the toy can fit into the bag ||| although the [toy] is large.', 'the toy can fit into the box ||| although the [toy] is large.', 'the ball can be put into the bag ||| although the [ball] is large.', 'the ball can be put into the box ||| although the [ball] is large.', 'the toy can be put into the bag ||| although the [toy] is large.', 'the toy can be put into the box ||| although the [toy] is large.'], ['the ball can fit into the bag ||| although the [bag] is small.', 'the ball can fit into the box ||| although the [box] is small.', 'the toy can fit into the bag ||| although the [bag] is small.', 'the toy can fit into the box ||| although the [box] is small.', 'the ball can be put into the bag ||| although the [bag] is small.', 'the ball can be put into the box ||| although the [box] is small.', 'the toy can be put into the bag ||| although the [bag] is small.', 'the toy can be put into the box ||| although the [box] is small.'], ["the ball can fit into the bag ||| although the [ball] isn't small.", "the ball can fit into the box ||| although the [ball] isn't small.", "the toy can fit into the bag ||| although the [toy] isn't small.", "the toy can fit into the box ||| although the [toy] isn't small.", "the ball can be put into the bag ||| although the [ball] isn't small.", "the ball can be put into the box ||| although the [ball] isn't small.", "the toy can be put into the bag ||| although the [toy] isn't small.", "the toy can be put into the box ||| although the [toy] isn't small."], ["the ball can fit into the bag ||| although the [bag] isn't large.", "the ball can fit into the box ||| although the [box] isn't large.", "the toy 
can fit into the bag ||| although the [bag] isn't large.", "the toy can fit into the box ||| although the [box] isn't large.", "the ball can be put into the bag ||| although the [bag] isn't large.", "the ball can be put into the box ||| although the [box] isn't large.", "the toy can be put into the bag ||| although the [bag] isn't large.", "the toy can be put into the box ||| although the [box] isn't large."], ['the bag can hold the ball ||| although the [ball] is large.', 'the box can hold the ball ||| although the [ball] is large.', 'the bag can hold the toy ||| although the [toy] is large.', 'the box can hold the toy ||| although the [toy] is large.', 'the bag has enough room for the ball ||| although the [ball] is large.', 'the box has enough room for the ball ||| although the [ball] is large.', 'the bag has enough room for the toy ||| although the [toy] is large.', 'the box has enough room for the toy ||| although the [toy] is large.'], ['the bag can hold the ball ||| although the [bag] is small.', 'the box can hold the ball ||| although the [box] is small.', 'the bag can hold the toy ||| although the [bag] is small.', 'the box can hold the toy ||| although the [box] is small.', 'the bag has enough room for the ball ||| although the [bag] is small.', 'the box has enough room for the ball ||| although the [box] is small.', 'the bag has enough room for the toy ||| although the [bag] is small.', 'the box has enough room for the toy ||| although the [box] is small.'], ["the bag can hold the ball ||| although the [ball] isn't small.", "the box can hold the ball ||| although the [ball] isn't small.", "the bag can hold the toy ||| although the [toy] isn't small.", "the box can hold the toy ||| although the [toy] isn't small.", "the bag has enough room for the ball ||| although the [ball] isn't small.", "the box has enough room for the ball ||| although the [ball] isn't small.", "the bag has enough room for the toy ||| although the [toy] isn't small.", "the box has 
enough room for the toy ||| although the [toy] isn't small."], ["the bag can hold the ball ||| although the [bag] isn't large.", "the box can hold the ball ||| although the [box] isn't large.", "the bag can hold the toy ||| although the [bag] isn't large.", "the box can hold the toy ||| although the [box] isn't large.", "the bag has enough room for the ball ||| although the [bag] isn't large.", "the box has enough room for the ball ||| although the [box] isn't large.", "the bag has enough room for the toy ||| although the [bag] isn't large.", "the box has enough room for the toy ||| although the [box] isn't large."]], [('the ball crashed right through the board ||| because the [ball] was hard.', 'the ball penetrated through the board ||| because the [ball] was hard.'), ('the ball crashed right through the board ||| because the [board] was soft.', 'the ball penetrated through the board ||| because the [board] was soft.'), ("the ball crashed right through the board ||| because the [ball] wasn't soft.", "the ball penetrated through the board ||| because the [ball] wasn't soft."), ("the ball crashed right through the board ||| because the [board] wasn't hard.", "the ball penetrated through the board ||| because the [board] wasn't hard."), ('the board failed to block the ball ||| because the [ball] was hard.', 'the board failed to stop the ball ||| because the [ball] was hard.'), ('the board failed to block the ball ||| because the [board] was soft.', 'the board failed to stop the ball ||| because the [board] was soft.'), ("the board failed to block the ball ||| because the [ball] wasn't soft.", "the board failed to stop the ball ||| because the [ball] wasn't soft."), ("the board failed to block the ball ||| because the [board] wasn't hard.", "the board failed to stop the ball ||| because the [board] wasn't hard."), ("the ball didn't crash through the board ||| because the [ball] wasn't hard.", "the ball didn't penetrate through the board ||| because the [ball] wasn't 
hard."), ("the ball didn't crash through the board ||| because the [board] wasn't soft.", "the ball didn't penetrate through the board ||| because the [board] wasn't soft."), ("the ball didn't crash through the board ||| because the [ball] was soft.", "the ball didn't penetrate through the board ||| because the [ball] was soft."), ("the ball didn't crash through the board ||| because the [board] was hard.", "the ball didn't penetrate through the board ||| because the [board] was hard."), ("the board blocked the ball ||| because the [ball] wasn't hard.", "the board stopped the ball ||| because the [ball] wasn't hard."), ("the board blocked the ball ||| because the [board] wasn't soft.", "the board stopped the ball ||| because the [board] wasn't soft."), ('the board blocked the ball ||| because the [ball] was soft.', 'the board stopped the ball ||| because the [ball] was soft.'), ('the board blocked the ball ||| because the [board] was hard.', 'the board stopped the ball ||| because the [board] was hard.'), ("the ball crashed right through the board ||| although the [ball] wasn't hard.", "the ball penetrated through the board ||| although the [ball] wasn't hard."), ("the ball crashed right through the board ||| although the [board] wasn't soft.", "the ball penetrated through the board ||| although the [board] wasn't soft."), ('the ball crashed right through the board ||| although the [ball] was soft.', 'the ball penetrated through the board ||| although the [ball] was soft.'), ('the ball crashed right through the board ||| although the [board] was hard.', 'the ball penetrated through the board ||| although the [board] was hard.'), ("the board failed to block the ball ||| although the [ball] wasn't hard.", "the board failed to stop the ball ||| although the [ball] wasn't hard."), ("the board failed to block the ball ||| although the [board] wasn't soft.", "the board failed to stop the ball ||| although the [board] wasn't soft."), ('the board failed to block the ball 
||| although the [ball] was soft.', 'the board failed to stop the ball ||| although the [ball] was soft.'), ('the board failed to block the ball ||| although the [board] was hard.', 'the board failed to stop the ball ||| although the [board] was hard.'), ("the ball didn't crash through the board ||| although the [ball] was hard.", "the ball didn't penetrate through the board ||| although the [ball] was hard."), ("the ball didn't crash through the board ||| although the [board] was soft.", "the ball didn't penetrate through the board ||| although the [board] was soft."), ("the ball didn't crash through the board ||| although the [ball] wasn't soft.", "the ball didn't penetrate through the board ||| although the [ball] wasn't soft."), ("the ball didn't crash through the board ||| although the [board] wasn't hard.", "the ball didn't penetrate through the board ||| although the [board] wasn't hard."), ('the board blocked the ball ||| although the [ball] was hard.', 'the board stopped the ball ||| although the [ball] was hard.'), ('the board blocked the ball ||| although the [board] was soft.', 'the board stopped the ball ||| although the [board] was soft.'), ("the board blocked the ball ||| although the [ball] wasn't soft.", "the board stopped the ball ||| although the [ball] wasn't soft."), ("the board blocked the ball ||| although the [board] wasn't hard.", "the board stopped the ball ||| although the [board] wasn't hard.")], ['Steve follows Lucy in everything ||| because [Steve] is bad at making decisions.', 'Steve follows Lucy in everything ||| because [Lucy] is good at making decisions.', "Steve follows Lucy in everything ||| because [Steve] isn't good at making decisions.", "Steve follows Lucy in everything ||| because [Lucy] isn't bad at making decisions.", 'Lucy is followed by Steve in everything ||| because [Steve] is bad at making decisions.', 'Lucy is followed by Steve in everything ||| because [Lucy] is good at making decisions.', "Lucy is followed by Steve 
in everything ||| because [Steve] isn't good at making decisions.", "Lucy is followed by Steve in everything ||| because [Lucy] isn't bad at making decisions.", "Steve doesn't follow Lucy in everything ||| because [Steve] isn't bad at making decisions.", "Steve doesn't follow Lucy in everything ||| because [Lucy] isn't good at making decisions.", "Steve doesn't follow Lucy in everything ||| because [Steve] is good at making decisions.", "Steve doesn't follow Lucy in everything ||| because [Lucy] is bad at making decisions.", "Lucy isn't followed by Steve in everything ||| because [Steve] isn't bad at making decisions.", "Lucy isn't followed by Steve in everything ||| because [Lucy] isn't good at making decisions.", "Lucy isn't followed by Steve in everything ||| because [Steve] is good at making decisions.", "Lucy isn't followed by Steve in everything ||| because [Lucy] is bad at making decisions."], [['the skirt is traded by Grace for the hat ||| because she thinks the [skirt] looks bad.', 'the skirt is traded by Grace for the short ||| because she thinks the [skirt] looks bad.', 'the cap is traded by Grace for the hat ||| because she thinks the [cap] looks bad.', 'the cap is traded by Grace for the short ||| because she thinks the [cap] looks bad.', 'the skirt is replaced by Grace with the hat ||| because she thinks the [skirt] looks bad.', 'the skirt is replaced by Grace with the short ||| because she thinks the [skirt] looks bad.', 'the cap is replaced by Grace with the hat ||| because she thinks the [cap] looks bad.', 'the cap is replaced by Grace with the short ||| because she thinks the [cap] looks bad.'], ['the skirt is traded by Grace for the hat ||| because she thinks the [hat] looks good.', 'the skirt is traded by Grace for the short ||| because she thinks the [short] looks good.', 'the cap is traded by Grace for the hat ||| because she thinks the [hat] looks good.', 'the cap is traded by Grace for the short ||| because she thinks the [short] looks 
good.', 'the skirt is replaced by Grace with the hat ||| because she thinks the [hat] looks good.', 'the skirt is replaced by Grace with the short ||| because she thinks the [short] looks good.', 'the cap is replaced by Grace with the hat ||| because she thinks the [hat] looks good.', 'the cap is replaced by Grace with the short ||| because she thinks the [short] looks good.'], ['the skirt is traded by Grace for the hat ||| because she thinks the [skirt] looks not good.', 'the skirt is traded by Grace for the short ||| because she thinks the [skirt] looks not good.', 'the cap is traded by Grace for the hat ||| because she thinks the [cap] looks not good.', 'the cap is traded by Grace for the short ||| because she thinks the [cap] looks not good.', 'the skirt is replaced by Grace with the hat ||| because she thinks the [skirt] looks not good.', 'the skirt is replaced by Grace with the short ||| because she thinks the [skirt] looks not good.', 'the cap is replaced by Grace with the hat ||| because she thinks the [cap] looks not good.', 'the cap is replaced by Grace with the short ||| because she thinks the [cap] looks not good.'], ['the skirt is traded by Grace for the hat ||| because she thinks the [hat] looks not bad.', 'the skirt is traded by Grace for the short ||| because she thinks the [short] looks not bad.', 'the cap is traded by Grace for the hat ||| because she thinks the [hat] looks not bad.', 'the cap is traded by Grace for the short ||| because she thinks the [short] looks not bad.', 'the skirt is replaced by Grace with the hat ||| because she thinks the [hat] looks not bad.', 'the skirt is replaced by Grace with the short ||| because she thinks the [short] looks not bad.', 'the cap is replaced by Grace with the hat ||| because she thinks the [hat] looks not bad.', 'the cap is replaced by Grace with the short ||| because she thinks the [short] looks not bad.'], ['the hat is substituted by Grace for the skirt ||| because she thinks the [skirt] looks 
bad.', 'the short is substituted by Grace for the skirt ||| because she thinks the [skirt] looks bad.', 'the hat is substituted by Grace for the cap ||| because she thinks the [cap] looks bad.', 'the short is substituted by Grace for the cap ||| because she thinks the [cap] looks bad.', 'the hat is preferred by Grace to the skirt ||| because she thinks the [skirt] looks bad.', 'the short is preferred by Grace to the skirt ||| because she thinks the [skirt] looks bad.', 'the hat is preferred by Grace to the cap ||| because she thinks the [cap] looks bad.', 'the short is preferred by Grace to the cap ||| because she thinks the [cap] looks bad.'], ['the hat is substituted by Grace for the skirt ||| because she thinks the [hat] looks good.', 'the short is substituted by Grace for the skirt ||| because she thinks the [short] looks good.', 'the hat is substituted by Grace for the cap ||| because she thinks the [hat] looks good.', 'the short is substituted by Grace for the cap ||| because she thinks the [short] looks good.', 'the hat is preferred by Grace to the skirt ||| because she thinks the [hat] looks good.', 'the short is preferred by Grace to the skirt ||| because she thinks the [short] looks good.', 'the hat is preferred by Grace to the cap ||| because she thinks the [hat] looks good.', 'the short is preferred by Grace to the cap ||| because she thinks the [short] looks good.'], ['the hat is substituted by Grace for the skirt ||| because she thinks the [skirt] looks not good.', 'the short is substituted by Grace for the skirt ||| because she thinks the [skirt] looks not good.', 'the hat is substituted by Grace for the cap ||| because she thinks the [cap] looks not good.', 'the short is substituted by Grace for the cap ||| because she thinks the [cap] looks not good.', 'the hat is preferred by Grace to the skirt ||| because she thinks the [skirt] looks not good.', 'the short is preferred by Grace to the skirt ||| because she thinks the [skirt] looks not good.', 
'the hat is preferred by Grace to the cap ||| because she thinks the [cap] looks not good.', 'the short is preferred by Grace to the cap ||| because she thinks the [cap] looks not good.'], ['the hat is substituted by Grace for the skirt ||| because she thinks the [hat] looks not bad.', 'the short is substituted by Grace for the skirt ||| because she thinks the [short] looks not bad.', 'the hat is substituted by Grace for the cap ||| because she thinks the [hat] looks not bad.', 'the short is substituted by Grace for the cap ||| because she thinks the [short] looks not bad.', 'the hat is preferred by Grace to the skirt ||| because she thinks the [hat] looks not bad.', 'the short is preferred by Grace to the skirt ||| because she thinks the [short] looks not bad.', 'the hat is preferred by Grace to the cap ||| because she thinks the [hat] looks not bad.', 'the short is preferred by Grace to the cap ||| because she thinks the [short] looks not bad.'], ["the skirt isn't traded by Grace for the hat ||| because she thinks the [skirt] looks not bad.", "the skirt isn't traded by Grace for the short ||| because she thinks the [skirt] looks not bad.", "the cap isn't traded by Grace for the hat ||| because she thinks the [cap] looks not bad.", "the cap isn't traded by Grace for the short ||| because she thinks the [cap] looks not bad.", "the skirt isn't replaced by Grace with the hat ||| because she thinks the [skirt] looks not bad.", "the skirt isn't replaced by Grace with the short ||| because she thinks the [skirt] looks not bad.", "the cap isn't replaced by Grace with the hat ||| because she thinks the [cap] looks not bad.", "the cap isn't replaced by Grace with the short ||| because she thinks the [cap] looks not bad."], ["the skirt isn't traded by Grace for the hat ||| because she thinks the [hat] looks not good.", "the skirt isn't traded by Grace for the short ||| because she thinks the [short] looks not good.", "the cap isn't traded by Grace for the hat ||| because 
she thinks the [hat] looks not good.", "the cap isn't traded by Grace for the short ||| because she thinks the [short] looks not good.", "the skirt isn't replaced by Grace with the hat ||| because she thinks the [hat] looks not good.", "the skirt isn't replaced by Grace with the short ||| because she thinks the [short] looks not good.", "the cap isn't replaced by Grace with the hat ||| because she thinks the [hat] looks not good.", "the cap isn't replaced by Grace with the short ||| because she thinks the [short] looks not good."], ["the skirt isn't traded by Grace for the hat ||| because she thinks the [skirt] looks good.", "the skirt isn't traded by Grace for the short ||| because she thinks the [skirt] looks good.", "the cap isn't traded by Grace for the hat ||| because she thinks the [cap] looks good.", "the cap isn't traded by Grace for the short ||| because she thinks the [cap] looks good.", "the skirt isn't replaced by Grace with the hat ||| because she thinks the [skirt] looks good.", "the skirt isn't replaced by Grace with the short ||| because she thinks the [skirt] looks good.", "the cap isn't replaced by Grace with the hat ||| because she thinks the [cap] looks good.", "the cap isn't replaced by Grace with the short ||| because she thinks the [cap] looks good."], ["the skirt isn't traded by Grace for the hat ||| because she thinks the [hat] looks bad.", "the skirt isn't traded by Grace for the short ||| because she thinks the [short] looks bad.", "the cap isn't traded by Grace for the hat ||| because she thinks the [hat] looks bad.", "the cap isn't traded by Grace for the short ||| because she thinks the [short] looks bad.", "the skirt isn't replaced by Grace with the hat ||| because she thinks the [hat] looks bad.", "the skirt isn't replaced by Grace with the short ||| because she thinks the [short] looks bad.", "the cap isn't replaced by Grace with the hat ||| because she thinks the [hat] looks bad.", "the cap isn't replaced by Grace with the short 
||| because she thinks the [short] looks bad."], ["the hat isn't substituted by Grace for the skirt ||| because she thinks the [skirt] looks not bad.", "the short isn't substituted by Grace for the skirt ||| because she thinks the [skirt] looks not bad.", "the hat isn't substituted by Grace for the cap ||| because she thinks the [cap] looks not bad.", "the short isn't substituted by Grace for the cap ||| because she thinks the [cap] looks not bad.", "the hat isn't preferred by Grace to the skirt ||| because she thinks the [skirt] looks not bad.", "the short isn't preferred by Grace to the skirt ||| because she thinks the [skirt] looks not bad.", "the hat isn't preferred by Grace to the cap ||| because she thinks the [cap] looks not bad.", "the short isn't preferred by Grace to the cap ||| because she thinks the [cap] looks not bad."], ["the hat isn't substituted by Grace for the skirt ||| because she thinks the [hat] looks not good.", "the short isn't substituted by Grace for the skirt ||| because she thinks the [short] looks not good.", "the hat isn't substituted by Grace for the cap ||| because she thinks the [hat] looks not good.", "the short isn't substituted by Grace for the cap ||| because she thinks the [short] looks not good.", "the hat isn't preferred by Grace to the skirt ||| because she thinks the [hat] looks not good.", "the short isn't preferred by Grace to the skirt ||| because she thinks the [short] looks not good.", "the hat isn't preferred by Grace to the cap ||| because she thinks the [hat] looks not good.", "the short isn't preferred by Grace to the cap ||| because she thinks the [short] looks not good."], ["the hat isn't substituted by Grace for the skirt ||| because she thinks the [skirt] looks good.", "the short isn't substituted by Grace for the skirt ||| because she thinks the [skirt] looks good.", "the hat isn't substituted by Grace for the cap ||| because she thinks the [cap] looks good.", "the short isn't substituted by Grace for the cap 
||| because she thinks the [cap] looks good.", "the hat isn't preferred by Grace to the skirt ||| because she thinks the [skirt] looks good.", "the short isn't preferred by Grace to the skirt ||| because she thinks the [skirt] looks good.", "the hat isn't preferred by Grace to the cap ||| because she thinks the [cap] looks good.", "the short isn't preferred by Grace to the cap ||| because she thinks the [cap] looks good."], ["the hat isn't substituted by Grace for the skirt ||| because she thinks the [hat] looks bad.", "the short isn't substituted by Grace for the skirt ||| because she thinks the [short] looks bad.", "the hat isn't substituted by Grace for the cap ||| because she thinks the [hat] looks bad.", "the short isn't substituted by Grace for the cap ||| because she thinks the [short] looks bad.", "the hat isn't preferred by Grace to the skirt ||| because she thinks the [hat] looks bad.", "the short isn't preferred by Grace to the skirt ||| because she thinks the [short] looks bad.", "the hat isn't preferred by Grace to the cap ||| because she thinks the [hat] looks bad.", "the short isn't preferred by Grace to the cap ||| because she thinks the [short] looks bad."], ['the skirt is traded by Grace for the hat ||| although she thinks the [skirt] looks not bad.', 'the skirt is traded by Grace for the short ||| although she thinks the [skirt] looks not bad.', 'the cap is traded by Grace for the hat ||| although she thinks the [cap] looks not bad.', 'the cap is traded by Grace for the short ||| although she thinks the [cap] looks not bad.', 'the skirt is replaced by Grace with the hat ||| although she thinks the [skirt] looks not bad.', 'the skirt is replaced by Grace with the short ||| although she thinks the [skirt] looks not bad.', 'the cap is replaced by Grace with the hat ||| although she thinks the [cap] looks not bad.', 'the cap is replaced by Grace with the short ||| although she thinks the [cap] looks not bad.'], ['the skirt is traded by Grace for the 
hat ||| although she thinks the [hat] looks not good.', 'the skirt is traded by Grace for the short ||| although she thinks the [short] looks not good.', 'the cap is traded by Grace for the hat ||| although she thinks the [hat] looks not good.', 'the cap is traded by Grace for the short ||| although she thinks the [short] looks not good.', 'the skirt is replaced by Grace with the hat ||| although she thinks the [hat] looks not good.', 'the skirt is replaced by Grace with the short ||| although she thinks the [short] looks not good.', 'the cap is replaced by Grace with the hat ||| although she thinks the [hat] looks not good.', 'the cap is replaced by Grace with the short ||| although she thinks the [short] looks not good.'], ['the skirt is traded by Grace for the hat ||| although she thinks the [skirt] looks good.', 'the skirt is traded by Grace for the short ||| although she thinks the [skirt] looks good.', 'the cap is traded by Grace for the hat ||| although she thinks the [cap] looks good.', 'the cap is traded by Grace for the short ||| although she thinks the [cap] looks good.', 'the skirt is replaced by Grace with the hat ||| although she thinks the [skirt] looks good.', 'the skirt is replaced by Grace with the short ||| although she thinks the [skirt] looks good.', 'the cap is replaced by Grace with the hat ||| although she thinks the [cap] looks good.', 'the cap is replaced by Grace with the short ||| although she thinks the [cap] looks good.'], ['the skirt is traded by Grace for the hat ||| although she thinks the [hat] looks bad.', 'the skirt is traded by Grace for the short ||| although she thinks the [short] looks bad.', 'the cap is traded by Grace for the hat ||| although she thinks the [hat] looks bad.', 'the cap is traded by Grace for the short ||| although she thinks the [short] looks bad.', 'the skirt is replaced by Grace with the hat ||| although she thinks the [hat] looks bad.', 'the skirt is replaced by Grace with the short ||| although she 
thinks the [short] looks bad.', 'the cap is replaced by Grace with the hat ||| although she thinks the [hat] looks bad.', 'the cap is replaced by Grace with the short ||| although she thinks the [short] looks bad.'], ['the hat is substituted by Grace for the skirt ||| although she thinks the [skirt] looks not bad.', 'the short is substituted by Grace for the skirt ||| although she thinks the [skirt] looks not bad.', 'the hat is substituted by Grace for the cap ||| although she thinks the [cap] looks not bad.', 'the short is substituted by Grace for the cap ||| although she thinks the [cap] looks not bad.', 'the hat is preferred by Grace to the skirt ||| although she thinks the [skirt] looks not bad.', 'the short is preferred by Grace to the skirt ||| although she thinks the [skirt] looks not bad.', 'the hat is preferred by Grace to the cap ||| although she thinks the [cap] looks not bad.', 'the short is preferred by Grace to the cap ||| although she thinks the [cap] looks not bad.'], ['the hat is substituted by Grace for the skirt ||| although she thinks the [hat] looks not good.', 'the short is substituted by Grace for the skirt ||| although she thinks the [short] looks not good.', 'the hat is substituted by Grace for the cap ||| although she thinks the [hat] looks not good.', 'the short is substituted by Grace for the cap ||| although she thinks the [short] looks not good.', 'the hat is preferred by Grace to the skirt ||| although she thinks the [hat] looks not good.', 'the short is preferred by Grace to the skirt ||| although she thinks the [short] looks not good.', 'the hat is preferred by Grace to the cap ||| although she thinks the [hat] looks not good.', 'the short is preferred by Grace to the cap ||| although she thinks the [short] looks not good.'], ['the hat is substituted by Grace for the skirt ||| although she thinks the [skirt] looks good.', 'the short is substituted by Grace for the skirt ||| although she thinks the [skirt] looks good.', 'the hat is 
substituted by Grace for the cap ||| although she thinks the [cap] looks good.', 'the short is substituted by Grace for the cap ||| although she thinks the [cap] looks good.', 'the hat is preferred by Grace to the skirt ||| although she thinks the [skirt] looks good.', 'the short is preferred by Grace to the skirt ||| although she thinks the [skirt] looks good.', 'the hat is preferred by Grace to the cap ||| although she thinks the [cap] looks good.', 'the short is preferred by Grace to the cap ||| although she thinks the [cap] looks good.'], ['the hat is substituted by Grace for the skirt ||| although she thinks the [hat] looks bad.', 'the short is substituted by Grace for the skirt ||| although she thinks the [short] looks bad.', 'the hat is substituted by Grace for the cap ||| although she thinks the [hat] looks bad.', 'the short is substituted by Grace for the cap ||| although she thinks the [short] looks bad.', 'the hat is preferred by Grace to the skirt ||| although she thinks the [hat] looks bad.', 'the short is preferred by Grace to the skirt ||| although she thinks the [short] looks bad.', 'the hat is preferred by Grace to the cap ||| although she thinks the [hat] looks bad.', 'the short is preferred by Grace to the cap ||| although she thinks the [short] looks bad.'], ["the skirt isn't traded by Grace for the hat ||| although she thinks the [skirt] looks bad.", "the skirt isn't traded by Grace for the short ||| although she thinks the [skirt] looks bad.", "the cap isn't traded by Grace for the hat ||| although she thinks the [cap] looks bad.", "the cap isn't traded by Grace for the short ||| although she thinks the [cap] looks bad.", "the skirt isn't replaced by Grace with the hat ||| although she thinks the [skirt] looks bad.", "the skirt isn't replaced by Grace with the short ||| although she thinks the [skirt] looks bad.", "the cap isn't replaced by Grace with the hat ||| although she thinks the [cap] looks bad.", "the cap isn't replaced by Grace with 
the short ||| although she thinks the [cap] looks bad."], ["the skirt isn't traded by Grace for the hat ||| although she thinks the [hat] looks good.", "the skirt isn't traded by Grace for the short ||| although she thinks the [short] looks good.", "the cap isn't traded by Grace for the hat ||| although she thinks the [hat] looks good.", "the cap isn't traded by Grace for the short ||| although she thinks the [short] looks good.", "the skirt isn't replaced by Grace with the hat ||| although she thinks the [hat] looks good.", "the skirt isn't replaced by Grace with the short ||| although she thinks the [short] looks good.", "the cap isn't replaced by Grace with the hat ||| although she thinks the [hat] looks good.", "the cap isn't replaced by Grace with the short ||| although she thinks the [short] looks good."], ["the skirt isn't traded by Grace for the hat ||| although she thinks the [skirt] looks not good.", "the skirt isn't traded by Grace for the short ||| although she thinks the [skirt] looks not good.", "the cap isn't traded by Grace for the hat ||| although she thinks the [cap] looks not good.", "the cap isn't traded by Grace for the short ||| although she thinks the [cap] looks not good.", "the skirt isn't replaced by Grace with the hat ||| although she thinks the [skirt] looks not good.", "the skirt isn't replaced by Grace with the short ||| although she thinks the [skirt] looks not good.", "the cap isn't replaced by Grace with the hat ||| although she thinks the [cap] looks not good.", "the cap isn't replaced by Grace with the short ||| although she thinks the [cap] looks not good."], ["the skirt isn't traded by Grace for the hat ||| although she thinks the [hat] looks not bad.", "the skirt isn't traded by Grace for the short ||| although she thinks the [short] looks not bad.", "the cap isn't traded by Grace for the hat ||| although she thinks the [hat] looks not bad.", "the cap isn't traded by Grace for the short ||| although she thinks the [short] looks 
not bad.", "the skirt isn't replaced by Grace with the hat ||| although she thinks the [hat] looks not bad.", "the skirt isn't replaced by Grace with the short ||| although she thinks the [short] looks not bad.", "the cap isn't replaced by Grace with the hat ||| although she thinks the [hat] looks not bad.", "the cap isn't replaced by Grace with the short ||| although she thinks the [short] looks not bad."], ["the hat isn't substituted by Grace for the skirt ||| although she thinks the [skirt] looks bad.", "the short isn't substituted by Grace for the skirt ||| although she thinks the [skirt] looks bad.", "the hat isn't substituted by Grace for the cap ||| although she thinks the [cap] looks bad.", "the short isn't substituted by Grace for the cap ||| although she thinks the [cap] looks bad.", "the hat isn't preferred by Grace to the skirt ||| although she thinks the [skirt] looks bad.", "the short isn't preferred by Grace to the skirt ||| although she thinks the [skirt] looks bad.", "the hat isn't preferred by Grace to the cap ||| although she thinks the [cap] looks bad.", "the short isn't preferred by Grace to the cap ||| although she thinks the [cap] looks bad."], ["the hat isn't substituted by Grace for the skirt ||| although she thinks the [hat] looks good.", "the short isn't substituted by Grace for the skirt ||| although she thinks the [short] looks good.", "the hat isn't substituted by Grace for the cap ||| although she thinks the [hat] looks good.", "the short isn't substituted by Grace for the cap ||| although she thinks the [short] looks good.", "the hat isn't preferred by Grace to the skirt ||| although she thinks the [hat] looks good.", "the short isn't preferred by Grace to the skirt ||| although she thinks the [short] looks good.", "the hat isn't preferred by Grace to the cap ||| although she thinks the [hat] looks good.", "the short isn't preferred by Grace to the cap ||| although she thinks the [short] looks good."], ["the hat isn't substituted by 
Grace for the skirt ||| although she thinks the [skirt] looks not good.", "the short isn't substituted by Grace for the skirt ||| although she thinks the [skirt] looks not good.", "the hat isn't substituted by Grace for the cap ||| although she thinks the [cap] looks not good.", "the short isn't substituted by Grace for the cap ||| although she thinks the [cap] looks not good.", "the hat isn't preferred by Grace to the skirt ||| although she thinks the [skirt] looks not good.", "the short isn't preferred by Grace to the skirt ||| although she thinks the [skirt] looks not good.", "the hat isn't preferred by Grace to the cap ||| although she thinks the [cap] looks not good.", "the short isn't preferred by Grace to the cap ||| although she thinks the [cap] looks not good."], ["the hat isn't substituted by Grace for the skirt ||| although she thinks the [hat] looks not bad.", "the short isn't substituted by Grace for the skirt ||| although she thinks the [short] looks not bad.", "the hat isn't substituted by Grace for the cap ||| although she thinks the [hat] looks not bad.", "the short isn't substituted by Grace for the cap ||| although she thinks the [short] looks not bad.", "the hat isn't preferred by Grace to the skirt ||| although she thinks the [hat] looks not bad.", "the short isn't preferred by Grace to the skirt ||| although she thinks the [short] looks not bad.", "the hat isn't preferred by Grace to the cap ||| although she thinks the [hat] looks not bad.", "the short isn't preferred by Grace to the cap ||| although she thinks the [short] looks not bad."]], [('Sam succeeded in fooling Emma ||| so [Sam] got a lot of money.', 'Sam succeeded in cheating Emma ||| so [Sam] got a lot of money.'), ('Sam succeeded in fooling Emma ||| so [Emma] lost a lot of money.', 'Sam succeeded in cheating Emma ||| so [Emma] lost a lot of money.'), ('Emma was fooled by Sam ||| so [Sam] got a lot of money.', 'Emma was cheated by Sam ||| so [Sam] got a lot of money.'), ('Emma was 
fooled by Sam ||| so [Emma] lost a lot of money.', 'Emma was cheated by Sam ||| so [Emma] lost a lot of money.'), ("Sam failed to fool Emma ||| so [Sam] didn't get a lot of money.", "Sam failed to cheat Emma ||| so [Sam] didn't get a lot of money."), ("Sam failed to fool Emma ||| so [Emma] didn't lose a lot of money.", "Sam failed to cheat Emma ||| so [Emma] didn't lose a lot of money."), ("Emma wasn't fooled by Sam ||| so [Sam] didn't get a lot of money.", "Emma wasn't cheated by Sam ||| so [Sam] didn't get a lot of money."), ("Emma wasn't fooled by Sam ||| so [Emma] didn't lose a lot of money.", "Emma wasn't cheated by Sam ||| so [Emma] didn't lose a lot of money."), ("Sam succeeded in fooling Emma ||| but [Sam] didn't get a lot of money.", "Sam succeeded in cheating Emma ||| but [Sam] didn't get a lot of money."), ("Sam succeeded in fooling Emma ||| but [Emma] didn't lose a lot of money.", "Sam succeeded in cheating Emma ||| but [Emma] didn't lose a lot of money."), ("Emma was fooled by Sam ||| but [Sam] didn't get a lot of money.", "Emma was cheated by Sam ||| but [Sam] didn't get a lot of money."), ("Emma was fooled by Sam ||| but [Emma] didn't lose a lot of money.", "Emma was cheated by Sam ||| but [Emma] didn't lose a lot of money."), ('Sam failed to fool Emma ||| but [Sam] got a lot of money.', 'Sam failed to cheat Emma ||| but [Sam] got a lot of money.'), ('Sam failed to fool Emma ||| but [Emma] lost a lot of money.', 'Sam failed to cheat Emma ||| but [Emma] lost a lot of money.'), ("Emma wasn't fooled by Sam ||| but [Sam] got a lot of money.", "Emma wasn't cheated by Sam ||| but [Sam] got a lot of money."), ("Emma wasn't fooled by Sam ||| but [Emma] lost a lot of money.", "Emma wasn't cheated by Sam ||| but [Emma] lost a lot of money.")], ["John thanked Susan ||| although [John] hadn't received a lot of help.", "John thanked Susan ||| although [Susan] hadn't given a lot of help.", "Susan took good care of John ||| although [John] hadn't received a lot of 
help.", "Susan took good care of John ||| although [Susan] hadn't given a lot of help.", "John didn't thank Susan ||| although [John] had received a lot of help.", "John didn't thank Susan ||| although [Susan] had given a lot of help.", "Susan didn't good care of John ||| although [John] had received a lot of help.", "Susan didn't good care of John ||| although [Susan] had given a lot of help."], [['the cups could be placed on all the tables ||| because there were many of the [cups].', 'the cups could be placed on all the benches ||| because there were many of the [cups].', 'the pictures could be placed on all the tables ||| because there were many of the [pictures].', 'the pictures could be placed on all the benches ||| because there were many of the [pictures].', 'the cups could be put on all the tables ||| because there were many of the [cups].', 'the cups could be put on all the benches ||| because there were many of the [cups].', 'the pictures could be put on all the tables ||| because there were many of the [pictures].', 'the pictures could be put on all the benches ||| because there were many of the [pictures].'], ['the cups could be placed on all the tables ||| because there were few of the [tables].', 'the cups could be placed on all the benches ||| because there were few of the [benches].', 'the pictures could be placed on all the tables ||| because there were few of the [tables].', 'the pictures could be placed on all the benches ||| because there were few of the [benches].', 'the cups could be put on all the tables ||| because there were few of the [tables].', 'the cups could be put on all the benches ||| because there were few of the [benches].', 'the pictures could be put on all the tables ||| because there were few of the [tables].', 'the pictures could be put on all the benches ||| because there were few of the [benches].'], ['the cups could be placed on all the tables ||| because there were not few of the [cups].', 'the cups could be placed on all 
the benches ||| because there were not few of the [cups].', 'the pictures could be placed on all the tables ||| because there were not few of the [pictures].', 'the pictures could be placed on all the benches ||| because there were not few of the [pictures].', 'the cups could be put on all the tables ||| because there were not few of the [cups].', 'the cups could be put on all the benches ||| because there were not few of the [cups].', 'the pictures could be put on all the tables ||| because there were not few of the [pictures].', 'the pictures could be put on all the benches ||| because there were not few of the [pictures].'], ['the cups could be placed on all the tables ||| because there were not many of the [tables].', 'the cups could be placed on all the benches ||| because there were not many of the [benches].', 'the pictures could be placed on all the tables ||| because there were not many of the [tables].', 'the pictures could be placed on all the benches ||| because there were not many of the [benches].', 'the cups could be put on all the tables ||| because there were not many of the [tables].', 'the cups could be put on all the benches ||| because there were not many of the [benches].', 'the pictures could be put on all the tables ||| because there were not many of the [tables].', 'the pictures could be put on all the benches ||| because there were not many of the [benches].'], ['the tables could all be covered by the cups ||| because there were many of the [cups].', 'the benches could all be covered by the cups ||| because there were many of the [cups].', 'the tables could all be covered by the pictures ||| because there were many of the [pictures].', 'the benches could all be covered by the pictures ||| because there were many of the [pictures].', 'the tables could carry all the cups ||| because there were many of the [cups].', 'the benches could carry all the cups ||| because there were many of the [cups].', 'the tables could carry all the pictures ||| 
because there were many of the [pictures].', 'the benches could carry all the pictures ||| because there were many of the [pictures].'], ['the tables could all be covered by the cups ||| because there were few of the [tables].', 'the benches could all be covered by the cups ||| because there were few of the [benches].', 'the tables could all be covered by the pictures ||| because there were few of the [tables].', 'the benches could all be covered by the pictures ||| because there were few of the [benches].', 'the tables could carry all the cups ||| because there were few of the [tables].', 'the benches could carry all the cups ||| because there were few of the [benches].', 'the tables could carry all the pictures ||| because there were few of the [tables].', 'the benches could carry all the pictures ||| because there were few of the [benches].'], ['the tables could all be covered by the cups ||| because there were not few of the [cups].', 'the benches could all be covered by the cups ||| because there were not few of the [cups].', 'the tables could all be covered by the pictures ||| because there were not few of the [pictures].', 'the benches could all be covered by the pictures ||| because there were not few of the [pictures].', 'the tables could carry all the cups ||| because there were not few of the [cups].', 'the benches could carry all the cups ||| because there were not few of the [cups].', 'the tables could carry all the pictures ||| because there were not few of the [pictures].', 'the benches could carry all the pictures ||| because there were not few of the [pictures].'], ['the tables could all be covered by the cups ||| because there were not many of the [tables].', 'the benches could all be covered by the cups ||| because there were not many of the [benches].', 'the tables could all be covered by the pictures ||| because there were not many of the [tables].', 'the benches could all be covered by the pictures ||| because there were not many of the 
[benches].', 'the tables could carry all the cups ||| because there were not many of the [tables].', 'the benches could carry all the cups ||| because there were not many of the [benches].', 'the tables could carry all the pictures ||| because there were not many of the [tables].', 'the benches could carry all the pictures ||| because there were not many of the [benches].'], ["the cups couldn't be placed on all the tables ||| because there were not many of the [cups].", "the cups couldn't be placed on all the benches ||| because there were not many of the [cups].", "the pictures couldn't be placed on all the tables ||| because there were not many of the [pictures].", "the pictures couldn't be placed on all the benches ||| because there were not many of the [pictures].", "the cups couldn't be put on all the tables ||| because there were not many of the [cups].", "the cups couldn't be put on all the benches ||| because there were not many of the [cups].", "the pictures couldn't be put on all the tables ||| because there were not many of the [pictures].", "the pictures couldn't be put on all the benches ||| because there were not many of the [pictures]."], ["the cups couldn't be placed on all the tables ||| because there were not few of the [tables].", "the cups couldn't be placed on all the benches ||| because there were not few of the [benches].", "the pictures couldn't be placed on all the tables ||| because there were not few of the [tables].", "the pictures couldn't be placed on all the benches ||| because there were not few of the [benches].", "the cups couldn't be put on all the tables ||| because there were not few of the [tables].", "the cups couldn't be put on all the benches ||| because there were not few of the [benches].", "the pictures couldn't be put on all the tables ||| because there were not few of the [tables].", "the pictures couldn't be put on all the benches ||| because there were not few of the [benches]."], ["the cups couldn't be placed on all 
the tables ||| because there were few of the [cups].", "the cups couldn't be placed on all the benches ||| because there were few of the [cups].", "the pictures couldn't be placed on all the tables ||| because there were few of the [pictures].", "the pictures couldn't be placed on all the benches ||| because there were few of the [pictures].", "the cups couldn't be put on all the tables ||| because there were few of the [cups].", "the cups couldn't be put on all the benches ||| because there were few of the [cups].", "the pictures couldn't be put on all the tables ||| because there were few of the [pictures].", "the pictures couldn't be put on all the benches ||| because there were few of the [pictures]."], ["the cups couldn't be placed on all the tables ||| because there were many of the [tables].", "the cups couldn't be placed on all the benches ||| because there were many of the [benches].", "the pictures couldn't be placed on all the tables ||| because there were many of the [tables].", "the pictures couldn't be placed on all the benches ||| because there were many of the [benches].", "the cups couldn't be put on all the tables ||| because there were many of the [tables].", "the cups couldn't be put on all the benches ||| because there were many of the [benches].", "the pictures couldn't be put on all the tables ||| because there were many of the [tables].", "the pictures couldn't be put on all the benches ||| because there were many of the [benches]."], ["the tables couldn't all be covered by the cups ||| because there were not many of the [cups].", "the benches couldn't all be covered by the cups ||| because there were not many of the [cups].", "the tables couldn't all be covered by the pictures ||| because there were not many of the [pictures].", "the benches couldn't all be covered by the pictures ||| because there were not many of the [pictures].", "the tables couldn't carry all the cups ||| because there were not many of the [cups].", "the benches 
couldn't carry all the cups ||| because there were not many of the [cups].", "the tables couldn't carry all the pictures ||| because there were not many of the [pictures].", "the benches couldn't carry all the pictures ||| because there were not many of the [pictures]."], ["the tables couldn't all be covered by the cups ||| because there were not few of the [tables].", "the benches couldn't all be covered by the cups ||| because there were not few of the [benches].", "the tables couldn't all be covered by the pictures ||| because there were not few of the [tables].", "the benches couldn't all be covered by the pictures ||| because there were not few of the [benches].", "the tables couldn't carry all the cups ||| because there were not few of the [tables].", "the benches couldn't carry all the cups ||| because there were not few of the [benches].", "the tables couldn't carry all the pictures ||| because there were not few of the [tables].", "the benches couldn't carry all the pictures ||| because there were not few of the [benches]."], ["the tables couldn't all be covered by the cups ||| because there were few of the [cups].", "the benches couldn't all be covered by the cups ||| because there were few of the [cups].", "the tables couldn't all be covered by the pictures ||| because there were few of the [pictures].", "the benches couldn't all be covered by the pictures ||| because there were few of the [pictures].", "the tables couldn't carry all the cups ||| because there were few of the [cups].", "the benches couldn't carry all the cups ||| because there were few of the [cups].", "the tables couldn't carry all the pictures ||| because there were few of the [pictures].", "the benches couldn't carry all the pictures ||| because there were few of the [pictures]."], ["the tables couldn't all be covered by the cups ||| because there were many of the [tables].", "the benches couldn't all be covered by the cups ||| because there were many of the [benches].", "the tables 
couldn't all be covered by the pictures ||| because there were many of the [tables].", "the benches couldn't all be covered by the pictures ||| because there were many of the [benches].", "the tables couldn't carry all the cups ||| because there were many of the [tables].", "the benches couldn't carry all the cups ||| because there were many of the [benches].", "the tables couldn't carry all the pictures ||| because there were many of the [tables].", "the benches couldn't carry all the pictures ||| because there were many of the [benches]."], ['the cups could be placed on all the tables ||| although there were not many of the [cups].', 'the cups could be placed on all the benches ||| although there were not many of the [cups].', 'the pictures could be placed on all the tables ||| although there were not many of the [pictures].', 'the pictures could be placed on all the benches ||| although there were not many of the [pictures].', 'the cups could be put on all the tables ||| although there were not many of the [cups].', 'the cups could be put on all the benches ||| although there were not many of the [cups].', 'the pictures could be put on all the tables ||| although there were not many of the [pictures].', 'the pictures could be put on all the benches ||| although there were not many of the [pictures].'], ['the cups could be placed on all the tables ||| although there were not few of the [tables].', 'the cups could be placed on all the benches ||| although there were not few of the [benches].', 'the pictures could be placed on all the tables ||| although there were not few of the [tables].', 'the pictures could be placed on all the benches ||| although there were not few of the [benches].', 'the cups could be put on all the tables ||| although there were not few of the [tables].', 'the cups could be put on all the benches ||| although there were not few of the [benches].', 'the pictures could be put on all the tables ||| although there were not few of the 
[tables].', 'the pictures could be put on all the benches ||| although there were not few of the [benches].'], ['the cups could be placed on all the tables ||| although there were few of the [cups].', 'the cups could be placed on all the benches ||| although there were few of the [cups].', 'the pictures could be placed on all the tables ||| although there were few of the [pictures].', 'the pictures could be placed on all the benches ||| although there were few of the [pictures].', 'the cups could be put on all the tables ||| although there were few of the [cups].', 'the cups could be put on all the benches ||| although there were few of the [cups].', 'the pictures could be put on all the tables ||| although there were few of the [pictures].', 'the pictures could be put on all the benches ||| although there were few of the [pictures].'], ['the cups could be placed on all the tables ||| although there were many of the [tables].', 'the cups could be placed on all the benches ||| although there were many of the [benches].', 'the pictures could be placed on all the tables ||| although there were many of the [tables].', 'the pictures could be placed on all the benches ||| although there were many of the [benches].', 'the cups could be put on all the tables ||| although there were many of the [tables].', 'the cups could be put on all the benches ||| although there were many of the [benches].', 'the pictures could be put on all the tables ||| although there were many of the [tables].', 'the pictures could be put on all the benches ||| although there were many of the [benches].'], ['the tables could all be covered by the cups ||| although there were not many of the [cups].', 'the benches could all be covered by the cups ||| although there were not many of the [cups].', 'the tables could all be covered by the pictures ||| although there were not many of the [pictures].', 'the benches could all be covered by the pictures ||| although there were not many of the [pictures].', 
'the tables could carry all the cups ||| although there were not many of the [cups].', 'the benches could carry all the cups ||| although there were not many of the [cups].', 'the tables could carry all the pictures ||| although there were not many of the [pictures].', 'the benches could carry all the pictures ||| although there were not many of the [pictures].'], ['the tables could all be covered by the cups ||| although there were not few of the [tables].', 'the benches could all be covered by the cups ||| although there were not few of the [benches].', 'the tables could all be covered by the pictures ||| although there were not few of the [tables].', 'the benches could all be covered by the pictures ||| although there were not few of the [benches].', 'the tables could carry all the cups ||| although there were not few of the [tables].', 'the benches could carry all the cups ||| although there were not few of the [benches].', 'the tables could carry all the pictures ||| although there were not few of the [tables].', 'the benches could carry all the pictures ||| although there were not few of the [benches].'], ['the tables could all be covered by the cups ||| although there were few of the [cups].', 'the benches could all be covered by the cups ||| although there were few of the [cups].', 'the tables could all be covered by the pictures ||| although there were few of the [pictures].', 'the benches could all be covered by the pictures ||| although there were few of the [pictures].', 'the tables could carry all the cups ||| although there were few of the [cups].', 'the benches could carry all the cups ||| although there were few of the [cups].', 'the tables could carry all the pictures ||| although there were few of the [pictures].', 'the benches could carry all the pictures ||| although there were few of the [pictures].'], ['the tables could all be covered by the cups ||| although there were many of the [tables].', 'the benches could all be covered by the cups ||| 
although there were many of the [benches].', 'the tables could all be covered by the pictures ||| although there were many of the [tables].', 'the benches could all be covered by the pictures ||| although there were many of the [benches].', 'the tables could carry all the cups ||| although there were many of the [tables].', 'the benches could carry all the cups ||| although there were many of the [benches].', 'the tables could carry all the pictures ||| although there were many of the [tables].', 'the benches could carry all the pictures ||| although there were many of the [benches].'], ["the cups couldn't be placed on all the tables ||| although there were many of the [cups].", "the cups couldn't be placed on all the benches ||| although there were many of the [cups].", "the pictures couldn't be placed on all the tables ||| although there were many of the [pictures].", "the pictures couldn't be placed on all the benches ||| although there were many of the [pictures].", "the cups couldn't be put on all the tables ||| although there were many of the [cups].", "the cups couldn't be put on all the benches ||| although there were many of the [cups].", "the pictures couldn't be put on all the tables ||| although there were many of the [pictures].", "the pictures couldn't be put on all the benches ||| although there were many of the [pictures]."], ["the cups couldn't be placed on all the tables ||| although there were few of the [tables].", "the cups couldn't be placed on all the benches ||| although there were few of the [benches].", "the pictures couldn't be placed on all the tables ||| although there were few of the [tables].", "the pictures couldn't be placed on all the benches ||| although there were few of the [benches].", "the cups couldn't be put on all the tables ||| although there were few of the [tables].", "the cups couldn't be put on all the benches ||| although there were few of the [benches].", "the pictures couldn't be put on all the tables ||| although 
there were few of the [tables].", "the pictures couldn't be put on all the benches ||| although there were few of the [benches]."], ["the cups couldn't be placed on all the tables ||| although there were not few of the [cups].", "the cups couldn't be placed on all the benches ||| although there were not few of the [cups].", "the pictures couldn't be placed on all the tables ||| although there were not few of the [pictures].", "the pictures couldn't be placed on all the benches ||| although there were not few of the [pictures].", "the cups couldn't be put on all the tables ||| although there were not few of the [cups].", "the cups couldn't be put on all the benches ||| although there were not few of the [cups].", "the pictures couldn't be put on all the tables ||| although there were not few of the [pictures].", "the pictures couldn't be put on all the benches ||| although there were not few of the [pictures]."], ["the cups couldn't be placed on all the tables ||| although there were not many of the [tables].", "the cups couldn't be placed on all the benches ||| although there were not many of the [benches].", "the pictures couldn't be placed on all the tables ||| although there were not many of the [tables].", "the pictures couldn't be placed on all the benches ||| although there were not many of the [benches].", "the cups couldn't be put on all the tables ||| although there were not many of the [tables].", "the cups couldn't be put on all the benches ||| although there were not many of the [benches].", "the pictures couldn't be put on all the tables ||| although there were not many of the [tables].", "the pictures couldn't be put on all the benches ||| although there were not many of the [benches]."], ["the tables couldn't all be covered by the cups ||| although there were many of the [cups].", "the benches couldn't all be covered by the cups ||| although there were many of the [cups].", "the tables couldn't all be covered by the pictures ||| although there were 
many of the [pictures].", "the benches couldn't all be covered by the pictures ||| although there were many of the [pictures].", "the tables couldn't carry all the cups ||| although there were many of the [cups].", "the benches couldn't carry all the cups ||| although there were many of the [cups].", "the tables couldn't carry all the pictures ||| although there were many of the [pictures].", "the benches couldn't carry all the pictures ||| although there were many of the [pictures]."], ["the tables couldn't all be covered by the cups ||| although there were few of the [tables].", "the benches couldn't all be covered by the cups ||| although there were few of the [benches].", "the tables couldn't all be covered by the pictures ||| although there were few of the [tables].", "the benches couldn't all be covered by the pictures ||| although there were few of the [benches].", "the tables couldn't carry all the cups ||| although there were few of the [tables].", "the benches couldn't carry all the cups ||| although there were few of the [benches].", "the tables couldn't carry all the pictures ||| although there were few of the [tables].", "the benches couldn't carry all the pictures ||| although there were few of the [benches]."], ["the tables couldn't all be covered by the cups ||| although there were not few of the [cups].", "the benches couldn't all be covered by the cups ||| although there were not few of the [cups].", "the tables couldn't all be covered by the pictures ||| although there were not few of the [pictures].", "the benches couldn't all be covered by the pictures ||| although there were not few of the [pictures].", "the tables couldn't carry all the cups ||| although there were not few of the [cups].", "the benches couldn't carry all the cups ||| although there were not few of the [cups].", "the tables couldn't carry all the pictures ||| although there were not few of the [pictures].", "the benches couldn't carry all the pictures ||| although there were 
not few of the [pictures]."], ["the tables couldn't all be covered by the cups ||| although there were not many of the [tables].", "the benches couldn't all be covered by the cups ||| although there were not many of the [benches].", "the tables couldn't all be covered by the pictures ||| although there were not many of the [tables].", "the benches couldn't all be covered by the pictures ||| although there were not many of the [benches].", "the tables couldn't carry all the cups ||| although there were not many of the [tables].", "the benches couldn't carry all the cups ||| although there were not many of the [benches].", "the tables couldn't carry all the pictures ||| although there were not many of the [tables].", "the benches couldn't carry all the pictures ||| although there were not many of the [benches]."]], ['Germany defeated Italy ||| because [Germany] was more powerful.', 'Germany defeated Italy ||| because [Italy] was less powerful.', "Germany defeated Italy ||| because [Germany] wasn't less powerful.", "Germany defeated Italy ||| because [Italy] wasn't more powerful.", 'Italy was defeated by Germany ||| because [Germany] was more powerful.', 'Italy was defeated by Germany ||| because [Italy] was less powerful.', "Italy was defeated by Germany ||| because [Germany] wasn't less powerful.", "Italy was defeated by Germany ||| because [Italy] wasn't more powerful.", "Germany didn't defeat Italy ||| because [Germany] wasn't more powerful.", "Germany didn't defeat Italy ||| because [Italy] wasn't less powerful.", "Germany didn't defeat Italy ||| because [Germany] was less powerful.", "Germany didn't defeat Italy ||| because [Italy] was more powerful.", "Italy wasn't defeated by Germany ||| because [Germany] wasn't more powerful.", "Italy wasn't defeated by Germany ||| because [Italy] wasn't less powerful.", "Italy wasn't defeated by Germany ||| because [Germany] was less powerful.", "Italy wasn't defeated by Germany ||| because [Italy] was more powerful."], 
['James ceded the presidency to Amy ||| because [James] was notorious.', 'James ceded the presidency to Amy ||| because [Amy] was popular.', "James ceded the presidency to Amy ||| because [James] wasn't popular.", 'James ceded the presidency to Amy ||| because [Amy] was not notorious.', 'Amy took over the presidency from James ||| because [James] was notorious.', 'Amy took over the presidency from James ||| because [Amy] was popular.', "Amy took over the presidency from James ||| because [James] wasn't popular.", 'Amy took over the presidency from James ||| because [Amy] was not notorious.', "James didn't cede the presidency to Amy ||| because [James] was not notorious.", "James didn't cede the presidency to Amy ||| because [Amy] wasn't popular.", "James didn't cede the presidency to Amy ||| because [James] was popular.", "James didn't cede the presidency to Amy ||| because [Amy] was notorious.", "Amy didn't take over the presidency from James ||| because [James] was not notorious.", "Amy didn't take over the presidency from James ||| because [Amy] wasn't popular.", "Amy didn't take over the presidency from James ||| because [James] was popular.", "Amy didn't take over the presidency from James ||| because [Amy] was notorious."], [('James ceded the presidency to Amy ||| because [James] was notorious.', 'James gave the presidency to Amy ||| because [James] was notorious.'), ('James ceded the presidency to Amy ||| because [Amy] was popular.', 'James gave the presidency to Amy ||| because [Amy] was popular.'), ("James ceded the presidency to Amy ||| because [James] wasn't popular.", "James gave the presidency to Amy ||| because [James] wasn't popular."), ('James ceded the presidency to Amy ||| because [Amy] was not notorious.', 'James gave the presidency to Amy ||| because [Amy] was not notorious.'), ('Amy took over the presidency from James ||| because [James] was notorious.', 'Amy got the presidency from James ||| because [James] was notorious.'), ('Amy took over 
the presidency from James ||| because [Amy] was popular.', 'Amy got the presidency from James ||| because [Amy] was popular.'), ("Amy took over the presidency from James ||| because [James] wasn't popular.", "Amy got the presidency from James ||| because [James] wasn't popular."), ('Amy took over the presidency from James ||| because [Amy] was not notorious.', 'Amy got the presidency from James ||| because [Amy] was not notorious.'), ("James didn't cede the presidency to Amy ||| because [James] was not notorious.", "James didn't give the presidency to Amy ||| because [James] was not notorious."), ("James didn't cede the presidency to Amy ||| because [Amy] wasn't popular.", "James didn't give the presidency to Amy ||| because [Amy] wasn't popular."), ("James didn't cede the presidency to Amy ||| because [James] was popular.", "James didn't give the presidency to Amy ||| because [James] was popular."), ("James didn't cede the presidency to Amy ||| because [Amy] was notorious.", "James didn't give the presidency to Amy ||| because [Amy] was notorious."), ("Amy didn't take over the presidency from James ||| because [James] was not notorious.", "Amy didn't get the presidency from James ||| because [James] was not notorious."), ("Amy didn't take over the presidency from James ||| because [Amy] wasn't popular.", "Amy didn't get the presidency from James ||| because [Amy] wasn't popular."), ("Amy didn't take over the presidency from James ||| because [James] was popular.", "Amy didn't get the presidency from James ||| because [James] was popular."), ("Amy didn't take over the presidency from James ||| because [Amy] was notorious.", "Amy didn't get the presidency from James ||| because [Amy] was notorious."), ('James ceded the presidency to Amy ||| although [James] was not notorious.', 'James gave the presidency to Amy ||| although [James] was not notorious.'), ("James ceded the presidency to Amy ||| although [Amy] wasn't popular.", "James gave the presidency to Amy ||| 
although [Amy] wasn't popular."), ('James ceded the presidency to Amy ||| although [James] was popular.', 'James gave the presidency to Amy ||| although [James] was popular.'), ('James ceded the presidency to Amy ||| although [Amy] was notorious.', 'James gave the presidency to Amy ||| although [Amy] was notorious.'), ('Amy took over the presidency from James ||| although [James] was not notorious.', 'Amy got the presidency from James ||| although [James] was not notorious.'), ("Amy took over the presidency from James ||| although [Amy] wasn't popular.", "Amy got the presidency from James ||| although [Amy] wasn't popular."), ('Amy took over the presidency from James ||| although [James] was popular.', 'Amy got the presidency from James ||| although [James] was popular.'), ('Amy took over the presidency from James ||| although [Amy] was notorious.', 'Amy got the presidency from James ||| although [Amy] was notorious.'), ("James didn't cede the presidency to Amy ||| although [James] was notorious.", "James didn't give the presidency to Amy ||| although [James] was notorious."), ("James didn't cede the presidency to Amy ||| although [Amy] was popular.", "James didn't give the presidency to Amy ||| although [Amy] was popular."), ("James didn't cede the presidency to Amy ||| although [James] wasn't popular.", "James didn't give the presidency to Amy ||| although [James] wasn't popular."), ("James didn't cede the presidency to Amy ||| although [Amy] was not notorious.", "James didn't give the presidency to Amy ||| although [Amy] was not notorious."), ("Amy didn't take over the presidency from James ||| although [James] was notorious.", "Amy didn't get the presidency from James ||| although [James] was notorious."), ("Amy didn't take over the presidency from James ||| although [Amy] was popular.", "Amy didn't get the presidency from James ||| although [Amy] was popular."), ("Amy didn't take over the presidency from James ||| although [James] wasn't popular.", "Amy didn't 
get the presidency from James ||| although [James] wasn't popular."), ("Amy didn't take over the presidency from James ||| although [Amy] was not notorious.", "Amy didn't get the presidency from James ||| although [Amy] was not notorious.")], [['the apples are more popular than the grapes ||| so the [apples] should be made more next time.', 'the apples are more popular than the sandwiches ||| so the [apples] should be made more next time.', 'the bananas are more popular than the grapes ||| so the [bananas] should be made more next time.', 'the bananas are more popular than the sandwiches ||| so the [bananas] should be made more next time.', 'the apples are sold more than the grapes ||| so the [apples] should be made more next time.', 'the apples are sold more than the sandwiches ||| so the [apples] should be made more next time.', 'the bananas are sold more than the grapes ||| so the [bananas] should be made more next time.', 'the bananas are sold more than the sandwiches ||| so the [bananas] should be made more next time.'], ['the apples are more popular than the grapes ||| so the [grapes] should be made less next time.', 'the apples are more popular than the sandwiches ||| so the [sandwiches] should be made less next time.', 'the bananas are more popular than the grapes ||| so the [grapes] should be made less next time.', 'the bananas are more popular than the sandwiches ||| so the [sandwiches] should be made less next time.', 'the apples are sold more than the grapes ||| so the [grapes] should be made less next time.', 'the apples are sold more than the sandwiches ||| so the [sandwiches] should be made less next time.', 'the bananas are sold more than the grapes ||| so the [grapes] should be made less next time.', 'the bananas are sold more than the sandwiches ||| so the [sandwiches] should be made less next time.'], ["the apples are more popular than the grapes ||| so the [apples] shouldn't be made less next time.", "the apples are more popular than the 
sandwiches ||| so the [apples] shouldn't be made less next time.", "the bananas are more popular than the grapes ||| so the [bananas] shouldn't be made less next time.", "the bananas are more popular than the sandwiches ||| so the [bananas] shouldn't be made less next time.", "the apples are sold more than the grapes ||| so the [apples] shouldn't be made less next time.", "the apples are sold more than the sandwiches ||| so the [apples] shouldn't be made less next time.", "the bananas are sold more than the grapes ||| so the [bananas] shouldn't be made less next time.", "the bananas are sold more than the sandwiches ||| so the [bananas] shouldn't be made less next time."], ["the apples are more popular than the grapes ||| so the [grapes] shouldn't be made more next time.", "the apples are more popular than the sandwiches ||| so the [sandwiches] shouldn't be made more next time.", "the bananas are more popular than the grapes ||| so the [grapes] shouldn't be made more next time.", "the bananas are more popular than the sandwiches ||| so the [sandwiches] shouldn't be made more next time.", "the apples are sold more than the grapes ||| so the [grapes] shouldn't be made more next time.", "the apples are sold more than the sandwiches ||| so the [sandwiches] shouldn't be made more next time.", "the bananas are sold more than the grapes ||| so the [grapes] shouldn't be made more next time.", "the bananas are sold more than the sandwiches ||| so the [sandwiches] shouldn't be made more next time."], ['the grapes lose to the apples ||| so the [apples] should be made more next time.', 'the sandwiches lose to the apples ||| so the [apples] should be made more next time.', 'the grapes lose to the bananas ||| so the [bananas] should be made more next time.', 'the sandwiches lose to the bananas ||| so the [bananas] should be made more next time.', 'the grapes are not as popular as the apples ||| so the [apples] should be made more next time.', 'the sandwiches are not as popular 
as the apples ||| so the [apples] should be made more next time.', 'the grapes are not as popular as the bananas ||| so the [bananas] should be made more next time.', 'the sandwiches are not as popular as the bananas ||| so the [bananas] should be made more next time.'], ['the grapes lose to the apples ||| so the [grapes] should be made less next time.', 'the sandwiches lose to the apples ||| so the [sandwiches] should be made less next time.', 'the grapes lose to the bananas ||| so the [grapes] should be made less next time.', 'the sandwiches lose to the bananas ||| so the [sandwiches] should be made less next time.', 'the grapes are not as popular as the apples ||| so the [grapes] should be made less next time.', 'the sandwiches are not as popular as the apples ||| so the [sandwiches] should be made less next time.', 'the grapes are not as popular as the bananas ||| so the [grapes] should be made less next time.', 'the sandwiches are not as popular as the bananas ||| so the [sandwiches] should be made less next time.'], ["the grapes lose to the apples ||| so the [apples] shouldn't be made less next time.", "the sandwiches lose to the apples ||| so the [apples] shouldn't be made less next time.", "the grapes lose to the bananas ||| so the [bananas] shouldn't be made less next time.", "the sandwiches lose to the bananas ||| so the [bananas] shouldn't be made less next time.", "the grapes are not as popular as the apples ||| so the [apples] shouldn't be made less next time.", "the sandwiches are not as popular as the apples ||| so the [apples] shouldn't be made less next time.", "the grapes are not as popular as the bananas ||| so the [bananas] shouldn't be made less next time.", "the sandwiches are not as popular as the bananas ||| so the [bananas] shouldn't be made less next time."], ["the grapes lose to the apples ||| so the [grapes] shouldn't be made more next time.", "the sandwiches lose to the apples ||| so the [sandwiches] shouldn't be made more next time.", 
"the grapes lose to the bananas ||| so the [grapes] shouldn't be made more next time.", "the sandwiches lose to the bananas ||| so the [sandwiches] shouldn't be made more next time.", "the grapes are not as popular as the apples ||| so the [grapes] shouldn't be made more next time.", "the sandwiches are not as popular as the apples ||| so the [sandwiches] shouldn't be made more next time.", "the grapes are not as popular as the bananas ||| so the [grapes] shouldn't be made more next time.", "the sandwiches are not as popular as the bananas ||| so the [sandwiches] shouldn't be made more next time."], ["the apples are less popular than the grapes ||| so the [apples] shouldn't be made more next time.", "the apples are less popular than the sandwiches ||| so the [apples] shouldn't be made more next time.", "the bananas are less popular than the grapes ||| so the [bananas] shouldn't be made more next time.", "the bananas are less popular than the sandwiches ||| so the [bananas] shouldn't be made more next time.", "the apples are sold less than the grapes ||| so the [apples] shouldn't be made more next time.", "the apples are sold less than the sandwiches ||| so the [apples] shouldn't be made more next time.", "the bananas are sold less than the grapes ||| so the [bananas] shouldn't be made more next time.", "the bananas are sold less than the sandwiches ||| so the [bananas] shouldn't be made more next time."], ["the apples are less popular than the grapes ||| so the [grapes] shouldn't be made less next time.", "the apples are less popular than the sandwiches ||| so the [sandwiches] shouldn't be made less next time.", "the bananas are less popular than the grapes ||| so the [grapes] shouldn't be made less next time.", "the bananas are less popular than the sandwiches ||| so the [sandwiches] shouldn't be made less next time.", "the apples are sold less than the grapes ||| so the [grapes] shouldn't be made less next time.", "the apples are sold less than the sandwiches ||| 
so the [sandwiches] shouldn't be made less next time.", "the bananas are sold less than the grapes ||| so the [grapes] shouldn't be made less next time.", "the bananas are sold less than the sandwiches ||| so the [sandwiches] shouldn't be made less next time."], ['the apples are less popular than the grapes ||| so the [apples] should be made less next time.', 'the apples are less popular than the sandwiches ||| so the [apples] should be made less next time.', 'the bananas are less popular than the grapes ||| so the [bananas] should be made less next time.', 'the bananas are less popular than the sandwiches ||| so the [bananas] should be made less next time.', 'the apples are sold less than the grapes ||| so the [apples] should be made less next time.', 'the apples are sold less than the sandwiches ||| so the [apples] should be made less next time.', 'the bananas are sold less than the grapes ||| so the [bananas] should be made less next time.', 'the bananas are sold less than the sandwiches ||| so the [bananas] should be made less next time.'], ['the apples are less popular than the grapes ||| so the [grapes] should be made more next time.', 'the apples are less popular than the sandwiches ||| so the [sandwiches] should be made more next time.', 'the bananas are less popular than the grapes ||| so the [grapes] should be made more next time.', 'the bananas are less popular than the sandwiches ||| so the [sandwiches] should be made more next time.', 'the apples are sold less than the grapes ||| so the [grapes] should be made more next time.', 'the apples are sold less than the sandwiches ||| so the [sandwiches] should be made more next time.', 'the bananas are sold less than the grapes ||| so the [grapes] should be made more next time.', 'the bananas are sold less than the sandwiches ||| so the [sandwiches] should be made more next time.'], ["the grapes don't lose to the apples ||| so the [apples] shouldn't be made more next time.", "the sandwiches don't lose to the 
apples ||| so the [apples] shouldn't be made more next time.", "the grapes don't lose to the bananas ||| so the [bananas] shouldn't be made more next time.", "the sandwiches don't lose to the bananas ||| so the [bananas] shouldn't be made more next time.", "the grapes are as popular as the apples ||| so the [apples] shouldn't be made more next time.", "the sandwiches are as popular as the apples ||| so the [apples] shouldn't be made more next time.", "the grapes are as popular as the bananas ||| so the [bananas] shouldn't be made more next time.", "the sandwiches are as popular as the bananas ||| so the [bananas] shouldn't be made more next time."], ["the grapes don't lose to the apples ||| so the [grapes] shouldn't be made less next time.", "the sandwiches don't lose to the apples ||| so the [sandwiches] shouldn't be made less next time.", "the grapes don't lose to the bananas ||| so the [grapes] shouldn't be made less next time.", "the sandwiches don't lose to the bananas ||| so the [sandwiches] shouldn't be made less next time.", "the grapes are as popular as the apples ||| so the [grapes] shouldn't be made less next time.", "the sandwiches are as popular as the apples ||| so the [sandwiches] shouldn't be made less next time.", "the grapes are as popular as the bananas ||| so the [grapes] shouldn't be made less next time.", "the sandwiches are as popular as the bananas ||| so the [sandwiches] shouldn't be made less next time."], ["the grapes don't lose to the apples ||| so the [apples] should be made less next time.", "the sandwiches don't lose to the apples ||| so the [apples] should be made less next time.", "the grapes don't lose to the bananas ||| so the [bananas] should be made less next time.", "the sandwiches don't lose to the bananas ||| so the [bananas] should be made less next time.", 'the grapes are as popular as the apples ||| so the [apples] should be made less next time.', 'the sandwiches are as popular as the apples ||| so the [apples] should be 
made less next time.', 'the grapes are as popular as the bananas ||| so the [bananas] should be made less next time.', 'the sandwiches are as popular as the bananas ||| so the [bananas] should be made less next time.'], ["the grapes don't lose to the apples ||| so the [grapes] should be made more next time.", "the sandwiches don't lose to the apples ||| so the [sandwiches] should be made more next time.", "the grapes don't lose to the bananas ||| so the [grapes] should be made more next time.", "the sandwiches don't lose to the bananas ||| so the [sandwiches] should be made more next time.", 'the grapes are as popular as the apples ||| so the [grapes] should be made more next time.', 'the sandwiches are as popular as the apples ||| so the [sandwiches] should be made more next time.', 'the grapes are as popular as the bananas ||| so the [grapes] should be made more next time.', 'the sandwiches are as popular as the bananas ||| so the [sandwiches] should be made more next time.'], ["the apples are more popular than the grapes ||| but the [apples] shouldn't be made more next time.", "the apples are more popular than the sandwiches ||| but the [apples] shouldn't be made more next time.", "the bananas are more popular than the grapes ||| but the [bananas] shouldn't be made more next time.", "the bananas are more popular than the sandwiches ||| but the [bananas] shouldn't be made more next time.", "the apples are sold more than the grapes ||| but the [apples] shouldn't be made more next time.", "the apples are sold more than the sandwiches ||| but the [apples] shouldn't be made more next time.", "the bananas are sold more than the grapes ||| but the [bananas] shouldn't be made more next time.", "the bananas are sold more than the sandwiches ||| but the [bananas] shouldn't be made more next time."], ["the apples are more popular than the grapes ||| but the [grapes] shouldn't be made less next time.", "the apples are more popular than the sandwiches ||| but the 
[sandwiches] shouldn't be made less next time.", "the bananas are more popular than the grapes ||| but the [grapes] shouldn't be made less next time.", "the bananas are more popular than the sandwiches ||| but the [sandwiches] shouldn't be made less next time.", "the apples are sold more than the grapes ||| but the [grapes] shouldn't be made less next time.", "the apples are sold more than the sandwiches ||| but the [sandwiches] shouldn't be made less next time.", "the bananas are sold more than the grapes ||| but the [grapes] shouldn't be made less next time.", "the bananas are sold more than the sandwiches ||| but the [sandwiches] shouldn't be made less next time."], ['the apples are more popular than the grapes ||| but the [apples] should be made less next time.', 'the apples are more popular than the sandwiches ||| but the [apples] should be made less next time.', 'the bananas are more popular than the grapes ||| but the [bananas] should be made less next time.', 'the bananas are more popular than the sandwiches ||| but the [bananas] should be made less next time.', 'the apples are sold more than the grapes ||| but the [apples] should be made less next time.', 'the apples are sold more than the sandwiches ||| but the [apples] should be made less next time.', 'the bananas are sold more than the grapes ||| but the [bananas] should be made less next time.', 'the bananas are sold more than the sandwiches ||| but the [bananas] should be made less next time.'], ['the apples are more popular than the grapes ||| but the [grapes] should be made more next time.', 'the apples are more popular than the sandwiches ||| but the [sandwiches] should be made more next time.', 'the bananas are more popular than the grapes ||| but the [grapes] should be made more next time.', 'the bananas are more popular than the sandwiches ||| but the [sandwiches] should be made more next time.', 'the apples are sold more than the grapes ||| but the [grapes] should be made more next time.', 'the 
apples are sold more than the sandwiches ||| but the [sandwiches] should be made more next time.', 'the bananas are sold more than the grapes ||| but the [grapes] should be made more next time.', 'the bananas are sold more than the sandwiches ||| but the [sandwiches] should be made more next time.'], ["the grapes lose to the apples ||| but the [apples] shouldn't be made more next time.", "the sandwiches lose to the apples ||| but the [apples] shouldn't be made more next time.", "the grapes lose to the bananas ||| but the [bananas] shouldn't be made more next time.", "the sandwiches lose to the bananas ||| but the [bananas] shouldn't be made more next time.", "the grapes are not as popular as the apples ||| but the [apples] shouldn't be made more next time.", "the sandwiches are not as popular as the apples ||| but the [apples] shouldn't be made more next time.", "the grapes are not as popular as the bananas ||| but the [bananas] shouldn't be made more next time.", "the sandwiches are not as popular as the bananas ||| but the [bananas] shouldn't be made more next time."], ["the grapes lose to the apples ||| but the [grapes] shouldn't be made less next time.", "the sandwiches lose to the apples ||| but the [sandwiches] shouldn't be made less next time.", "the grapes lose to the bananas ||| but the [grapes] shouldn't be made less next time.", "the sandwiches lose to the bananas ||| but the [sandwiches] shouldn't be made less next time.", "the grapes are not as popular as the apples ||| but the [grapes] shouldn't be made less next time.", "the sandwiches are not as popular as the apples ||| but the [sandwiches] shouldn't be made less next time.", "the grapes are not as popular as the bananas ||| but the [grapes] shouldn't be made less next time.", "the sandwiches are not as popular as the bananas ||| but the [sandwiches] shouldn't be made less next time."], ['the grapes lose to the apples ||| but the [apples] should be made less next time.', 'the sandwiches lose to the 
apples ||| but the [apples] should be made less next time.', 'the grapes lose to the bananas ||| but the [bananas] should be made less next time.', 'the sandwiches lose to the bananas ||| but the [bananas] should be made less next time.', 'the grapes are not as popular as the apples ||| but the [apples] should be made less next time.', 'the sandwiches are not as popular as the apples ||| but the [apples] should be made less next time.', 'the grapes are not as popular as the bananas ||| but the [bananas] should be made less next time.', 'the sandwiches are not as popular as the bananas ||| but the [bananas] should be made less next time.'], ['the grapes lose to the apples ||| but the [grapes] should be made more next time.', 'the sandwiches lose to the apples ||| but the [sandwiches] should be made more next time.', 'the grapes lose to the bananas ||| but the [grapes] should be made more next time.', 'the sandwiches lose to the bananas ||| but the [sandwiches] should be made more next time.', 'the grapes are not as popular as the apples ||| but the [grapes] should be made more next time.', 'the sandwiches are not as popular as the apples ||| but the [sandwiches] should be made more next time.', 'the grapes are not as popular as the bananas ||| but the [grapes] should be made more next time.', 'the sandwiches are not as popular as the bananas ||| but the [sandwiches] should be made more next time.'], ['the apples are less popular than the grapes ||| but the [apples] should be made more next time.', 'the apples are less popular than the sandwiches ||| but the [apples] should be made more next time.', 'the bananas are less popular than the grapes ||| but the [bananas] should be made more next time.', 'the bananas are less popular than the sandwiches ||| but the [bananas] should be made more next time.', 'the apples are sold less than the grapes ||| but the [apples] should be made more next time.', 'the apples are sold less than the sandwiches ||| but the [apples] 
should be made more next time.', 'the bananas are sold less than the grapes ||| but the [bananas] should be made more next time.', 'the bananas are sold less than the sandwiches ||| but the [bananas] should be made more next time.'], ['the apples are less popular than the grapes ||| but the [grapes] should be made less next time.', 'the apples are less popular than the sandwiches ||| but the [sandwiches] should be made less next time.', 'the bananas are less popular than the grapes ||| but the [grapes] should be made less next time.', 'the bananas are less popular than the sandwiches ||| but the [sandwiches] should be made less next time.', 'the apples are sold less than the grapes ||| but the [grapes] should be made less next time.', 'the apples are sold less than the sandwiches ||| but the [sandwiches] should be made less next time.', 'the bananas are sold less than the grapes ||| but the [grapes] should be made less next time.', 'the bananas are sold less than the sandwiches ||| but the [sandwiches] should be made less next time.'], ["the apples are less popular than the grapes ||| but the [apples] shouldn't be made less next time.", "the apples are less popular than the sandwiches ||| but the [apples] shouldn't be made less next time.", "the bananas are less popular than the grapes ||| but the [bananas] shouldn't be made less next time.", "the bananas are less popular than the sandwiches ||| but the [bananas] shouldn't be made less next time.", "the apples are sold less than the grapes ||| but the [apples] shouldn't be made less next time.", "the apples are sold less than the sandwiches ||| but the [apples] shouldn't be made less next time.", "the bananas are sold less than the grapes ||| but the [bananas] shouldn't be made less next time.", "the bananas are sold less than the sandwiches ||| but the [bananas] shouldn't be made less next time."], ["the apples are less popular than the grapes ||| but the [grapes] shouldn't be made more next time.", "the apples 
are less popular than the sandwiches ||| but the [sandwiches] shouldn't be made more next time.", "the bananas are less popular than the grapes ||| but the [grapes] shouldn't be made more next time.", "the bananas are less popular than the sandwiches ||| but the [sandwiches] shouldn't be made more next time.", "the apples are sold less than the grapes ||| but the [grapes] shouldn't be made more next time.", "the apples are sold less than the sandwiches ||| but the [sandwiches] shouldn't be made more next time.", "the bananas are sold less than the grapes ||| but the [grapes] shouldn't be made more next time.", "the bananas are sold less than the sandwiches ||| but the [sandwiches] shouldn't be made more next time."], ["the grapes don't lose to the apples ||| but the [apples] should be made more next time.", "the sandwiches don't lose to the apples ||| but the [apples] should be made more next time.", "the grapes don't lose to the bananas ||| but the [bananas] should be made more next time.", "the sandwiches don't lose to the bananas ||| but the [bananas] should be made more next time.", 'the grapes are as popular as the apples ||| but the [apples] should be made more next time.', 'the sandwiches are as popular as the apples ||| but the [apples] should be made more next time.', 'the grapes are as popular as the bananas ||| but the [bananas] should be made more next time.', 'the sandwiches are as popular as the bananas ||| but the [bananas] should be made more next time.'], ["the grapes don't lose to the apples ||| but the [grapes] should be made less next time.", "the sandwiches don't lose to the apples ||| but the [sandwiches] should be made less next time.", "the grapes don't lose to the bananas ||| but the [grapes] should be made less next time.", "the sandwiches don't lose to the bananas ||| but the [sandwiches] should be made less next time.", 'the grapes are as popular as the apples ||| but the [grapes] should be made less next time.', 'the sandwiches are as 
popular as the apples ||| but the [sandwiches] should be made less next time.', 'the grapes are as popular as the bananas ||| but the [grapes] should be made less next time.', 'the sandwiches are as popular as the bananas ||| but the [sandwiches] should be made less next time.'], ["the grapes don't lose to the apples ||| but the [apples] shouldn't be made less next time.", "the sandwiches don't lose to the apples ||| but the [apples] shouldn't be made less next time.", "the grapes don't lose to the bananas ||| but the [bananas] shouldn't be made less next time.", "the sandwiches don't lose to the bananas ||| but the [bananas] shouldn't be made less next time.", "the grapes are as popular as the apples ||| but the [apples] shouldn't be made less next time.", "the sandwiches are as popular as the apples ||| but the [apples] shouldn't be made less next time.", "the grapes are as popular as the bananas ||| but the [bananas] shouldn't be made less next time.", "the sandwiches are as popular as the bananas ||| but the [bananas] shouldn't be made less next time."], ["the grapes don't lose to the apples ||| but the [grapes] shouldn't be made more next time.", "the sandwiches don't lose to the apples ||| but the [sandwiches] shouldn't be made more next time.", "the grapes don't lose to the bananas ||| but the [grapes] shouldn't be made more next time.", "the sandwiches don't lose to the bananas ||| but the [sandwiches] shouldn't be made more next time.", "the grapes are as popular as the apples ||| but the [grapes] shouldn't be made more next time.", "the sandwiches are as popular as the apples ||| but the [sandwiches] shouldn't be made more next time.", "the grapes are as popular as the bananas ||| but the [grapes] shouldn't be made more next time.", "the sandwiches are as popular as the bananas ||| but the [sandwiches] shouldn't be made more next time."]], [("George gave the tickets of the play to Linda ||| because [George] wasn't interested in it.", "George sent the 
tickets of the play to Linda ||| because [George] wasn't interested in it."), ('George gave the tickets of the play to Linda ||| because [Linda] was eager to see it.', 'George sent the tickets of the play to Linda ||| because [Linda] was eager to see it.'), ("George gave the tickets of the play to Linda ||| because [George] wasn't eager to see it.", "George sent the tickets of the play to Linda ||| because [George] wasn't eager to see it."), ('George gave the tickets of the play to Linda ||| because [Linda] was interested in it.', 'George sent the tickets of the play to Linda ||| because [Linda] was interested in it.'), ("Linda received the tickets of the play from George ||| because [George] wasn't interested in it.", "Linda took the tickets of the play from George ||| because [George] wasn't interested in it."), ('Linda received the tickets of the play from George ||| because [Linda] was eager to see it.', 'Linda took the tickets of the play from George ||| because [Linda] was eager to see it.'), ("Linda received the tickets of the play from George ||| because [George] wasn't eager to see it.", "Linda took the tickets of the play from George ||| because [George] wasn't eager to see it."), ('Linda received the tickets of the play from George ||| because [Linda] was interested in it.', 'Linda took the tickets of the play from George ||| because [Linda] was interested in it.'), ("George didn't give the tickets of the play to Linda ||| because [George] was interested in it.", "George didn't send the tickets of the play to Linda ||| because [George] was interested in it."), ("George didn't give the tickets of the play to Linda ||| because [Linda] wasn't eager to see it.", "George didn't send the tickets of the play to Linda ||| because [Linda] wasn't eager to see it."), ("George didn't give the tickets of the play to Linda ||| because [George] was eager to see it.", "George didn't send the tickets of the play to Linda ||| because [George] was eager to see it."), 
("George didn't give the tickets of the play to Linda ||| because [Linda] wasn't interested in it.", "George didn't send the tickets of the play to Linda ||| because [Linda] wasn't interested in it."), ("Linda didn't receive the tickets of the play from George ||| because [George] was interested in it.", "Linda didn't take the tickets of the play from George ||| because [George] was interested in it."), ("Linda didn't receive the tickets of the play from George ||| because [Linda] wasn't eager to see it.", "Linda didn't take the tickets of the play from George ||| because [Linda] wasn't eager to see it."), ("Linda didn't receive the tickets of the play from George ||| because [George] was eager to see it.", "Linda didn't take the tickets of the play from George ||| because [George] was eager to see it."), ("Linda didn't receive the tickets of the play from George ||| because [Linda] wasn't interested in it.", "Linda didn't take the tickets of the play from George ||| because [Linda] wasn't interested in it."), ('George gave the tickets of the play to Linda ||| although [George] was interested in it.', 'George sent the tickets of the play to Linda ||| although [George] was interested in it.'), ("George gave the tickets of the play to Linda ||| although [Linda] wasn't eager to see it.", "George sent the tickets of the play to Linda ||| although [Linda] wasn't eager to see it."), ('George gave the tickets of the play to Linda ||| although [George] was eager to see it.', 'George sent the tickets of the play to Linda ||| although [George] was eager to see it.'), ("George gave the tickets of the play to Linda ||| although [Linda] wasn't interested in it.", "George sent the tickets of the play to Linda ||| although [Linda] wasn't interested in it."), ('Linda received the tickets of the play from George ||| although [George] was interested in it.', 'Linda took the tickets of the play from George ||| although [George] was interested in it.'), ("Linda received the tickets of 
the play from George ||| although [Linda] wasn't eager to see it.", "Linda took the tickets of the play from George ||| although [Linda] wasn't eager to see it."), ('Linda received the tickets of the play from George ||| although [George] was eager to see it.', 'Linda took the tickets of the play from George ||| although [George] was eager to see it.'), ("Linda received the tickets of the play from George ||| although [Linda] wasn't interested in it.", "Linda took the tickets of the play from George ||| although [Linda] wasn't interested in it."), ("George didn't give the tickets of the play to Linda ||| although [George] wasn't interested in it.", "George didn't send the tickets of the play to Linda ||| although [George] wasn't interested in it."), ("George didn't give the tickets of the play to Linda ||| although [Linda] was eager to see it.", "George didn't send the tickets of the play to Linda ||| although [Linda] was eager to see it."), ("George didn't give the tickets of the play to Linda ||| although [George] wasn't eager to see it.", "George didn't send the tickets of the play to Linda ||| although [George] wasn't eager to see it."), ("George didn't give the tickets of the play to Linda ||| although [Linda] was interested in it.", "George didn't send the tickets of the play to Linda ||| although [Linda] was interested in it."), ("Linda didn't receive the tickets of the play from George ||| although [George] wasn't interested in it.", "Linda didn't take the tickets of the play from George ||| although [George] wasn't interested in it."), ("Linda didn't receive the tickets of the play from George ||| although [Linda] was eager to see it.", "Linda didn't take the tickets of the play from George ||| although [Linda] was eager to see it."), ("Linda didn't receive the tickets of the play from George ||| although [George] wasn't eager to see it.", "Linda didn't take the tickets of the play from George ||| although [George] wasn't eager to see it."), ("Linda didn't 
receive the tickets of the play from George ||| although [Linda] was interested in it.", "Linda didn't take the tickets of the play from George ||| although [Linda] was interested in it.")], [('Peter envied Mandy ||| because [Peter] failed.', 'Peter was jealous of Mandy ||| because [Peter] failed.'), ('Peter envied Mandy ||| because [Mandy] was successful.', 'Peter was jealous of Mandy ||| because [Mandy] was successful.'), ("Peter envied Mandy ||| because [Peter] wasn't successful.", "Peter was jealous of Mandy ||| because [Peter] wasn't successful."), ("Peter envied Mandy ||| because [Mandy] didn't fail.", "Peter was jealous of Mandy ||| because [Mandy] didn't fail."), ('Mandy was envied by Peter ||| because [Peter] failed.', 'Mandy was admired by Peter ||| because [Peter] failed.'), ('Mandy was envied by Peter ||| because [Mandy] was successful.', 'Mandy was admired by Peter ||| because [Mandy] was successful.'), ("Mandy was envied by Peter ||| because [Peter] wasn't successful.", "Mandy was admired by Peter ||| because [Peter] wasn't successful."), ("Mandy was envied by Peter ||| because [Mandy] didn't fail.", "Mandy was admired by Peter ||| because [Mandy] didn't fail."), ("Peter didn't envy Mandy ||| because [Peter] didn't fail.", "Peter wasn't jealous of Mandy ||| because [Peter] didn't fail."), ("Peter didn't envy Mandy ||| because [Mandy] wasn't successful.", "Peter wasn't jealous of Mandy ||| because [Mandy] wasn't successful."), ("Peter didn't envy Mandy ||| because [Peter] was successful.", "Peter wasn't jealous of Mandy ||| because [Peter] was successful."), ("Peter didn't envy Mandy ||| because [Mandy] failed.", "Peter wasn't jealous of Mandy ||| because [Mandy] failed."), ("Mandy wasn't envied by Peter ||| because [Peter] didn't fail.", "Mandy wasn't admired by Peter ||| because [Peter] didn't fail."), ("Mandy wasn't envied by Peter ||| because [Mandy] wasn't successful.", "Mandy wasn't admired by Peter ||| because [Mandy] wasn't successful."), 
("Mandy wasn't envied by Peter ||| because [Peter] was successful.", "Mandy wasn't admired by Peter ||| because [Peter] was successful."), ("Mandy wasn't envied by Peter ||| because [Mandy] failed.", "Mandy wasn't admired by Peter ||| because [Mandy] failed."), ("Peter envied Mandy ||| although [Peter] didn't fail.", "Peter was jealous of Mandy ||| although [Peter] didn't fail."), ("Peter envied Mandy ||| although [Mandy] wasn't successful.", "Peter was jealous of Mandy ||| although [Mandy] wasn't successful."), ('Peter envied Mandy ||| although [Peter] was successful.', 'Peter was jealous of Mandy ||| although [Peter] was successful.'), ('Peter envied Mandy ||| although [Mandy] failed.', 'Peter was jealous of Mandy ||| although [Mandy] failed.'), ("Mandy was envied by Peter ||| although [Peter] didn't fail.", "Mandy was admired by Peter ||| although [Peter] didn't fail."), ("Mandy was envied by Peter ||| although [Mandy] wasn't successful.", "Mandy was admired by Peter ||| although [Mandy] wasn't successful."), ('Mandy was envied by Peter ||| although [Peter] was successful.', 'Mandy was admired by Peter ||| although [Peter] was successful.'), ('Mandy was envied by Peter ||| although [Mandy] failed.', 'Mandy was admired by Peter ||| although [Mandy] failed.'), ("Peter didn't envy Mandy ||| although [Peter] failed.", "Peter wasn't jealous of Mandy ||| although [Peter] failed."), ("Peter didn't envy Mandy ||| although [Mandy] was successful.", "Peter wasn't jealous of Mandy ||| although [Mandy] was successful."), ("Peter didn't envy Mandy ||| although [Peter] wasn't successful.", "Peter wasn't jealous of Mandy ||| although [Peter] wasn't successful."), ("Peter didn't envy Mandy ||| although [Mandy] didn't fail.", "Peter wasn't jealous of Mandy ||| although [Mandy] didn't fail."), ("Mandy wasn't envied by Peter ||| although [Peter] failed.", "Mandy wasn't admired by Peter ||| although [Peter] failed."), ("Mandy wasn't envied by Peter ||| although [Mandy] was 
successful.", "Mandy wasn't admired by Peter ||| although [Mandy] was successful."), ("Mandy wasn't envied by Peter ||| although [Peter] wasn't successful.", "Mandy wasn't admired by Peter ||| although [Peter] wasn't successful."), ("Mandy wasn't envied by Peter ||| although [Mandy] didn't fail.", "Mandy wasn't admired by Peter ||| although [Mandy] didn't fail.")], ['James ceded the presidency to Amy ||| although [James] was not notorious.', "James ceded the presidency to Amy ||| although [Amy] wasn't popular.", 'James ceded the presidency to Amy ||| although [James] was popular.', 'James ceded the presidency to Amy ||| although [Amy] was notorious.', 'Amy took over the presidency from James ||| although [James] was not notorious.', "Amy took over the presidency from James ||| although [Amy] wasn't popular.", 'Amy took over the presidency from James ||| although [James] was popular.', 'Amy took over the presidency from James ||| although [Amy] was notorious.', "James didn't cede the presidency to Amy ||| although [James] was notorious.", "James didn't cede the presidency to Amy ||| although [Amy] was popular.", "James didn't cede the presidency to Amy ||| although [James] wasn't popular.", "James didn't cede the presidency to Amy ||| although [Amy] was not notorious.", "Amy didn't take over the presidency from James ||| although [James] was notorious.", "Amy didn't take over the presidency from James ||| although [Amy] was popular.", "Amy didn't take over the presidency from James ||| although [James] wasn't popular.", "Amy didn't take over the presidency from James ||| although [Amy] was not notorious."]]Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", 
line 91, in + child_dataset = CHILDDataset(tokenizer, sentences, dev_percent=args.dev_percent) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 68, in __init__ + t1, t2, is_next_label = self.split_sent(line) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 128, in split_sent + t1, t2 = line.strip(), None +AttributeError: 'list' object has no attribute 'strip' + +06/09/2019 22:23:17 - ERROR - pytorch_pretrained_bert.tokenization - Model name '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt' was not found in model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese). We assumed '/nas/pretrain-bert/pretrain-tensorflow/uncased_L-12_H-768_A-12/vocab.txt' was a path or url but couldn't find any file associated to this path or url. +06/09/2019 22:23:17 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 88, in + sentences += make_sentences(**frame)[-1] + File "/home/xd/projects/pytorch-pretrained-BERT/child_generator.py", line 53, in make_sentences + assert entities[0].lower() in tokenizer.vocab , entities[0] +AttributeError: 'NoneType' object has no attribute 'vocab' +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. 
+Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +06/09/2019 22:24:05 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:24:05 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 88, in + sentences += make_sentences(**frame)[-1] + File "/home/xd/projects/pytorch-pretrained-BERT/child_generator.py", line 80, in make_sentences + B_template = B_template[int(prepositive_pred)] +UnboundLocalError: local variable 'B_template' referenced before assignment +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. 
+Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +06/09/2019 22:28:12 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:28:12 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:28:12 - INFO - run_child_finetuning - num_sent = 15840 +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 91, in + child_dataset = CHILDDataset(tokenizer, sentences, dev_percent=args.dev_percent) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 88, in __init__ + self.features = [convert_example_to_features(example, self.seq_len, self.tokenizer) for example in self.examples] + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 88, in + self.features = [convert_example_to_features(example, self.seq_len, self.tokenizer) for example in self.examples] + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 204, in convert_example_to_features + assert len(input_ids) == max_seq_length +AssertionError +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. +Warning: apex was installed without --cuda_ext. 
FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +['the students were not more in number than the sponsors ||| so the [students] were in the minority.', "Tom dosen't look after Betty ||| because [Betty] is older.", "the ball can't be put into the bag ||| although the [bag] isn't small.", "Amy didn't take over the presidency from Jack ||| although [Amy] was not notorious.", "Jack didn't cede the presidency to Donna ||| because [Donna] was notorious.", "Susan was fooled by Sam ||| but [Sam] didn't get a lot of money.", "Wendy is imitated by George in everything ||| although [Wendy] isn't good at making decisions.", "the apples are sold more than the grapes ||| so the [apples] shouldn't be made less next time.", "Donna didn't obstruct the sight of Edward ||| although [Edward] isn't tall.", "Michael didn't subsidize Mary ||| because [Michael] was poor.", "Lucy didn't perform better than Andy on the test ||| although [Andy] was lazy in doing homework.", "Mary wasn't appreciated by Michael ||| because [Michael] hadn't received a lot of help.", 'Linda was defeated by Michael in the game ||| but [Michael] was sad.', "Caroline was replaced by Amy as the actress's new name ||| although [Caroline] is easy to pronounce.", "the bag of flour hadn't been put below the bag of candy ||| so the bag of [candy] couldn't be moved first.", 'the apples are less popular than the sandwiches ||| but the [apples] should be made more next time.', "the sandwiches don't lose to the apples ||| so the [apples] should be made less next time.", 'Running at about the same speed, Sally lost 
to Tom in the running race ||| although [Sally] had a good start.', "the boss didn't arrive before the employee ||| because the [employee] didn't come from far away.", "Charles can't defeat Cindy at tennis ||| although [Cindy] is younger."] +06/09/2019 22:36:18 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:36:18 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:36:18 - INFO - run_child_finetuning - num_sent = 15840 +Traceback (most recent call last): + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 183, in _run_module_as_main + mod_name, mod_spec, code = _get_module_details(mod_name, _Error) + File "/home/qsj/miniconda3/lib/python3.6/runpy.py", line 109, in _get_module_details + __import__(pkg_name) + File "/home/xd/projects/pytorch-pretrained-BERT/train_child.py", line 91, in + child_dataset = CHILDDataset(tokenizer, sentences, dev_percent=args.dev_percent) + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 88, in __init__ + self.features = [convert_example_to_features(example, self.seq_len, self.tokenizer) for example in self.examples] + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 88, in + self.features = [convert_example_to_features(example, self.seq_len, self.tokenizer) for example in self.examples] + File "/home/xd/projects/pytorch-pretrained-BERT/run_child_finetuning.py", line 204, in convert_example_to_features + assert len(input_ids) == max_seq_length, '%d != %d' % (len(input_ids), max_seq_length) +AssertionError: 29 != 28 +Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten. +Warning: apex was installed without --cuda_ext. Fused syncbn kernels will be unavailable. Python fallbacks will be used instead. 
+Warning: apex was installed without --cuda_ext. FusedAdam will be unavailable. +Warning: apex was installed without --cuda_ext. FusedLayerNorm will be unavailable. +Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex. +Namespace(dev_percent=0.3, do_eval=True, do_lower_case=True, do_train=True, eval_batch_size=128, gradient_accumulation_steps=1, learning_rate=3e-05, max_seq_length=128, no_cuda=False, num_train_epochs=6.0, seed=42, train_batch_size=32, warmup_proportion=0.1) +["Running at about the same speed, Susan lost to David in the running race ||| because [Susan] didn't have a good start.", 'Charles cast the schoolbag up to Emma ||| before [Emma] reached the bottom of the stairs.', "the sandwiches lose to the apples ||| but the [sandwiches] shouldn't be made less next time.", 'the bucket was suffused with water from the cup ||| before the [cup] was empty.', "Jack dosen't take care of Betty ||| because [Jack] isn't older.", 'the cup was suffused with water from the bowl ||| after the [bowl] was full.', "Lucy wasn't fooled by Bush ||| so [Lucy] didn't lose a lot of money.", 'the trophy can fit into the box ||| because the [trophy] is small.', "Mary was told by Michael what time the library closes ||| although [Michael] didn't remember.", "Robert didn't cede the presidency to Donna ||| because [Robert] was popular.", 'the cup dripped water into the bowl ||| after the [cup] was full.', "Donna didn't get the presidency from Jack ||| because [Donna] wasn't popular.", "Betty didn't block the view of John ||| because [Betty] isn't tall.", "Susan didn't receive a lot of money from John ||| because [Susan] wasn't poor.", "Eric didn't send the tickets of the play to Linda ||| although [Linda] was eager to see it.", 'the bottle dripped water into the tube ||| before the [bottle] was empty.', "Running at about the same speed, Sally wasn't defeated by John in the running race ||| although [Sally] didn't have a good start.", "the bicycle 
was left behind the ambulance ||| because the [ambulance] wasn't going slow.", 'the pictures could be placed on all the chairs ||| although there were few of the [pictures].', "Tom always looks after Betty ||| although [Tom] isn't older."] +06/09/2019 22:42:30 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:42:30 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 22:42:30 - INFO - run_child_finetuning - num_sent = 15840 +06/09/2019 22:42:34 - INFO - run_child_finetuning - num_train_steps = 2079 +06/09/2019 22:42:37 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 22:42:37 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 22:42:37 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 22:42:40 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 22:42:45 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 22:42:45 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 22:42:45 - INFO - run_child_finetuning - Evaluating on valid set... 
+ Epoch: 0%| | 0/6 [00:00 7680 +num_train_steps = 12000 +global_step 0, lr = 0.000000 +global_step 1000, lr = 0.000417 +global_step 2000, lr = 0.000417 +global_step 3000, lr = 0.000375 +global_step 4000, lr = 0.000333 +global_step 5000, lr = 0.000292 +global_step 6000, lr = 0.000250 +global_step 7000, lr = 0.000208 +global_step 8000, lr = 0.000167 +global_step 9000, lr = 0.000125 +global_step 10000, lr = 0.000083 +global_step 11000, lr = 0.000042 diff --git a/train_child_yesnoonly_maybe0_bert.out b/train_child_yesnoonly_maybe0_bert.out new file mode 100644 index 00000000000000..3e09894c9f001a --- /dev/null +++ b/train_child_yesnoonly_maybe0_bert.out @@ -0,0 +1,41 @@ +06/09/2019 16:56:11 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 16:56:11 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 16:56:16 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 16:56:16 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 16:56:16 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 16:56:19 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 16:56:24 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 16:56:24 - INFO - run_child_finetuning - Evaluating on train set... 
+06/09/2019 16:56:24 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/3 [00:00 11520 +num_train_steps = 720 +global_step 0, lr = 0.000000 +06/09/2019 17:05:29 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 17:05:30 - INFO - pytorch_pretrained_bert.tokenization - loading vocabulary file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased-vocab.txt +06/09/2019 17:05:38 - INFO - run_child_finetuning - device: cuda n_gpu: 1 +06/09/2019 17:05:38 - INFO - pytorch_pretrained_bert.modeling - loading archive file /nas/pretrain-bert/pretrain-pytorch/bert-base-uncased +06/09/2019 17:05:38 - INFO - pytorch_pretrained_bert.modeling - Model config { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} + +06/09/2019 17:05:41 - INFO - pytorch_pretrained_bert.modeling - Weights from pretrained model not used in BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias'] +06/09/2019 17:05:45 - INFO - run_child_finetuning - Epoch 0 +06/09/2019 17:05:45 - INFO - run_child_finetuning - Evaluating on train set... +06/09/2019 17:05:45 - INFO - run_child_finetuning - Evaluating on valid set... + Epoch: 0%| | 0/6 [00:00