-
Notifications
You must be signed in to change notification settings - Fork 2
/
5. Categorizing and Tagging Words.html
3164 lines (3091 loc) · 216 KB
/
5. Categorizing and Tagging Words.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!--?xml version="1.0" encoding="ascii" ?-->
<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" lang="en"><head><script type="text/javascript">
function astext(node)
{
// Return the plain-text content of a node: strip all HTML tags from its
// innerHTML, then decode the common character entities.
// BUGFIX: the entity patterns had been corrupted by an HTML-decoding pass
// (e.g. /&gt;/ had become />/, turning each replacement into a no-op);
// the original entity forms are restored here.
// &amp; must be decoded LAST so decoding it cannot fabricate new entities.
return node.innerHTML.replace(/(<([^>]+)>)/ig,"")
                     .replace(/&gt;/ig, ">")
                     .replace(/&lt;/ig, "<")
                     .replace(/&quot;/ig, '"')
                     .replace(/&amp;/ig, "&");
}
function copy_notify(node, bar_color, data)
{
// Briefly show a "Copied to the clipboard" popup over `node`, and flash
// the copy bar (the first child of node's parent) red before restoring
// it to `bar_color`.
// Outer wrapper: relative + inline positioning.
var outer = document.createElement("div");
outer.style.position = "relative";
outer.style.display = "inline";
outer.style.top = "2em";
outer.style.left = "1em";
// Dark drop-shadow layer behind the message box.
var drop = document.createElement("div");
drop.style.position = "absolute";
drop.style.left = "-1.3em";
drop.style.top = "-1.3em";
drop.style.background = "#404040";
// The visible message box itself.
var msgbox = document.createElement("div");
msgbox.style.position = "relative";
msgbox.style.border = "1px solid #a0a0a0";
msgbox.style.left = "-.2em";
msgbox.style.top = "-.2em";
msgbox.style.background = "white";
msgbox.style.padding = ".3em .4em .3em .4em";
msgbox.style.fontStyle = "normal";
msgbox.style.background = "#f0e0e0";
node.insertBefore(outer, node.childNodes.item(0));
outer.appendChild(drop);
drop.appendChild(msgbox);
msgbox.innerHTML="Copied to the clipboard: " +
"<pre class='copy-notify'>"+
data+"</pre>";
// Tear the popup down after one second.
setTimeout(function() { node.removeChild(outer); }, 1000);
// Flash the copy bar, then restore its normal colour shortly after.
var bar = node.parentNode.firstChild;
bar.style.background = "#ffc0c0";
setTimeout(function() { bar.style.background = bar_color; }, 200);
}
function copy_codeblock_to_clipboard(node)
{
// Copy an entire code listing (as newline-terminated plain text) to the
// clipboard; notify the user only when the copy actually succeeded.
var payload = astext(node) + "\n";
if (!copy_text_to_clipboard(payload)) return;
copy_notify(node, "#40a060", payload);
}
function copy_doctest_to_clipboard(node)
{
// Copy just the Python input lines of a doctest listing to the
// clipboard: keep lines prefixed ">>> " or "... " (with the 4-char
// prompt stripped, newline retained), drop all output lines.
var text = astext(node) + "\n ";
var lines = text.split("\n");
lines.pop();  // discard the fragment after the final newline
var data = "";
for (var i = 0; i < lines.length; i++) {
    var prompt = lines[i].substring(0, 4);
    if (prompt == ">>> " || prompt == "... ") {
        data += lines[i].substring(4) + "\n";
    }
}
if (copy_text_to_clipboard(data)) {
    copy_notify(node, "#4060a0", data);
}
}
function copy_text_to_clipboard(data)
{
// Copy `data` to the system clipboard via whichever legacy browser API
// is available.  Returns true on success, false when no usable
// clipboard API exists or a component could not be created.
if (window.clipboardData) {
    // IE clipboard API.
    window.clipboardData.setData("Text", data);
    return true;
}
else if (window.netscape) {
    // Old Mozilla XPCOM clipboard.  With default Firefox settings,
    // this privilege request will be denied (and throw):
    netscape.security.PrivilegeManager
        .enablePrivilege("UniversalXPConnect");
    var clip = Components.classes["@mozilla.org/widget/clipboard;1"]
        .createInstance(Components.interfaces.nsIClipboard);
    // BUGFIX: these early exits used a bare `return;` (undefined)
    // although callers treat the result as a boolean.
    if (!clip) return false;
    var trans = Components.classes["@mozilla.org/widget/transferable;1"]
        .createInstance(Components.interfaces.nsITransferable);
    if (!trans) return false;
    trans.addDataFlavor("text/unicode");
    // (Removed a duplicate `var str` declaration and a redundant
    // second `if (!clip)` check that followed it.)
    var str = Components.classes["@mozilla.org/supports-string;1"]
        .createInstance(Components.interfaces.nsISupportsString);
    str.data = data;
    // Length is in bytes: nsISupportsString is UTF-16, 2 bytes/char.
    trans.setTransferData("text/unicode", str, data.length*2);
    clip.setData(trans, null,
                 Components.interfaces.nsIClipboard.kGlobalClipboard);
    return true;
}
return false;
}
//-->
</script>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="generator" content="Docutils 0.12: http://docutils.sourceforge.net/">
<title>5. Categorizing and Tagging Words</title>
<style type="text/css">
/*
:Author: Edward Loper, James Curran
:Copyright: This stylesheet has been placed in the public domain.
Stylesheet for use with Docutils.
This stylesheet defines new css classes used by NLTK.
It uses a Python syntax highlighting scheme that matches
the colour scheme used by IDLE, which makes it easier for
beginners to check they are typing things in correctly.
*/
/* Include the standard docutils stylesheet. */
@import url(default.css);
/* Custom inline roles */
span.placeholder { font-style: italic; font-family: monospace; }
span.example { font-style: italic; }
span.emphasis { font-style: italic; }
span.termdef { font-weight: bold; }
/*span.term { font-style: italic; }*/
span.category { font-variant: small-caps; }
span.feature { font-variant: small-caps; }
span.fval { font-style: italic; }
span.math { font-style: italic; }
span.mathit { font-style: italic; }
span.lex { font-variant: small-caps; }
span.guide-linecount{ text-align: right; display: block;}
/* Python souce code listings */
span.pysrc-prompt { color: #9b0000; }
span.pysrc-more { color: #9b00ff; }
span.pysrc-keyword { color: #e06000; }
span.pysrc-builtin { color: #940094; }
span.pysrc-string { color: #00aa00; }
span.pysrc-comment { color: #ff0000; }
span.pysrc-output { color: #0000ff; }
span.pysrc-except { color: #ff0000; }
span.pysrc-defname { color: #008080; }
/* Doctest blocks */
pre.doctest { margin: 0; padding: 0; font-weight: bold; }
div.doctest { margin: 0 1em 1em 1em; padding: 0; }
table.doctest { margin: 0; padding: 0;
border-top: 1px solid gray;
border-bottom: 1px solid gray; }
pre.copy-notify { margin: 0; padding: 0.2em; font-weight: bold;
background-color: #ffffff; }
/* Python source listings */
div.pylisting { margin: 0 1em 1em 1em; padding: 0; }
table.pylisting { margin: 0; padding: 0;
border-top: 1px solid gray; }
td.caption { border-top: 1px solid black; margin: 0; padding: 0; }
.caption-label { font-weight: bold; }
td.caption p { margin: 0; padding: 0; font-style: normal;}
table tr td.codeblock {
padding: 0.2em ! important; margin: 0;
border-left: 1px solid gray;
border-right: 2px solid gray;
border-top: 0px solid gray;
border-bottom: 1px solid gray;
font-weight: bold; background-color: #eeffee;
}
table tr td.doctest {
padding: 0.2em; margin: 0;
border-left: 1px solid gray;
border-right: 2px solid gray;
border-top: 0px solid gray;
border-bottom: 1px solid gray;
font-weight: bold; background-color: #eeeeff;
}
td.codeblock table tr td.copybar {
background: #40a060; border: 1px solid gray;
font-family: monospace; padding: 0; margin: 0; }
td.doctest table tr td.copybar {
background: #4060a0; border: 1px solid gray;
font-family: monospace; padding: 0; margin: 0; }
td.pysrc { padding-left: 0.5em; }
img.callout { border-width: 0px; }
table.docutils {
border-style: solid;
border-width: 1px;
margin-top: 6px;
border-color: grey;
border-collapse: collapse; }
table.docutils th {
border-style: none;
border-width: 1px;
border-color: grey;
padding: 0 .5em 0 .5em; }
table.docutils td {
border-style: none;
border-width: 1px;
border-color: grey;
padding: 0 .5em 0 .5em; }
table.footnote td { padding: 0; }
table.footnote { border-width: 0; }
table.footnote td { border-width: 0; }
table.footnote th { border-width: 0; }
table.noborder { border-width: 0; }
table.example pre { margin-top: 4px; margin-bottom: 0; }
/* For figures & tables */
p.caption { margin-bottom: 0; }
div.figure { text-align: center; }
/* The index */
div.index { border: 1px solid black;
background-color: #eeeeee; }
div.index h1 { padding-left: 0.5em; margin-top: 0.5ex;
border-bottom: 1px solid black; }
ul.index { margin-left: 0.5em; padding-left: 0; }
li.index { list-style-type: none; }
p.index-heading { font-size: 120%; font-style: italic; margin: 0; }
li.index ul { margin-left: 2em; padding-left: 0; }
/* 'Note' callouts */
div.note
{
border-right: #87ceeb 1px solid;
padding-right: 4px;
border-top: #87ceeb 1px solid;
padding-left: 4px;
padding-bottom: 4px;
margin: 2px 5% 10px;
border-left: #87ceeb 1px solid;
padding-top: 4px;
border-bottom: #87ceeb 1px solid;
font-style: normal;
font-family: verdana, arial;
background-color: #b0c4de;
}
table.avm { border: 0px solid black; width: 0; }
table.avm tbody tr {border: 0px solid black; }
table.avm tbody tr td { padding: 2px; }
table.avm tbody tr td.avm-key { padding: 5px; font-variant: small-caps; }
table.avm tbody tr td.avm-eq { padding: 5px; }
table.avm tbody tr td.avm-val { padding: 5px; font-style: italic; }
p.avm-empty { font-style: normal; }
table.avm colgroup col { border: 0px solid black; }
table.avm tbody tr td.avm-topleft
{ border-left: 2px solid #000080; border-top: 2px solid #000080; }
table.avm tbody tr td.avm-botleft
{ border-left: 2px solid #000080; border-bottom: 2px solid #000080; }
table.avm tbody tr td.avm-topright
{ border-right: 2px solid #000080; border-top: 2px solid #000080; }
table.avm tbody tr td.avm-botright
{ border-right: 2px solid #000080; border-bottom: 2px solid #000080; }
table.avm tbody tr td.avm-left
{ border-left: 2px solid #000080; }
table.avm tbody tr td.avm-right
{ border-right: 2px solid #000080; }
table.avm tbody tr td.avm-topbotleft
{ border: 2px solid #000080; border-right: 0px solid black; }
table.avm tbody tr td.avm-topbotright
{ border: 2px solid #000080; border-left: 0px solid black; }
table.avm tbody tr td.avm-ident
{ font-size: 80%; padding: 0; padding-left: 2px; vertical-align: top; }
.avm-pointer
{ border: 1px solid #008000; padding: 1px; color: #008000;
background: #c0ffc0; font-style: normal; }
table.gloss { border: 0px solid black; width: 0; }
table.gloss tbody tr { border: 0px solid black; }
table.gloss tbody tr td { border: 0px solid black; }
table.gloss colgroup col { border: 0px solid black; }
table.gloss p { margin: 0; padding: 0; }
table.rst-example { border: 1px solid black; }
table.rst-example tbody tr td { background: #eeeeee; }
table.rst-example thead tr th { background: #c0ffff; }
td.rst-raw { width: 0; }
/* Used by nltk.org/doc/test: */
div.doctest-list { text-align: center; }
table.doctest-list { border: 1px solid black;
margin-left: auto; margin-right: auto;
}
table.doctest-list tbody tr td { background: #eeeeee;
border: 1px solid #cccccc; text-align: left; }
table.doctest-list thead tr th { background: #304050; color: #ffffff;
border: 1px solid #000000;}
table.doctest-list thead tr a { color: #ffffff; }
span.doctest-passed { color: #008000; }
span.doctest-failed { color: #800000; }
</style>
</head>
<body>
<div class="document" id="categorizing-and-tagging-words">
<span id="chap-tag"></span>
<h1 class="title">5. Categorizing and Tagging Words</h1>
<!-- -*- mode: rst -*- -->
<!-- -*- mode: rst -*- -->
<!-- CAP abbreviations (map to small caps in LaTeX) -->
<!-- Other candidates for global consistency -->
<!-- PTB removed since it must be indexed -->
<!-- WN removed since it must be indexed -->
<!-- misc & punctuation -->
<!-- cdots was unicode U+22EF but not working -->
<!-- exercise meta-tags -->
<!-- Unicode tests -->
<!-- phonetic -->
<!-- misc -->
<!-- used in Unicode section -->
<!-- arrows -->
<!-- unification stuff -->
<!-- Math & Logic -->
<!-- sets -->
<!-- Greek -->
<!-- Chinese -->
<!-- URLs -->
<!-- Python example - a snippet of code in running text -->
<!-- PlaceHolder example - something that should be replaced by actual code -->
<!-- Linguistic eXample - cited form in running text -->
<!-- Emphasized (more declarative than just using *) -->
<!-- Grammatical Category - e.g. NP and verb as technical terms
.. role:: gc
:class: category -->
<!-- Math expression - e.g. especially for variables -->
<!-- Textual Math expression - for words 'inside' a math environment -->
<!-- Feature (or attribute) -->
<!-- Raw LaTeX -->
<!-- Raw HTML -->
<!-- Feature-value -->
<!-- Lexemes -->
<!-- Replacements that rely on previous definitions :-) -->
<!-- standard global imports
>>> import nltk, re, pprint
>>> from nltk import word_tokenize -->
<!-- TODO: exercise on cascaded tagging -->
<!-- TODO: motivate trigram tagging by showing some cases where bigram tagging doesn't work -->
<!-- TODO: xref to unicode section in prog chapter -->
<!-- TODO: * outstanding problems:
- what are we doing with ConditionalFreqDist?
- nltk.tag contains all of math library
- nltk.corpus.brown.tagged_sents() is too verbose? -->
<!-- TODO: type conversions: ``str()``, ``int()``, ``list()``. -->
<!-- TODO: tagging for language analysis: find all pairs of nouns which occur in the same sentence -->
<!-- TODO: possibly add section on exploring tagged corpora -->
<!-- TODO: add back in short section on Brill and HMM tagging -->
<!-- TODO: how to tag unknown words -->
<!-- TODO: how POS tagging disambiguates the word "like" and this can be
useful for sentiment detection -->
<!-- TODO: classification of unknown words using string patterns. -->
<p>Back in elementary school you learnt the difference between nouns, verbs,
adjectives, and adverbs. These "word classes" are not just
the idle invention of grammarians, but are useful categories for many
language processing tasks. As we will see, they arise from simple analysis
of the distribution of words in text. The goal of this chapter is to
answer the following questions:</p>
<ol class="arabic simple">
<li>What are lexical categories and how are they used in natural language processing?</li>
<li>What is a good Python data structure for storing words and their categories?</li>
<li>How can we automatically tag each word of a text with its word class?</li>
</ol>
<p>Along the way, we'll cover some fundamental techniques in NLP, including
sequence labeling, n-gram models, backoff, and evaluation. These techniques
are useful in many areas, and tagging gives us a simple context in which
to present them. We will also see how tagging is the second step in the typical
NLP pipeline, following tokenization.</p>
<p>The process of classifying words into their <a name="parts_of_speech_index_term"><span class="termdef">parts of speech</span> and
labeling them accordingly is known as </a><a name="part_of_speech_tagging_index_term"><span class="termdef">part-of-speech tagging</span>,
</a><a name="pos_tagging_index_term"><span class="termdef">POS-tagging</span>, or simply </a><a name="tagging_index_term"><span class="termdef">tagging</span>. Parts of speech
are also known as </a><a name="word_classes_index_term"><span class="termdef">word classes</span> or </a><a name="lexical_categories_index_term"><span class="termdef">lexical categories</span>.
The collection of tags
used for a particular task is known as a </a><a name="tagset_index_term"><span class="termdef">tagset</span>. Our emphasis
in this chapter is on exploiting tags, and tagging text automatically.</a></p><a name="tagset_index_term">
</a><div class="section" id="using-a-tagger"><a name="tagset_index_term">
<span id="sec-using-a-tagger"></span><h1>1 Using a Tagger</h1>
</a><p><a name="tagset_index_term">A part-of-speech tagger, or </a><a name="pos_tagger_index_term"><span class="termdef">POS-tagger</span>, processes a sequence of words, and attaches a
part of speech tag to each word (don't forget to <tt class="doctest"><span class="pre"><span class="pysrc-keyword">import</span> nltk</span></tt>):</a></p><a name="pos_tagger_index_term">
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>text = word_tokenize(<span class="pysrc-string">"And now for something completely different"</span>)
<span class="pysrc-prompt">>>> </span>nltk.pos_tag(text)
<span class="pysrc-output">[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),</span>
<span class="pysrc-output">('completely', 'RB'), ('different', 'JJ')]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Here we see that <span class="example">and</span> is <tt class="doctest"><span class="pre">CC</span></tt>, a coordinating conjunction;
<span class="example">now</span> and <span class="example">completely</span> are <tt class="doctest"><span class="pre">RB</span></tt>, or adverbs;
<span class="example">for</span> is <tt class="doctest"><span class="pre">IN</span></tt>, a preposition;
<span class="example">something</span> is <tt class="doctest"><span class="pre">NN</span></tt>, a noun; and
<span class="example">different</span> is <tt class="doctest"><span class="pre">JJ</span></tt>, an adjective.</p>
<div class="note">
<p class="first admonition-title">Note</p>
<p class="last">NLTK provides documentation for each tag, which can be queried using
the tag, e.g. <tt class="doctest"><span class="pre">nltk.help.upenn_tagset(<span class="pysrc-string">'RB'</span>)</span></tt>, or a regular
expression, e.g. <tt class="doctest"><span class="pre">nltk.help.upenn_tagset(<span class="pysrc-string">'NN.*'</span>)</span></tt>.
Some corpora have README files with tagset documentation,
see <tt class="doctest"><span class="pre">nltk.corpus.???.readme()</span></tt>, substituting in the name
of the corpus.</p>
</div>
<p>Let's look at another example, this time including some homonyms:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>text = word_tokenize(<span class="pysrc-string">"They refuse to permit us to obtain the refuse permit"</span>)
<span class="pysrc-prompt">>>> </span>nltk.pos_tag(text)
<span class="pysrc-output">[('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ('us', 'PRP'),</span>
<span class="pysrc-output">('to', 'TO'), ('obtain', 'VB'), ('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN')]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Notice that <span class="example">refuse</span> and <span class="example">permit</span> both appear as a
present tense verb (<tt class="doctest"><span class="pre">VBP</span></tt>) and a noun (<tt class="doctest"><span class="pre">NN</span></tt>).
E.g. <span class="example">refUSE</span> is a verb meaning "deny," while <span class="example">REFuse</span> is
a noun meaning "trash" (i.e. they are not homophones).
Thus, we need to know which word is being used in order to pronounce
the text correctly. (For this reason,
text-to-speech systems usually perform POS-tagging.)</p>
<div class="note">
<p class="first admonition-title">Note</p>
<p class="last"><strong>Your Turn:</strong>
Many words, like <span class="example">ski</span> and <span class="example">race</span>, can be used as nouns
or verbs with no difference in pronunciation. Can you think of
others? Hint: think of a commonplace object and try to put
the word <span class="example">to</span> before it to see if it can also be a verb, or
think of an action and try to put <span class="example">the</span> before it to see if
it can also be a noun. Now make up a sentence with both uses
of this word, and run the POS-tagger on this sentence.</p>
</div>
<p>Lexical categories like "noun" and part-of-speech tags like <tt class="doctest"><span class="pre">NN</span></tt> seem to have
their uses, but the details will be obscure to many readers. You might wonder what
justification there is for introducing this extra level of information.
Many of these categories arise from superficial analysis of the distribution
of words in text. Consider the following analysis involving
<span class="example">woman</span> (a noun), <span class="example">bought</span> (a verb),
<span class="example">over</span> (a preposition), and <span class="example">the</span> (a determiner).
The <tt class="doctest"><span class="pre">text.similar()</span></tt> method takes a word <span class="math">w</span>, finds all contexts
<span class="math">w</span><sub>1</sub> <span class="math">w</span> <span class="math">w</span><sub>2</sub>,
then finds all words <span class="math">w'</span> that appear in the same context,
i.e. <span class="math">w</span><sub>1</sub> <span class="math">w'</span> <span class="math">w</span><sub>2</sub>.</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>text = nltk.Text(word.lower() <span class="pysrc-keyword">for</span> word <span class="pysrc-keyword">in</span> nltk.corpus.brown.words())
<span class="pysrc-prompt">>>> </span>text.similar(<span class="pysrc-string">'woman'</span>)
<span class="pysrc-output">Building word-context index...</span>
<span class="pysrc-output">man day time year car moment world family house boy child country job</span>
<span class="pysrc-output">state girl place war way case question</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>text.similar(<span class="pysrc-string">'bought'</span>)
<span class="pysrc-output">made done put said found had seen given left heard been brought got</span>
<span class="pysrc-output">set was called felt in that told</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>text.similar(<span class="pysrc-string">'over'</span>)
<span class="pysrc-output">in on to of and for with from at by that into as up out down through</span>
<span class="pysrc-output">about all is</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>text.similar(<span class="pysrc-string">'the'</span>)
<span class="pysrc-output">a his this their its her an that our any all one these my in your no</span>
<span class="pysrc-output">some other and</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Observe that searching for <span class="example">woman</span> finds nouns;
searching for <span class="example">bought</span> mostly finds verbs;
searching for <span class="example">over</span> generally finds prepositions;
searching for <span class="example">the</span> finds several determiners.
A tagger can correctly identify the tags on these words
in the context of a sentence, e.g. <span class="example">The woman bought over $150,000
worth of clothes</span>.</p>
<p>A tagger can also model our knowledge of unknown words,
e.g. we can guess that <span class="example">scrobbling</span> is probably a verb,
with the root <span class="example">scrobble</span>,
and likely to occur in contexts like <span class="example">he was scrobbling</span>.</p>
</a></div><a name="pos_tagger_index_term">
</a><div class="section" id="tagged-corpora"><a name="pos_tagger_index_term">
<span id="sec-tagged-corpora"></span><h1>2 Tagged Corpora</h1>
<div class="section" id="representing-tagged-tokens">
<h2>2.1 Representing Tagged Tokens</h2>
<p>By convention in NLTK, a tagged token is represented using a
tuple consisting of the token and the tag.
We can create one of these special tuples from the standard string
representation of a tagged token, using the function <tt class="doctest"><span class="pre">str2tuple()</span></tt>:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>tagged_token = nltk.tag.str2tuple(<span class="pysrc-string">'fly/NN'</span>)
<span class="pysrc-prompt">>>> </span>tagged_token
<span class="pysrc-output">('fly', 'NN')</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>tagged_token[0]
<span class="pysrc-output">'fly'</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>tagged_token[1]
<span class="pysrc-output">'NN'</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>We can construct a list of tagged tokens directly from a string. The first
step is to tokenize the string
to access the individual <tt class="doctest"><span class="pre">word/tag</span></tt> strings, and then to convert
each of these into a tuple (using <tt class="doctest"><span class="pre">str2tuple()</span></tt>).</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>sent = <span class="pysrc-string">'''</span>
<span class="pysrc-more">... </span><span class="pysrc-string">The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN</span>
<span class="pysrc-more">... </span><span class="pysrc-string">other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC</span>
<span class="pysrc-more">... </span><span class="pysrc-string">Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS</span>
<span class="pysrc-more">... </span><span class="pysrc-string">said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB</span>
<span class="pysrc-more">... </span><span class="pysrc-string">accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT</span>
<span class="pysrc-more">... </span><span class="pysrc-string">interest/NN of/IN both/ABX governments/NNS ''/'' ./.</span>
<span class="pysrc-more">... </span><span class="pysrc-string">'''</span>
<span class="pysrc-prompt">>>> </span>[nltk.tag.str2tuple(t) <span class="pysrc-keyword">for</span> t <span class="pysrc-keyword">in</span> sent.split()]
<span class="pysrc-output">[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),</span>
<span class="pysrc-output">('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ... ('.', '.')]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
</div>
</a><div class="section" id="reading-tagged-corpora"><a name="pos_tagger_index_term">
<h2>2.2 Reading Tagged Corpora</h2>
</a><p><a name="pos_tagger_index_term">Several of the corpora included with NLTK have been </a><a name="tagged_index_term"><span class="termdef">tagged</span> for
their part-of-speech. Here's an example of what you might see if you
opened a file from the Brown Corpus with a text editor:</a></p><a name="tagged_index_term">
<blockquote>
The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl
said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$
recent/jj primary/nn election/nn produced/vbd <tt class="doctest"><span class="pre">/</span></tt> no/at
evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd
place/nn ./.</blockquote>
<p>Other corpora use a variety of formats for storing part-of-speech tags.
NLTK's corpus readers provide a uniform interface so that you
don't have to be concerned with the different file formats.
In contrast with the file fragment shown above,
the corpus reader for the Brown Corpus represents the data as shown below.
Note that part-of-speech tags have been converted to uppercase, since this has
become standard practice since the Brown Corpus was published.</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>nltk.corpus.brown.tagged_words()
<span class="pysrc-output">[('The', 'AT'), ('Fulton', 'NP-TL'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.brown.tagged_words(tagset=<span class="pysrc-string">'universal'</span>)
<span class="pysrc-output">[('The', 'DET'), ('Fulton', 'NOUN'), ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Whenever a corpus contains tagged text, the NLTK corpus interface
will have a <tt class="doctest"><span class="pre">tagged_words()</span></tt> method.
Here are some more examples, again using the output format
illustrated for the Brown Corpus:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span><span class="pysrc-keyword">print</span>(nltk.corpus.nps_chat.tagged_words())
<span class="pysrc-output">[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.conll2000.tagged_words()
<span class="pysrc-output">[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.treebank.tagged_words()
<span class="pysrc-output">[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Not all corpora employ the same set of tags; see the
tagset help functionality and the <tt class="doctest"><span class="pre">readme()</span></tt> methods
mentioned above for documentation.
Initially we want to avoid the complications of these tagsets,
so we use a built-in mapping to the "Universal Tagset":</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>nltk.corpus.brown.tagged_words(tagset=<span class="pysrc-string">'universal'</span>)
<span class="pysrc-output">[('The', 'DET'), ('Fulton', 'NOUN'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.treebank.tagged_words(tagset=<span class="pysrc-string">'universal'</span>)
<span class="pysrc-output">[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Tagged corpora for several other languages are distributed with NLTK,
including Chinese, Hindi, Portuguese, Spanish, Dutch and Catalan.
These usually contain non-ASCII text,
and Python always displays this in hexadecimal when printing a larger structure
such as a list.</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>nltk.corpus.sinica_treebank.tagged_words()
<span class="pysrc-output">[('一', 'Neu'), ('友情', 'Nad'), ('嘉珍', 'Nba'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.indian.tagged_words()
<span class="pysrc-output">[('মহিষের', 'NN'), ('সন্তান', 'NN'), (':', 'SYM'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.mac_morpho.tagged_words()
<span class="pysrc-output">[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.conll2002.tagged_words()
<span class="pysrc-output">[('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>nltk.corpus.cess_cat.tagged_words()
<span class="pysrc-output">[('El', 'da0ms0'), ('Tribunal_Suprem', 'np0000o'), ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
</a><p><a name="tagged_index_term">If your environment is set up correctly, with appropriate editors and fonts,
you should be able to display individual strings in a human-readable way.
For example, </a><a class="reference internal" href="#fig-tag-indian">2.1</a> shows data accessed using
<tt class="doctest"><span class="pre">nltk.corpus.indian</span></tt>.</p>
<span class="target" id="fig-tag-indian"></span><div class="figure" id="fig-tag-indian">
<img alt="../images/tag-indian.png" src="5.%20Categorizing%20and%20Tagging%20Words_files/tag-indian.png" style="width: 800.4px; height: 213.0px;">
<p class="caption"><span class="caption-label">Figure 2.1</span>: POS-Tagged Data from Four Indian Languages: Bangla, Hindi, Marathi, and Telugu</p>
</div>
<!-- इराक_NNP के_PREP विदेश_NNC मंत्री_NN ने_PREP अमरीका_NNP के_PREP उस_PRP प्रस्ताव_NN का_PREP मजाक_NVB उड़ाया_VFM है_VAUX ... -->
<p>If the corpus is also segmented into sentences, it will have
a <tt class="doctest"><span class="pre">tagged_sents()</span></tt> method that divides up the tagged words into
sentences rather than presenting them as one big list.
This will be useful when we come to developing automatic taggers,
as they are trained and tested on lists of sentences, not words.</p>
</div>
<div class="section" id="a-universal-part-of-speech-tagset">
<h2>2.3 A Universal Part-of-Speech Tagset</h2>
<p>Tagged corpora use many different conventions for tagging words.
To help us get started, we will be looking at a simplified tagset
(shown in <a class="reference internal" href="#tab-universal-tagset">2.1</a>).</p>
<span class="target" id="tab-universal-tagset"></span><p class="caption"><span class="caption-label">Table 2.1</span>: </p><p>Universal Part-of-Speech Tagset</p><p></p><table class="docutils" id="tab-universal-tagset" border="1">
<colgroup>
<col width="11%">
<col width="27%">
<col width="62%">
</colgroup>
<thead valign="bottom">
<tr><th class="head">Tag</th>
<th class="head">Meaning</th>
<th class="head">English Examples</th>
</tr>
</thead>
<tbody valign="top">
<tr><td><tt class="doctest"><span class="pre">ADJ</span></tt></td>
<td>adjective</td>
<td><span class="example">new, good, high, special, big, local</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">ADP</span></tt></td>
<td>adposition</td>
<td><span class="example">on, of, at, with, by, into, under</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">ADV</span></tt></td>
<td>adverb</td>
<td><span class="example">really, already, still, early, now</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">CONJ</span></tt></td>
<td>conjunction</td>
<td><span class="example">and, or, but, if, while, although</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">DET</span></tt></td>
<td>determiner, article</td>
<td><span class="example">the, a, some, most, every, no, which</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">NOUN</span></tt></td>
<td>noun</td>
<td><span class="example">year, home, costs, time, Africa</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">NUM</span></tt></td>
<td>numeral</td>
<td><span class="example">twenty-four, fourth, 1991, 14:24</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">PRT</span></tt></td>
<td>particle</td>
<td><span class="example">at, on, out, over, per, that, up, with</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">PRON</span></tt></td>
<td>pronoun</td>
<td><span class="example">he, their, her, its, my, I, us</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">VERB</span></tt></td>
<td>verb</td>
<td><span class="example">is, say, told, given, playing, would</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">.</span></tt></td>
<td>punctuation marks</td>
<td><span class="example">. , ; !</span></td>
</tr>
<tr><td><tt class="doctest"><span class="pre">X</span></tt></td>
<td>other</td>
<td><span class="example">ersatz, esprit, dunno, gr8, univeristy</span></td>
</tr>
</tbody>
</table>
<p>Let's see which of these tags are the most common in the news
category of the Brown corpus:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span><span class="pysrc-keyword">from</span> nltk.corpus <span class="pysrc-keyword">import</span> brown
<span class="pysrc-prompt">>>> </span>brown_news_tagged = brown.tagged_words(categories=<span class="pysrc-string">'news'</span>, tagset=<span class="pysrc-string">'universal'</span>)
<span class="pysrc-prompt">>>> </span>tag_fd = nltk.FreqDist(tag <span class="pysrc-keyword">for</span> (word, tag) <span class="pysrc-keyword">in</span> brown_news_tagged)
<span class="pysrc-prompt">>>> </span>tag_fd.most_common()
<span class="pysrc-output">[('NOUN', 30640), ('VERB', 14399), ('ADP', 12355), ('.', 11928), ('DET', 11389),</span>
<span class="pysrc-output"> ('ADJ', 6706), ('ADV', 3349), ('CONJ', 2717), ('PRON', 2535), ('PRT', 2264),</span>
<span class="pysrc-output"> ('NUM', 2166), ('X', 106)]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<div class="note">
<p class="first admonition-title">Note</p>
<p class="last"><strong>Your Turn:</strong>
Plot the above frequency distribution using <tt class="doctest"><span class="pre">tag_fd.plot(cumulative=True)</span></tt>.
What percentage of words are tagged using the first five tags of the above list?</p>
</div>
<p>We can use these tags to do powerful searches using a graphical
POS-concordance tool <tt class="doctest"><span class="pre">nltk.app.concordance()</span></tt>. Use it
to search for any combination of words and POS tags, e.g.
<tt class="doctest"><span class="pre">N N N N</span></tt>, <tt class="doctest"><span class="pre">hit/VD</span></tt>, <tt class="doctest"><span class="pre">hit/VN</span></tt>, or <tt class="doctest"><span class="pre">the ADJ man</span></tt>.</p>
<!-- Screenshot -->
</div>
<div class="section" id="nouns">
<h2>2.4 Nouns</h2>
<p>Nouns generally refer to people, places, things, or concepts, e.g.:
<span class="example">woman, Scotland, book, intelligence</span>. Nouns can appear after
determiners and adjectives, and can be the subject or object of the
verb, as shown in <a class="reference internal" href="#tab-syntax-nouns">2.2</a>.</p>
<span class="target" id="tab-syntax-nouns"></span><p class="caption"><span class="caption-label">Table 2.2</span>: </p><p>Syntactic Patterns involving some Nouns</p><p></p><table class="docutils" id="tab-syntax-nouns" border="1">
<colgroup>
<col width="11%">
<col width="42%">
<col width="47%">
</colgroup>
<thead valign="bottom">
<tr><th class="head">Word</th>
<th class="head">After a determiner</th>
<th class="head">Subject of the verb</th>
</tr>
</thead>
<tbody valign="top">
<tr><td>woman</td>
<td><em>the</em> woman who I saw yesterday ...</td>
<td>the woman <em>sat</em> down</td>
</tr>
<tr><td>Scotland</td>
<td><em>the</em> Scotland I remember as a child ...</td>
<td>Scotland <em>has</em> five million people</td>
</tr>
<tr><td>book</td>
<td><em>the</em> book I bought yesterday ...</td>
<td>this book <em>recounts</em> the colonization of Australia</td>
</tr>
<tr><td>intelligence</td>
<td><em>the</em> intelligence displayed by the child ...</td>
<td>Mary's intelligence <em>impressed</em> her teachers</td>
</tr>
</tbody>
</table>
<p>The simplified noun tags are <tt class="doctest"><span class="pre">N</span></tt> for common nouns like <span class="example">book</span>,
and <tt class="doctest"><span class="pre">NP</span></tt> for proper nouns like <span class="example">Scotland</span>.</p>
<p>Let's inspect some tagged text to see what parts of speech occur before a noun,
with the most frequent ones first. To begin with, we construct a list
of bigrams whose members are themselves word-tag pairs such as
<tt class="doctest"><span class="pre">((<span class="pysrc-string">'The'</span>, <span class="pysrc-string">'DET'</span>), (<span class="pysrc-string">'Fulton'</span>, <span class="pysrc-string">'NP'</span>))</span></tt> and <tt class="doctest"><span class="pre">((<span class="pysrc-string">'Fulton'</span>, <span class="pysrc-string">'NP'</span>), (<span class="pysrc-string">'County'</span>, <span class="pysrc-string">'N'</span>))</span></tt>.
Then we construct a <tt class="doctest"><span class="pre">FreqDist</span></tt> from the tag parts of the bigrams.</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>word_tag_pairs = nltk.bigrams(brown_news_tagged)
<span class="pysrc-prompt">>>> </span>noun_preceders = [a[1] <span class="pysrc-keyword">for</span> (a, b) <span class="pysrc-keyword">in</span> word_tag_pairs <span class="pysrc-keyword">if</span> b[1] == <span class="pysrc-string">'NOUN'</span>]
<span class="pysrc-prompt">>>> </span>fdist = nltk.FreqDist(noun_preceders)
<span class="pysrc-prompt">>>> </span>[tag <span class="pysrc-keyword">for</span> (tag, _) <span class="pysrc-keyword">in</span> fdist.most_common()]
<span class="pysrc-output">['NOUN', 'DET', 'ADJ', 'ADP', '.', 'VERB', 'CONJ', 'NUM', 'ADV', 'PRT', 'PRON', 'X']</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>This confirms our assertion that nouns occur after determiners and
adjectives, including numeral adjectives (tagged as <tt class="doctest"><span class="pre">NUM</span></tt>).</p>
<!-- TO-DO say something about some of the other contexts? -->
</div>
<div class="section" id="verbs">
<h2>2.5 Verbs</h2>
<p>Verbs are words that describe events and actions, e.g. <span class="example">fall</span>,
<span class="example">eat</span> in <a class="reference internal" href="#tab-syntax-verbs">2.3</a>.
In the context of a sentence, verbs typically express a relation
involving the referents of one or more noun phrases.</p>
<span class="target" id="tab-syntax-verbs"></span><p class="caption"><span class="caption-label">Table 2.3</span>: </p><p>Syntactic Patterns involving some Verbs</p><p></p><table class="docutils" id="tab-syntax-verbs" border="1">
<colgroup>
<col width="7%">
<col width="22%">
<col width="70%">
</colgroup>
<thead valign="bottom">
<tr><th class="head">Word</th>
<th class="head">Simple</th>
<th class="head">With modifiers and adjuncts (italicized)</th>
</tr>
</thead>
<tbody valign="top">
<tr><td>fall</td>
<td>Rome fell</td>
<td>Dot com stocks <em>suddenly</em> fell <em>like a stone</em></td>
</tr>
<tr><td>eat</td>
<td>Mice eat cheese</td>
<td>John ate the pizza <em>with gusto</em></td>
</tr>
</tbody>
</table>
<p>What are the most common verbs in news text? Let's sort all the verbs by frequency:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>wsj = nltk.corpus.treebank.tagged_words(tagset=<span class="pysrc-string">'universal'</span>)
<span class="pysrc-prompt">>>> </span>word_tag_fd = nltk.FreqDist(wsj)
<span class="pysrc-prompt">>>> </span>[wt[0] <span class="pysrc-keyword">for</span> (wt, _) <span class="pysrc-keyword">in</span> word_tag_fd.most_common() <span class="pysrc-keyword">if</span> wt[1] == <span class="pysrc-string">'VERB'</span>]
<span class="pysrc-output">['is', 'said', 'are', 'was', 'be', 'has', 'have', 'will', 'says', 'would',</span>
<span class="pysrc-output"> 'were', 'had', 'been', 'could', "'s", 'can', 'do', 'say', 'make', 'may',</span>
<span class="pysrc-output"> 'did', 'rose', 'made', 'does', 'expected', 'buy', 'take', 'get', 'might',</span>
<span class="pysrc-output"> 'sell', 'added', 'sold', 'help', 'including', 'should', 'reported', ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>Note that the items being counted in the frequency distribution are word-tag pairs.
Since words and tags are paired, we can treat the word as a condition and the tag
as an event, and initialize a conditional frequency distribution with a list of
condition-event pairs. This lets us see a frequency-ordered list of tags given a word:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>cfd1 = nltk.ConditionalFreqDist(wsj)
<span class="pysrc-prompt">>>> </span>cfd1[<span class="pysrc-string">'yield'</span>].most_common()
<span class="pysrc-output">[('VERB', 28), ('NOUN', 20)]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>cfd1[<span class="pysrc-string">'cut'</span>].most_common()
<span class="pysrc-output">[('VERB', 25), ('NOUN', 3)]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>We can reverse the order of the pairs, so that the tags are the conditions, and the
words are the events. Now we can see likely words for a given tag. We
will do this for the WSJ tagset rather than the universal tagset:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>wsj = nltk.corpus.treebank.tagged_words()
<span class="pysrc-prompt">>>> </span>cfd2 = nltk.ConditionalFreqDist((tag, word) <span class="pysrc-keyword">for</span> (word, tag) <span class="pysrc-keyword">in</span> wsj)
<span class="pysrc-prompt">>>> </span>list(cfd2[<span class="pysrc-string">'VBN'</span>])
<span class="pysrc-output">['been', 'expected', 'made', 'compared', 'based', 'priced', 'used', 'sold',</span>
<span class="pysrc-output">'named', 'designed', 'held', 'fined', 'taken', 'paid', 'traded', 'said', ...]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>To clarify the distinction between <tt class="doctest"><span class="pre">VBD</span></tt> (past tense) and <tt class="doctest"><span class="pre">VBN</span></tt>
(past participle), let's find words which can be both <tt class="doctest"><span class="pre">VBD</span></tt> and
<tt class="doctest"><span class="pre">VBN</span></tt>, and see some surrounding text:</p>
<div class="doctest">
<table class="doctest" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="doctest">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_doctest_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-prompt">>>> </span>[w <span class="pysrc-keyword">for</span> w <span class="pysrc-keyword">in</span> cfd1.conditions() <span class="pysrc-keyword">if</span> <span class="pysrc-string">'VBD'</span> <span class="pysrc-keyword">in</span> cfd1[w] <span class="pysrc-keyword">and</span> <span class="pysrc-string">'VBN'</span> <span class="pysrc-keyword">in</span> cfd1[w]]
<span class="pysrc-output">['Asked', 'accelerated', 'accepted', 'accused', 'acquired', 'added', 'adopted', ...]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>idx1 = wsj.index((<span class="pysrc-string">'kicked'</span>, <span class="pysrc-string">'VBD'</span>))
<span class="pysrc-prompt">>>> </span>wsj[idx1-4:idx1+1]
<span class="pysrc-output">[('While', 'IN'), ('program', 'NN'), ('trades', 'NNS'), ('swiftly', 'RB'),</span>
<span class="pysrc-output"> ('kicked', 'VBD')]</span>
<span class="pysrc-output"></span><span class="pysrc-prompt">>>> </span>idx2 = wsj.index((<span class="pysrc-string">'kicked'</span>, <span class="pysrc-string">'VBN'</span>))
<span class="pysrc-prompt">>>> </span>wsj[idx2-4:idx2+1]
<span class="pysrc-output">[('head', 'NN'), ('of', 'IN'), ('state', 'NN'), ('has', 'VBZ'), ('kicked', 'VBN')]</span></pre>
</td>
</tr></tbody></table></td></tr>
</tbody></table></div>
<p>In this case, we see that the past participle of <span class="example">kicked</span> is preceded by a form of
the auxiliary verb <span class="example">have</span>. Is this generally true?</p>
<div class="note">
<p class="first admonition-title">Note</p>
<p class="last"><strong>Your Turn:</strong>
Given the list of past participles produced by
<tt class="doctest"><span class="pre">list(cfd2[<span class="pysrc-string">'VBN'</span>])</span></tt>, try to collect a list of all the word-tag
pairs that immediately precede items in that list.</p>
</div>
</div>
<div class="section" id="adjectives-and-adverbs">
<h2>2.6 Adjectives and Adverbs</h2>
<p>Two other important word classes are <a name="adjectives_index_term"><span class="termdef">adjectives</span> and </a><a name="adverbs_index_term"><span class="termdef">adverbs</span>.
Adjectives describe nouns, and can be used as modifiers
(e.g. <span class="example">large</span> in <span class="example">the large pizza</span>), or in predicates (e.g. <span class="example">the
pizza is large</span>). English adjectives can have internal structure
(e.g. <span class="example">fall+ing</span> in <span class="example">the falling
stocks</span>). Adverbs modify verbs to specify the time, manner, place or
direction of the event described by the verb (e.g. <span class="example">quickly</span> in
<span class="example">the stocks fell quickly</span>). Adverbs may also modify adjectives
(e.g. <span class="example">really</span> in <span class="example">Mary's teacher was really nice</span>).</a></p><a name="adverbs_index_term">
</a><p><a name="adverbs_index_term">English has several categories of closed class words in addition to
prepositions, such as </a><a name="articles_index_term"><span class="termdef">articles</span> (also often called </a><a name="determiners_index_term"><span class="termdef">determiners</span>)
(e.g., <span class="example">the</span>, <span class="example">a</span>), </a><a name="modals_index_term"><span class="termdef">modals</span> (e.g., <span class="example">should</span>,
<span class="example">may</span>), and </a><a name="personal_pronouns_index_term"><span class="termdef">personal pronouns</span> (e.g., <span class="example">she</span>, <span class="example">they</span>).
Each dictionary and grammar classifies these words differently.</a></p><a name="personal_pronouns_index_term">
<div class="note">
<p class="first admonition-title">Note</p>
<p class="last"><strong>Your Turn:</strong>
If you are uncertain about some of these parts of speech, study them using
<tt class="doctest"><span class="pre">nltk.app.concordance()</span></tt>, or watch some of the <em>Schoolhouse Rock!</em>
grammar videos available at YouTube, or consult the Further Reading
section at the end of this chapter.</p>
</div>
</a></div><a name="personal_pronouns_index_term">
</a><div class="section" id="unsimplified-tags"><a name="personal_pronouns_index_term">
<h2>2.7 Unsimplified Tags</h2>
</a><p><a name="personal_pronouns_index_term">Let's find the most frequent nouns of each noun part-of-speech type.
The program in </a><a class="reference internal" href="#code-findtags">2.2</a> finds all tags starting with <tt class="doctest"><span class="pre">NN</span></tt>,
and provides a few example words for each one. You will see that
there are many variants of <tt class="doctest"><span class="pre">NN</span></tt>; the most important contain <tt class="doctest"><span class="pre">$</span></tt>
for possessive nouns, <tt class="doctest"><span class="pre">S</span></tt> for plural nouns (since plural nouns
typically end in <span class="example">s</span>) and <tt class="doctest"><span class="pre">P</span></tt> for proper nouns. In addition,
most of the tags have suffix modifiers: <tt class="doctest"><span class="pre">-NC</span></tt> for citations, <tt class="doctest"><span class="pre">-HL</span></tt>
for words in headlines and <tt class="doctest"><span class="pre">-TL</span></tt> for titles (a feature of Brown tags).</p>
<span class="target" id="code-findtags"></span><div class="pylisting">
<p></p><table class="pylisting" width="95%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="codeblock">
<table width="100%" cellspacing="0" cellpadding="0" border="0">
<tbody><tr><td class="copybar" onclick="javascript:copy_codeblock_to_clipboard(this.nextSibling);" width="1"> </td>
<td class="pysrc"><pre class="doctest"><span class="pysrc-keyword">def</span> <span class="pysrc-defname">findtags</span>(tag_prefix, tagged_text):
cfd = nltk.ConditionalFreqDist((tag, word) <span class="pysrc-keyword">for</span> (word, tag) <span class="pysrc-keyword">in</span> tagged_text
<span class="pysrc-keyword">if</span> tag.startswith(tag_prefix))
return dict((tag, cfd[tag].most_common(5)) <span class="pysrc-keyword">for</span> tag <span class="pysrc-keyword">in</span> cfd.conditions())
<span class="pysrc-prompt">>>> </span>tagdict = findtags(<span class="pysrc-string">'NN'</span>, nltk.corpus.brown.tagged_words(categories=<span class="pysrc-string">'news'</span>))
<span class="pysrc-prompt">>>> </span><span class="pysrc-keyword">for</span> tag <span class="pysrc-keyword">in</span> sorted(tagdict):
<span class="pysrc-more">... </span> <span class="pysrc-keyword">print</span>(tag, tagdict[tag])
<span class="pysrc-more">...</span>
NN [(<span class="pysrc-string">'year'</span>, 137), (<span class="pysrc-string">'time'</span>, 97), (<span class="pysrc-string">'state'</span>, 88), (<span class="pysrc-string">'week'</span>, 85), (<span class="pysrc-string">'man'</span>, 72)]
NN$ [(<span class="pysrc-string">"year's"</span>, 13), (<span class="pysrc-string">"world's"</span>, 8), (<span class="pysrc-string">"state's"</span>, 7), (<span class="pysrc-string">"nation's"</span>, 6), (<span class="pysrc-string">"company's"</span>, 6)]