Commit
reflect updates of main repository
LennMars committed Mar 18, 2013
1 parent dc0c9e0 commit 97a8bca
Showing 11 changed files with 266 additions and 145 deletions.
70 changes: 70 additions & 0 deletions crf_data_gen/MednlpFeature.pm
@@ -0,0 +1,70 @@
#!/usr/bin/perl -w
package MednlpFeature;
use strict;
use Set::IntervalTree;
use Algorithm::AhoCorasick qw(find_all);
use base qw(Exporter);
our @EXPORT = qw( init_word_intervals bio_from_intervals init_dictionaries letter_type );

my @word_intervals;

# search for dictionary words in the input and convert the matches into intervals
sub init_word_intervals {
my $input_joined = shift;
my $dictionaries = shift;
@word_intervals = map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @$dictionaries;
}

sub bio_from_intervals {
my $pos = shift;
my @bios = map {
my $interval = $_->fetch($pos, $pos + 1);
if (@$interval) {
($interval->[0] == $pos) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
return join '', @bios;
}

sub init_dictionaries {
my $dictionary_files = shift;
return map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @$dictionary_files;
}

sub letter_type {
my $str = shift;
if ($str =~ /^[\d0-9\.]+$/) {
return 'digit';
} elsif ($str =~ /^\p{Latin}+$/) {
return 'latin';
} elsif ($str =~ /^[\p{Hiragana}ー]+$/) {
return 'hiragana';
} elsif ($str =~ /^[\p{Katakana}ー]+$/) {
return 'katakana';
} elsif ($str =~ /^\p{Han}+$/) {
return 'kanji';
} elsif ($str =~ /^\p{Common}+$/) {
return 'symbol';
} else {
return 'mixed';
}
}

1;
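As an editorial aside, the sketch below shows how the exported helpers are intended to be combined, mirroring the call sequence that get_feature.pl and parse.pl adopt further down in this commit; the dictionary file name and the sample strings are hypothetical.

#!/usr/bin/perl -w
use strict;
use utf8;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

# hypothetical dictionary file: one headword per line, UTF-8
my @dictionary_files = ('drug_dict.txt');
my @dictionaries = init_dictionaries(\@dictionary_files);

my $text = "アスピリンを処方した";            # made-up input text
init_word_intervals($text, \@dictionaries);    # builds one interval tree per dictionary

# per-dictionary B/I/O string for the character at offset 0
print bio_from_intervals(0), "\n";
# coarse character class of a token surface
print letter_type("アスピリン"), "\n";         # prints "katakana"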
6 changes: 3 additions & 3 deletions crf_data_gen/form_text.pl
@@ -6,8 +6,8 @@
binmode STDOUT, ":encoding(utf-8)";

while (<STDIN>) {
$_ =~ s/(\d+)-(\d+)-(\d+)/$1$2$3/;
$_ =~ tr/[0-9a-zA-Z]/[0-9a-zA-Z]/;
s/[\x{0001}]//g;
s/(\d+)-(\d+)-(\d+)/$1$2$3/;
tr/[0-9a-zA-Z%&#<>_\^\/\?\[\]]/[0-9a-zA-Z%&#<>_^/?[]]/;
print $_;
}
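For reference, a minimal sketch of what the retained substitution does to a hyphen-separated number group (the input line is made up):

#!/usr/bin/perl -w
use strict;

my $line = "visit on 2013-03-18\n";     # made-up input line
$line =~ s/(\d+)-(\d+)-(\d+)/$1$2$3/;   # join three hyphen-separated number groups
print $line;                            # prints "visit on 20130318"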

42 changes: 11 additions & 31 deletions crf_data_gen/get_feature.pl
@@ -4,40 +4,24 @@
use Encode;
use MeCab;
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
use Algorithm::AhoCorasick qw(find_all);
use Set::IntervalTree;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

# read dictionaries
my @dictionary_files;
GetOptions ('d=s' => \@dictionary_files);

my @dictionaries = map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @dictionary_files;
my @dictionaries = init_dictionaries(\@dictionary_files);

# read input
open INPUT, $ARGV[0] or die;
binmode INPUT, ":encoding(utf-8)";
my @input = <INPUT>;
my $input_joined = join '', @input;

# search words in dictionaries and convert founds into interval
my @word_intervals = map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
# print "word: $found->{$pos}->[0], $pos to $pos2\n";
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @dictionaries;
init_word_intervals($input_joined, \@dictionaries);

my $pos = 0; # in input_joined

@@ -55,17 +39,13 @@
my $surface = decode("utf-8", $node->{surface});
my $pos_temp = index($input_joined, $surface, $pos);
# print "surface: $surface, pos_prev: $pos, pos_found: $pos_temp\n";
my @bios = map {
my $interval = $_->fetch($pos_temp, $pos_temp + 1);
if (@$interval) {
($interval->[0] == $pos_temp) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
my $bio_str = bio_from_intervals($pos_temp);

my $letter_type = letter_type($surface);

my $last_char = substr $surface, -1;

my $bio_str = join '', @bios;
my @out = map {(defined && $_ ne '') ? $_ : '*'} ($surface, $class1, $class2, $read, $bio_str);
my @out = map {(defined && $_ ne '') ? $_ : '*'} ($surface, $class1, $class2, $read, $bio_str, $letter_type, $last_char);
my $out_str = join " ", @out;
print $out_str, "\n";

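With the two columns added in this commit (letter type and last character), each line that get_feature.pl prints now has seven space-separated fields; a hypothetical token might come out as follows (the values are illustrative, not taken from real data):

# surface  class1  class2  read  BIO-per-dictionary  letter_type  last_char
アスピリン 名詞 一般 アスピリン B katakana ン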
87 changes: 27 additions & 60 deletions crf_data_gen/parse.pl
@@ -6,15 +6,16 @@
#see http://d.hatena.ne.jp/tagomoris/20120918/1347991165 for this configuration
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
use Encode;
use Algorithm::AhoCorasick qw(find_all);
use Set::IntervalTree;
use HTML::Entities;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

if ($#ARGV < 0) {die "Usage: parse.pl [-tag t1,t2,...] [-notag t1,t2,...] [-attr] [-d dict]... input.xml\n";}

# command line option handling
my @output_tags = ('a', 't', 'h', 'l', 'x', 'd', 'c', 'C', 'H', 'LOC', 'M', 'M1', 'T', 'T1', 'X'); # default tags to output
my @output_tags = ('mn', 'ms', 'm', 'a', 't', 'h', 'l', 'x', 'd', 'cn', 'cs', 'c', 'C', 'H', 'LOC', 'M', 'M1', 'T', 'T1', 'X'); # default tags to output
my @no_output_tags = (); # exclusion

my $to_use_modality = 0;
@@ -34,7 +35,7 @@

init_output_tags();
my @dictionaries = init_dictionaries(\@dictionary_files);
my @word_intervals = init_word_intervals($text, \@dictionaries);
init_word_intervals($text, \@dictionaries);


&traverse($doc);
@@ -65,40 +66,14 @@ sub init_output_tags {
! $found} @output_tags;
}

sub init_dictionaries {
my $dictionary_files = shift;
return map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @$dictionary_files;
}

# search words in dictionaries and convert founds into interval
sub init_word_intervals {
my $input_joined = shift;
my $dictionaries = shift;
return map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @$dictionaries;
}

sub print_iob_sequence {
sub iob_sequence_to_string {
my $iobs_ref = shift;
foreach (@$iobs_ref) {
my @iob_strs = map {
my @iob = @$_;
@iob = map {(defined && $_ ne '') ? $_ : '*'} @iob; # avoid empty column
print ((join ' ', @iob)."\n");
}
join ' ', @iob;
} (@$iobs_ref);
return join "\n", @iob_strs;
}

sub get_iob_sequence {
@@ -117,18 +92,10 @@ sub get_iob_sequence {
next if ($class1 =~ /BOS|EOS/);

my $surface = decode("utf8", $node->{surface});

# generate feature from dictionary
my $pos_temp = index($text, $surface, $pos_in_text);
my @iobs_from_dict = map {
my $interval = $_->fetch($pos_temp, $pos_temp + 1);
if (@$interval) {
($interval->[0] == $pos_temp) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
my $iobs_from_dict_str = join '', @iobs_from_dict;
my $iobs_from_dict_str = bio_from_intervals($pos_temp);
my $letter_type = letter_type($surface);
my $last_char = substr $surface, -1;

# forward position
$pos_in_text = $pos_temp + length($surface);
@@ -138,19 +105,10 @@

my $tag = ($type eq 'O') ? 'O' : ($is_first ? "B-${type}" : "I-${type}");
$is_first = 0;
my @iob = ($surface, $class1, $class2, $read, $iobs_from_dict_str, $tag);
my @iob = ($surface, $class1, $class2, $read, $iobs_from_dict_str, $letter_type, $last_char, $tag);
push @iobs, \@iob;
}

# add a line break for each end of sentences
if (substr ($string, -1) eq "\n") {
my $iob_last = pop @iobs;
if (defined $iob_last) {
push @$iob_last, "\n";
push @iobs, $iob_last;
}
}

return \@iobs;
}

@@ -178,16 +136,25 @@ sub print_leaf {
my $node = shift;
my $name = $node->nodeName();

my $iobs;
if (grep {$name =~ /^$_$/} @output_tags) {
$iobs = get_iob_sequence(get_modality($node), $node->textContent);
my $iobs = get_iob_sequence(get_modality($node), $node->textContent);
my $str = iob_sequence_to_string $iobs;
print "$str\n";
} else {
my $parent_name = $node->parentNode()->nodeName();
unless ($node->hasChildNodes() || grep {$parent_name =~ /^$_$/} @output_tags) {
$iobs = get_iob_sequence('O', $node->textContent);
my $content = $node->textContent;
$content =~ s/^\s+//;
my @contents = split "\n+", $content;
my @strs = map {
my $iobs = get_iob_sequence('O', $_);
iob_sequence_to_string $iobs;
} @contents;
my $output = join "\n\n", @strs;
print $output;
print ($node->textContent =~ /\n$/ ? "\n\n" : "\n") if $output;
}
}
print_iob_sequence $iobs if defined $iobs;
}

sub traverse {
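For comparison with get_feature.pl, each row that get_iob_sequence now builds carries the same seven feature columns plus the gold label derived from the enclosing XML tag; a hypothetical row for the first token inside an <m> element might look like this (again, illustrative values only):

アスピリン 名詞 一般 アスピリン B katakana ン B-m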
10 changes: 4 additions & 6 deletions evaluate/README
@@ -1,17 +1,17 @@
- experiment.pl

usage: experiment.pl [-d dict]... [-d1 depth] [-d2 depth] [-mod] [-h] [-c file] -m model input
usage: experiment.pl [-d dict]... [-d1 depth] [-d2 depth] [-mod] [-c file] -m model input

When a CRF++ model file is available, tags the plain text given as input and turns it into XML.
The model file is given with -m (required).
If a gold standard is given with -c, a comparison file that can be fed to conlleval.pl is also generated (cmp.txt).
If dictionary-match information was used when training the model, give the same dictionaries with -d.
-d1, -d2, -mod and -h are passed on to xml_to_charwise_iob.pl.
-d1, -d2 and -mod are passed on to xml_to_charwise_iob.pl.
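Example (the file names here are illustrative, not part of the repository):

experiment.pl -d dict.txt -m crf.model -c gold.xml input.txt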


- crfout_to_xml.pl

Generates XML from tagging results in conlleval format.
Generates XML from tagging results in conlleval format. Note that no header is attached, so the output is not valid XML.

usage: crfout_to_xml.pl < (conlleval形式ファイル)

@@ -21,7 +21,7 @@ The IOB part is split on '-', and when there is a third element it is the modality

- xml_to_charwise_iob.pl

usage: xml_to_charwise_iob.pl [-d1 depth] [-d2 depth] [-mod] [-h] file1 [file2]
usage: xml_to_charwise_iob.pl [-d1 depth] [-d2 depth] [-mod] file1 [file2]

Converts XML into conlleval format in which each token is a single character.
Two files can also be given as input; in that case each token is output with the tag information from both. If the inputs do not match once tags and whitespace are removed, an error is raised.
@@ -34,5 +34,3 @@ y I-b
is the result.

-mod specifies that the modality value be output.

-h formats the output to be easier for humans to read.
34 changes: 34 additions & 0 deletions normalizer/README
@@ -0,0 +1,34 @@
- normalizer.pl

Usage: normalizer.pl TAG DICT ATTR < INPUT

For TAG tags in the input XML, normalizes their text using the dictionary DICT.
The result is recorded in the ATTR attribute.
Normalization calls sim.pl, which must be located in the same directory as this script.
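Example (the tag, dictionary, and attribute names are illustrative):

normalizer.pl m tables/Master_M.txt normalized < input.xml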


- sim.pl

Usage: sim.pl -m table [-r "A/B"]... INPUT...
Example: sim.pl -m tables/Master_M.txt -r "XX/アク" アクチダス XXチダス ビルレクス

Normalizes the input words.
The -m option (required) specifies the normalization table.
The -r option (repeatable) specifies a partial rewrite of the input, giving the before and after forms separated by a slash.
Multiple inputs may be given.


- date_normalizer.pl

Usage: date_normalizer.pl [-t tag] < INPUT

Feeds each tag that represents a time expression (selectable with the -t option) to date2value.pl in turn, and records the result in the absolute attribute.


- timeline.pl

Usage: timeline.pl [-t tag] [-q] XML

Cuts each tag out of the input and returns XML split at every point where a time tag appears.
-t tag treats tag as the tag representing time (default: t).
-q suppresses printing of the original text corresponding to each event.
