Commit
reflect updates of main repository
LennMars committed Mar 18, 2013
1 parent dc0c9e0 commit 97a8bca
Showing 11 changed files with 266 additions and 145 deletions.
70 changes: 70 additions & 0 deletions crf_data_gen/MednlpFeature.pm
@@ -0,0 +1,70 @@
#!/usr/bin/perl -w
package MednlpFeature;
use strict;
use Set::IntervalTree;
use Algorithm::AhoCorasick qw(find_all);
use base qw(Exporter);
our @EXPORT = qw( init_word_intervals bio_from_intervals init_dictionaries letter_type );

my @word_intervals;

# search for dictionary words in the input and convert the matches into intervals
sub init_word_intervals {
my $input_joined = shift;
my $dictionaries = shift;
@word_intervals = map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @$dictionaries;
}

sub bio_from_intervals {
my $pos = shift;
my @bios = map {
my $interval = $_->fetch($pos, $pos + 1);
if (@$interval) {
($interval->[0] == $pos) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
return join '', @bios;
}

sub init_dictionaries {
my $dictionary_files = shift;
return map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @$dictionary_files;
}

sub letter_type {
my $str = shift;
if ($str =~ /^[\d0-9\.]+$/) {
return 'digit';
} elsif ($str =~ /^\p{Latin}+$/) {
return 'latin';
} elsif ($str =~ /^[\p{Hiragana}ー]+$/) {
return 'hiragana';
} elsif ($str =~ /^[\p{Katakana}ー]+$/) {
return 'katakana';
} elsif ($str =~ /^\p{Han}+$/) {
return 'kanji';
} elsif ($str =~ /^\p{Common}+$/) {
return 'symbol';
} else {
return 'mixed';
}
}

1;
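As an editorial aside, the sketch below shows how the exported helpers are intended to be combined, mirroring the call sequence that get_feature.pl and parse.pl adopt further down in this commit; the dictionary file name and the sample strings are hypothetical.

#!/usr/bin/perl -w
use strict;
use utf8;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

# hypothetical dictionary file: one headword per line, UTF-8
my @dictionary_files = ('drug_dict.txt');
my @dictionaries = init_dictionaries(\@dictionary_files);

my $text = "アスピリンを処方した";            # made-up input text
init_word_intervals($text, \@dictionaries);    # builds one interval tree per dictionary

# per-dictionary B/I/O string for the character at offset 0
print bio_from_intervals(0), "\n";
# coarse character class of a token surface
print letter_type("アスピリン"), "\n";         # prints "katakana"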
6 changes: 3 additions & 3 deletions crf_data_gen/form_text.pl
@@ -6,8 +6,8 @@
binmode STDOUT, ":encoding(utf-8)";

while (<STDIN>) {
$_ =~ s/(\d+)-(\d+)-(\d+)/$1$2$3/;
$_ =~ tr/[0-9a-zA-Z]/[0-9a-zA-Z]/;
s/[\x{0001}]//g;
s/(\d+)-(\d+)-(\d+)/$1$2$3/;
tr/[0-9a-zA-Z%&#<>_\^\/\?\[\]]/[0-9a-zA-Z%&#<>_^/?[]]/;
print $_;
}
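For reference, a minimal sketch of what the retained substitution does to a hyphen-separated number group (the input line is made up):

#!/usr/bin/perl -w
use strict;

my $line = "visit on 2013-03-18\n";     # made-up input line
$line =~ s/(\d+)-(\d+)-(\d+)/$1$2$3/;   # join three hyphen-separated number groups
print $line;                            # prints "visit on 20130318"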

42 changes: 11 additions & 31 deletions crf_data_gen/get_feature.pl
@@ -4,40 +4,24 @@
use Encode;
use MeCab;
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
use Algorithm::AhoCorasick qw(find_all);
use Set::IntervalTree;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

# read dictionaries
my @dictionary_files;
GetOptions ('d=s' => \@dictionary_files);

my @dictionaries = map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @dictionary_files;
my @dictionaries = init_dictionaries(\@dictionary_files);

# read input
open INPUT, $ARGV[0] or die;
binmode INPUT, ":encoding(utf-8)";
my @input = <INPUT>;
my $input_joined = join '', @input;

# search words in dictionaries and convert founds into interval
my @word_intervals = map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
# print "word: $found->{$pos}->[0], $pos to $pos2\n";
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @dictionaries;
init_word_intervals($input_joined, \@dictionaries);

my $pos = 0; # in input_joined

@@ -55,17 +39,13 @@
my $surface = decode("utf-8", $node->{surface});
my $pos_temp = index($input_joined, $surface, $pos);
# print "surface: $surface, pos_prev: $pos, pos_found: $pos_temp\n";
my @bios = map {
my $interval = $_->fetch($pos_temp, $pos_temp + 1);
if (@$interval) {
($interval->[0] == $pos_temp) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
my $bio_str = bio_from_intervals($pos_temp);

my $letter_type = letter_type($surface);

my $last_char = substr $surface, -1;

my $bio_str = join '', @bios;
my @out = map {(defined && $_ ne '') ? $_ : '*'} ($surface, $class1, $class2, $read, $bio_str);
my @out = map {(defined && $_ ne '') ? $_ : '*'} ($surface, $class1, $class2, $read, $bio_str, $letter_type, $last_char);
my $out_str = join " ", @out;
print $out_str, "\n";

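With the two columns added in this commit (letter type and last character), each line that get_feature.pl prints now has seven space-separated fields; a hypothetical token might come out as follows (the values are illustrative, not taken from real data):

# surface  class1  class2  read  BIO-per-dictionary  letter_type  last_char
アスピリン 名詞 一般 アスピリン B katakana ン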
87 changes: 27 additions & 60 deletions crf_data_gen/parse.pl
@@ -6,15 +6,16 @@
#see http://d.hatena.ne.jp/tagomoris/20120918/1347991165 for this configuration
use Getopt::Long qw(:config posix_default no_ignore_case gnu_compat);
use Encode;
use Algorithm::AhoCorasick qw(find_all);
use Set::IntervalTree;
use HTML::Entities;
use FindBin;
use lib "$FindBin::Bin";
use MednlpFeature;
binmode STDOUT, ":encoding(utf-8)";

if ($#ARGV < 0) {die "Usage: parse.pl [-tag t1,t2,...] [-notag t1,t2,...] [-attr] [-d dict]... input.xml\n";}

# command line option handling
my @output_tags = ('a', 't', 'h', 'l', 'x', 'd', 'c', 'C', 'H', 'LOC', 'M', 'M1', 'T', 'T1', 'X'); # default tags to output
my @output_tags = ('mn', 'ms', 'm', 'a', 't', 'h', 'l', 'x', 'd', 'cn', 'cs', 'c', 'C', 'H', 'LOC', 'M', 'M1', 'T', 'T1', 'X'); # default tags to output
my @no_output_tags = (); # exclusion

my $to_use_modality = 0;
@@ -34,7 +35,7 @@

init_output_tags();
my @dictionaries = init_dictionaries(\@dictionary_files);
my @word_intervals = init_word_intervals($text, \@dictionaries);
init_word_intervals($text, \@dictionaries);


&traverse($doc);
@@ -65,40 +66,14 @@ sub init_output_tags {
! $found} @output_tags;
}

sub init_dictionaries {
my $dictionary_files = shift;
return map {
open DICT, $_ or die;
binmode DICT, ":encoding(utf-8)";
my @dict = <DICT>;
close DICT;
map {chomp;} @dict;
\@dict;
} @$dictionary_files;
}

# search words in dictionaries and convert founds into interval
sub init_word_intervals {
my $input_joined = shift;
my $dictionaries = shift;
return map {
my $interval = Set::IntervalTree->new;
my $found = find_all($input_joined, @{$_});
foreach my $pos (keys %$found) {
my $pos2 = $pos + length($found->{$pos}->[0]);
$interval->insert($pos, $pos, $pos2);
}
$interval;
} @$dictionaries;
}

sub print_iob_sequence {
sub iob_sequence_to_string {
my $iobs_ref = shift;
foreach (@$iobs_ref) {
my @iob_strs = map {
my @iob = @$_;
@iob = map {(defined && $_ ne '') ? $_ : '*'} @iob; # avoid empty column
print ((join ' ', @iob)."\n");
}
join ' ', @iob;
} (@$iobs_ref);
return join "\n", @iob_strs;
}

sub get_iob_sequence {
@@ -117,18 +92,10 @@ sub get_iob_sequence {
next if ($class1 =~ /BOS|EOS/);

my $surface = decode("utf8", $node->{surface});

# generate feature from dictionary
my $pos_temp = index($text, $surface, $pos_in_text);
my @iobs_from_dict = map {
my $interval = $_->fetch($pos_temp, $pos_temp + 1);
if (@$interval) {
($interval->[0] == $pos_temp) ? 'B' : 'I';
} else {
'O';
}
} @word_intervals;
my $iobs_from_dict_str = join '', @iobs_from_dict;
my $iobs_from_dict_str = bio_from_intervals($pos_temp);
my $letter_type = letter_type($surface);
my $last_char = substr $surface, -1;

# forward position
$pos_in_text = $pos_temp + length($surface);
@@ -138,19 +105,10 @@

my $tag = ($type eq 'O') ? 'O' : ($is_first ? "B-${type}" : "I-${type}");
$is_first = 0;
my @iob = ($surface, $class1, $class2, $read, $iobs_from_dict_str, $tag);
my @iob = ($surface, $class1, $class2, $read, $iobs_from_dict_str, $letter_type, $last_char, $tag);
push @iobs, \@iob;
}

# add a line break for each end of sentences
if (substr ($string, -1) eq "\n") {
my $iob_last = pop @iobs;
if (defined $iob_last) {
push @$iob_last, "\n";
push @iobs, $iob_last;
}
}

return \@iobs;
}

@@ -178,16 +136,25 @@ sub print_leaf {
my $node = shift;
my $name = $node->nodeName();

my $iobs;
if (grep {$name =~ /^$_$/} @output_tags) {
$iobs = get_iob_sequence(get_modality($node), $node->textContent);
my $iobs = get_iob_sequence(get_modality($node), $node->textContent);
my $str = iob_sequence_to_string $iobs;
print "$str\n";
} else {
my $parent_name = $node->parentNode()->nodeName();
unless ($node->hasChildNodes() || grep {$parent_name =~ /^$_$/} @output_tags) {
$iobs = get_iob_sequence('O', $node->textContent);
my $content = $node->textContent;
$content =~ s/^\s+//;
my @contents = split "\n+", $content;
my @strs = map {
my $iobs = get_iob_sequence('O', $_);
iob_sequence_to_string $iobs;
} @contents;
my $output = join "\n\n", @strs;
print $output;
print ($node->textContent =~ /\n$/ ? "\n\n" : "\n") if $output;
}
}
print_iob_sequence $iobs if defined $iobs;
}

sub traverse {
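For comparison with get_feature.pl, each row that get_iob_sequence now builds carries the same seven feature columns plus the gold label derived from the enclosing XML tag; a hypothetical row for the first token inside an <m> element might look like this (again, illustrative values only):

アスピリン 名詞 一般 アスピリン B katakana ン B-m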
10 changes: 4 additions & 6 deletions evaluate/README
@@ -1,17 +1,17 @@
- experiment.pl

usage: experiment.pl [-d dict]... [-d1 depth] [-d2 depth] [-mod] [-h] [-c file] -m model input
usage: experiment.pl [-d dict]... [-d1 depth] [-d2 depth] [-mod] [-c file] -m model input

When a CRF++ model file is available, tags the plain text given as input and turns it into XML.
The model file is given with -m (required).
If a gold standard is given with -c, a comparison file that can be fed to conlleval.pl is also generated (cmp.txt).
If dictionary-match information was used when training the model, give the same dictionaries with -d.
-d1, -d2, -mod and -h are passed on to xml_to_charwise_iob.pl.
-d1, -d2 and -mod are passed on to xml_to_charwise_iob.pl.
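Example (the file names here are illustrative, not part of the repository):

experiment.pl -d dict.txt -m crf.model -c gold.xml input.txt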


- crfout_to_xml.pl

Generates XML from tagging results in conlleval format.
Generates XML from tagging results in conlleval format. Note that no header is attached, so the output is not valid XML.

usage: crfout_to_xml.pl < (conlleval形式ファイル)

@@ -21,7 +21,7 @@ The IOB part is split on '-', and when there is a third element it is the modality

- xml_to_charwise_iob.pl

usage: xml_to_charwise_iob.pl [-d1 depth] [-d2 depth] [-mod] [-h] file1 [file2]
usage: xml_to_charwise_iob.pl [-d1 depth] [-d2 depth] [-mod] file1 [file2]

Converts XML into conlleval format in which each token is a single character.
Two files can also be given as input; in that case each token is output with the tag information from both. If the inputs do not match once tags and whitespace are removed, an error is raised.
@@ -34,5 +34,3 @@ y I-b
is the result.

-mod specifies that the modality value be output.

-h formats the output to be easier for humans to read.
34 changes: 34 additions & 0 deletions normalizer/README
@@ -0,0 +1,34 @@
- normalizer.pl

Usage: normalizer.pl TAG DICT ATTR < INPUT

For TAG tags in the input XML, normalizes their text using the dictionary DICT.
The result is recorded in the ATTR attribute.
Normalization calls sim.pl, which must be located in the same directory as this script.
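Example (the tag, dictionary, and attribute names are illustrative):

normalizer.pl m tables/Master_M.txt normalized < input.xml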


- sim.pl

Usage: sim.pl -m table [-r "A/B"]... INPUT...
Example: sim.pl -m tables/Master_M.txt -r "XX/アク" アクチダス XXチダス ビルレクス

Normalizes the input words.
The -m option (required) specifies the normalization table.
The -r option (repeatable) specifies a partial rewrite of the input, giving the before and after forms separated by a slash.
Multiple inputs may be given.


- date_normalizer.pl

Usage: date_normalizer.pl [-t tag] < INPUT

Feeds each tag that represents a time expression (selectable with the -t option) to date2value.pl in turn, and records the result in the absolute attribute.


- timeline.pl

Usage: timeline.pl [-t tag] [-q] XML

Cuts each tag out of the input and returns XML split at every point where a time tag appears.
-t tag treats tag as the tag representing time (default: t).
-q suppresses printing of the original text corresponding to each event.
