-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
11 changed files
with
266 additions
and
145 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#!/usr/bin/perl -w | ||
package MednlpFeature; | ||
use strict; | ||
use Set::IntervalTree; | ||
use Algorithm::AhoCorasick qw(find_all); | ||
use base qw(Exporter); | ||
our @EXPORT = qw( init_word_intervals bio_from_intervals init_dictionaries letter_type ); | ||
|
||
# File-level state shared by the two interval helpers: assigned by
# init_word_intervals (one Set::IntervalTree per dictionary) and read by
# bio_from_intervals.
my @word_intervals; | ||
|
||
# Search the input for every dictionary's keywords and store the hits as
# interval trees, one tree per dictionary.
#
# Arguments:
#   $input_joined - the text to scan
#   $dictionaries - array ref holding one array ref of keywords per dictionary
#
# For each position reported by find_all, an interval is inserted that starts
# at that position and spans the length of the first keyword listed there;
# the start position is also stored as the interval's value.  The resulting
# trees replace the contents of @word_intervals.
#
# NOTE(review): only the first keyword reported at a position is used, so a
# longer keyword starting at the same position does not widen the interval --
# confirm this is intended.
sub init_word_intervals {
    my ($input_joined, $dictionaries) = @_;

    my @trees;
    for my $keywords (@$dictionaries) {
        my $tree = Set::IntervalTree->new;
        my $hits = find_all($input_joined, @$keywords);

        for my $start (keys %$hits) {
            my $end = $start + length $hits->{$start}->[0];
            $tree->insert($start, $start, $end);
        }

        push @trees, $tree;
    }

    return @word_intervals = @trees;
}
|
||
# Build the B/I/O tag string for one position of the input.
#
# Arguments:
#   $pos - position to look up in every tree of @word_intervals
#
# Returns a string with one character per tree, in tree order:
#   'O' - fetch($pos, $pos + 1) returned nothing for that tree
#   'B' - the first fetched value equals $pos (the trees store each
#         interval's start position as its value)
#   'I' - the first fetched value is some other position
# The result is the empty string when @word_intervals is empty.
sub bio_from_intervals {
    my ($pos) = @_;

    my $tags = '';
    for my $tree (@word_intervals) {
        my $hits = $tree->fetch($pos, $pos + 1);

        if (!@$hits) {
            $tags .= 'O';
            next;
        }

        $tags .= $hits->[0] == $pos ? 'B' : 'I';
    }

    return $tags;
}
|
||
# Load dictionary files into memory.
#
# Arguments:
#   $dictionary_files - array ref of paths to UTF-8 encoded files holding one
#                       entry per line
#
# Returns a list with one array ref per input file, in input order; each
# array holds that file's lines, decoded and with the line ending removed.
#
# Dies with a message naming the file and the OS error when a file cannot be
# opened.
sub init_dictionaries {
    my $dictionary_files = shift;

    return map {
        my $file = $_;

        # Three-argument open with a lexical handle: the file name can never
        # be interpreted as an open mode, and no global handle is clobbered.
        open my $dict_fh, '<:encoding(utf-8)', $file
            or die "Cannot open dictionary file '$file': $!";
        my @dict = <$dict_fh>;
        close $dict_fh;

        chomp @dict;
        \@dict;
    } @$dictionary_files;
}
|
||
# Classify a string by the kind of characters it consists of.
#
# Arguments:
#   $str - the (decoded) string to classify
#
# Returns the first label below whose condition holds for the whole string:
#   'digit'    - only digits and '.'
#   'latin'    - only Latin-script characters
#   'hiragana' - only Hiragana and the prolonged sound mark U+30FC
#   'katakana' - only Katakana and the prolonged sound mark U+30FC
#   'kanji'    - only Han characters
#   'symbol'   - only characters of the Common script
#   'mixed'    - anything else, including the empty string
#
# The prolonged sound mark is written as \x{30FC} rather than as a literal
# character: this file does not enable "use utf8", so a literal would be read
# as three separate bytes and never match U+30FC in decoded text.
sub letter_type {
    my $str = shift;
    if ($str =~ /^[\d.]+$/) {
        return 'digit';
    } elsif ($str =~ /^\p{Latin}+$/) {
        return 'latin';
    } elsif ($str =~ /^[\p{Hiragana}\x{30FC}]+$/) {
        return 'hiragana';
    } elsif ($str =~ /^[\p{Katakana}\x{30FC}]+$/) {
        return 'katakana';
    } elsif ($str =~ /^\p{Han}+$/) {
        return 'kanji';
    } elsif ($str =~ /^\p{Common}+$/) {
        return 'symbol';
    } else {
        return 'mixed';
    }
}
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
- normalizer.pl | ||
|
||
Usage: normalizer.pl TAG DICT ATTR < INPUT | ||
|
||
入力 XML の TAG タグについてそのテキストを辞書 DICT を用いて正規化する. | ||
結果は ATTR 属性に記録される. | ||
正規化においては sim.pl が呼ばれる. sim.pl はこのスクリプトと同じディレクトリにある必要がある. | ||
|
||
|
||
- sim.pl | ||
|
||
Usage: sim.pl -m table [-r "A/B"]... INPUT... | ||
Example: sim.pl -m tables/Master_M.txt -r "XX/アク" アクチダス XXチダス ビルレクス | ||
|
||
入力単語を正規化する. | ||
-m オプション(必須)は正規化テーブルを指定する. | ||
-r オプション(複数指定可)は入力の部分的な変換を, 変換前と後をスラッシュで区切って指定する. | ||
入力は複数指定可である. | ||
|
||
|
||
- date_normalizer.pl | ||
|
||
Usage: date_normalizer.pl [-t tag] < INPUT | ||
|
||
時間を表すタグ(オプション t で指定可)を順に date2value.pl に入力し, 結果を absolute 属性に記録する. | ||
|
||
|
||
- timeline.pl | ||
|
||
Usage: timeline.pl [-t tag] [-q] XML | ||
|
||
入力から各タグを切り出して時間タグが現れるところで分割した XML を返す. | ||
-t tag は tag を時間を表したタグだと見なす(デフォルトは t). | ||
-q は, 各イベントに対応するオリジナルのテキストを表示しないようにする. |
Oops, something went wrong.