
Commit

add verification func for result
bowbowbow committed Sep 13, 2019
1 parent 3eee0b6 commit db784f1
Showing 2 changed files with 42 additions and 19 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -119,7 +119,7 @@ This project uses the same data partitioning as the previous work ([Yang and Mitc
Below is information about the amount of parsed data produced by this project. It differs slightly from the parsing results reported in the two papers above; the difference seems to arise because there are no agreed-upon rules for splitting sentences within the sgm-format files.
|       | Documents | Sentences | Triggers | Arguments | Entity Mentions |
|-------|-----------|-----------|----------|-----------|-----------------|
- | Test  | 40  | 713   | 424  | 878  | 4226  |
- | Dev   | 30  | 875   | 505  | 906  | 4050  |
- | Train | 529 | 14724 | 4420 | 7147 | 53045 |
+ | Test  | 40  | 713   | 424  | 892  | 4226  |
+ | Dev   | 30  | 875   | 505  | 933  | 4050  |
+ | Train | 529 | 14724 | 4420 | 7811 | 53045 |
53 changes: 38 additions & 15 deletions main.py
@@ -50,10 +50,43 @@ def remove_punc(s):
    return start_idx, end_idx


- def find_token_index_v2(words, phrase):
-     start_idx, end_idx = -1, -1
- 
-     return start_idx, end_idx
+ def verify_result(data):
+     def remove_punctuation(s):
+         for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
+             s = s.replace(c, '')
+         s = re.sub(r'[^\w]', '', s)
+         return s
+ 
+     def check_diff(words, phrase):
+         return remove_punctuation(phrase) not in remove_punctuation(words)
+ 
+     for item in data:
+         words = item['words']
+         for entity_mention in item['golden-entity-mentions']:
+             if check_diff(''.join(words[entity_mention['start']:entity_mention['end']]), entity_mention['text'].replace(' ', '')):
+                 print('============================')
+                 print('[Warning] entity has invalid start/end')
+                 print('Expected: ', entity_mention['text'])
+                 print('Actual:', words[entity_mention['start']:entity_mention['end']])
+                 print('start: {}, end: {}, words: {}'.format(entity_mention['start'], entity_mention['end'], words))
+ 
+         for event_mention in item['golden-event-mentions']:
+             trigger = event_mention['trigger']
+             if check_diff(''.join(words[trigger['start']:trigger['end']]), trigger['text'].replace(' ', '')):
+                 print('============================')
+                 print('[Warning] trigger has invalid start/end')
+                 print('Expected: ', trigger['text'])
+                 print('Actual:', words[trigger['start']:trigger['end']])
+                 print('start: {}, end: {}, words: {}'.format(trigger['start'], trigger['end'], words))
+             for argument in event_mention['arguments']:
+                 if check_diff(''.join(words[argument['start']:argument['end']]), argument['text'].replace(' ', '')):
+                     print('============================')
+                     print('[Warning] argument has invalid start/end')
+                     print('Expected: ', argument['text'])
+                     print('Actual:', words[argument['start']:argument['end']])
+                     print('start: {}, end: {}, words: {}'.format(argument['start'], argument['end'], words))
+ 
+     print('Complete verification')


def preprocessing(data_type, files):
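To make the effect of the new check concrete, here is a minimal, self-contained sketch of the same span test on an invented sentence. The `strip_punct`/`span_matches` helpers below are illustrative stand-ins for `remove_punctuation`/`check_diff` above and are not part of the repository:

```python
import re

def strip_punct(s):
    # Mirror of remove_punctuation above: drop PTB bracket tokens and non-word characters.
    for c in ['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-', '\xa0']:
        s = s.replace(c, '')
    return re.sub(r'[^\w]', '', s)

def span_matches(words, start, end, text):
    # A start/end pair is accepted when the annotated text survives inside the joined tokens.
    return strip_punct(text.replace(' ', '')) in strip_punct(''.join(words[start:end]))

# Invented example tokens and mentions, not taken from the dataset.
words = ['Barack', 'Obama', 'visited', 'Seoul', '.']
good = {'text': 'Barack Obama', 'start': 0, 'end': 2}
bad = {'text': 'Barack Obama', 'start': 1, 'end': 2}  # off-by-one start index

for mention in (good, bad):
    ok = span_matches(words, mention['start'], mention['end'], mention['text'])
    print(mention['text'], mention['start'], mention['end'], '->',
          'ok' if ok else '[Warning] entity has invalid start/end')
```
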
@@ -109,10 +142,6 @@ def preprocessing(data_type, files):
                end_pos=position[1] - sent_start_pos + 1,
                phrase=entity_mention['text'],
            )
-           # start_idx, end_idx = find_token_index_v2(
-           #     words=data['words'],
-           #     phrase=entity_mention['text'],
-           # )

            entity_mention['start'] = start_idx
            entity_mention['end'] = end_idx
@@ -131,10 +160,6 @@ def preprocessing(data_type, files):
                end_pos=position[1] - sent_start_pos + 1,
                phrase=event_mention['trigger']['text'],
            )
-           # start_idx, end_idx = find_token_index_v2(
-           #     words=data['words'],
-           #     phrase=event_mention['trigger']['text'],
-           # )

            event_mention['trigger']['start'] = start_idx
            event_mention['trigger']['end'] = end_idx
@@ -151,10 +176,7 @@ def preprocessing(data_type, files):
                    end_pos=position[1] - sent_start_pos + 1,
                    phrase=argument['text'],
                )
-               # start_idx, end_idx = find_token_index_v2(
-               #     words=data['words'],
-               #     phrase=argument['text'],
-               # )

                argument['start'] = start_idx
                argument['end'] = end_idx
                del argument['position']
@@ -172,6 +194,7 @@ def preprocessing(data_type, files):
    print('entity :', entity_count)
    print('argument:', argument_count)

+   verify_result(result)
    with open('output/{}.json'.format(data_type), 'w') as f:
        json.dump(result, f, indent=2)

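For reference, a short sketch of inspecting one of the JSON files this block writes. It assumes preprocessing has already produced `output/test.json` (an assumed path, following the `output/{}.json` pattern above); the field names are the same keys `verify_result` reads:

```python
import json

# Load one of the files emitted by preprocessing().
with open('output/test.json') as f:
    data = json.load(f)

print(len(data), 'parsed sentences')
item = data[0]
print(item['words'])                       # token list for one sentence
print(item['golden-entity-mentions'][:2])  # entity mentions with token-level start/end
print(item['golden-event-mentions'][:1])   # event mentions with trigger and arguments
```
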
