forked from dbpedia/fact-extractor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_soccer_players_articles.py
41 lines (32 loc) · 1.08 KB
/
get_soccer_players_articles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import shutil
import sys
# Gates the diagnostic print statements in extract_soccer_articles.
DEBUG = True
def load_wiki_ids(filein):
    """Read Wiki IDs from a text file, one per line.

    :param filein: path of the file to read
    :return: list with one entry per line of the file, in file order, each
        stripped of leading and trailing whitespace (a blank line yields '')
    """
    # Iterate the file object directly instead of going through readlines(),
    # so the lines are not first collected into a throwaway list.
    with open(filein) as id_file:
        return [line.strip() for line in id_file]
def extract_soccer_articles(soccer_ids, corpus_dir, output_dir):
    """Copy every corpus file whose Wiki ID is in soccer_ids into output_dir.

    Walks corpus_dir recursively. For each file, the first ``id="..."``
    attribute found in its content is taken as the file's Wiki ID. Files
    without such an attribute are skipped.

    :param soccer_ids: iterable of Wiki ID strings to keep
    :param corpus_dir: root directory of the corpus to scan
    :param output_dir: destination handed to shutil.copy; it should be an
        existing directory, otherwise shutil.copy treats it as a file path
    :return: always 0
    """
    # Membership is tested once per corpus file: build a set up front so each
    # test is a hash lookup instead of a scan of the whole ID collection.
    wanted_ids = frozenset(soccer_ids)
    # Compiled once, outside the loops; only the first match per file is used.
    id_pattern = re.compile(r'id="([^"]+)"')
    for path, _subdirs, files in os.walk(corpus_dir):
        for name in files:
            f = os.path.join(path, name)
            with open(f) as article:
                content = article.read()
            match = id_pattern.search(content)
            if match is None:
                # No id attribute in this file: there is nothing to compare,
                # so skip it rather than fail on match.group().
                if DEBUG:
                    print("File = [%s] - no Wiki ID found, skipped" % f)
                continue
            current_id = match.group(1)
            # print is called with a single parenthesised argument so the
            # line behaves the same as a Python 2 statement and a Python 3
            # function call.
            if DEBUG:
                print("File = [%s] - Wiki ID = [%s]" % (f, current_id))
            if current_id in wanted_ids:
                # NOTE(review): the copy keeps only the base file name, so
                # same-named files from different subdirectories overwrite
                # each other in output_dir - confirm the corpus layout.
                shutil.copy(f, output_dir)
                if DEBUG:
                    print("MATCHED! [%s]" % content)
    return 0
if __name__ == "__main__":
    # Command-line entry point: expects exactly three arguments after the
    # script name.
    if len(sys.argv) != 4:
        # Single parenthesised argument: valid both as a Python 2 print
        # statement and as a Python 3 print() call, with identical output.
        print("Usage: %s <SOCCER_IDS> <CORPUS_DIR> <OUTPUT_DIR>" % __file__)
        sys.exit(1)
    ids = load_wiki_ids(sys.argv[1])
    # The value returned by extract_soccer_articles becomes the exit status.
    sys.exit(extract_soccer_articles(ids, sys.argv[2], sys.argv[3]))