-
Notifications
You must be signed in to change notification settings - Fork 0
/
ted_talk_lexical_feature_processor.py
executable file
·86 lines (73 loc) · 2.56 KB
/
ted_talk_lexical_feature_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import csv
import cPickle as cp
from TED_data_location import ted_data_path
from list_of_talks import all_valid_talks
from nltk.tokenize import word_tokenize
def ReadLIWCDictionary(path):
    '''
    Reads a tab-separated LIWC dictionary file.

    Each non-blank line is expected to look like
    ``<entry>\\t<category id>\\t<category id>...``. Leading and trailing
    whitespace of every line is ignored.

    path: location of the dictionary file.
    Returns a dict mapping each entry (the first field, kept verbatim, so
    a trailing "*" is preserved) to the list of its integer category ids.
    Raises ValueError if a field after the first one is not an integer.
    '''
    dic = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path) as f:
        content = f.read()
    for line in content.splitlines():
        parts = line.strip().split("\t")
        # A blank line would otherwise register a bogus '' -> [] entry.
        if not parts[0]:
            continue
        dic[parts[0]] = [int(field) for field in parts[1:]]
    return dic
def ReadLIWCCategories(path):
    '''
    Reads the LIWC category table.

    Each non-blank line is expected to look like
    ``<category id>\\t<category name>``. Records may be separated by
    "\\r", "\\n" or "\\r\\n"; the result is the same in every case.

    path: location of the category file.
    Returns a dict mapping the integer category id to the category name
    (the second tab-separated field). An empty file yields an empty dict.
    Raises ValueError if an id is not an integer and IndexError if a
    non-blank line has no tab-separated name.
    '''
    with open(path) as f:
        content = f.read()
    catdic = {}
    # splitlines() recognises every line-ending convention, so the parse no
    # longer depends on the file being CR-only or on how open() translates
    # newlines.
    for cat in content.splitlines():
        if not cat.strip():
            continue
        catparts = cat.split("\t")
        catdic[int(catparts[0])] = catparts[1]
    return catdic
# Directory under ted_data_path that holds the LIWC resource files.
liwcpath = os.path.join(ted_data_path,'misc/')
# Both tables are loaded once, when this module is imported. LIWCDic is the
# default dictionary of match() and feat(); categories is the default
# category table of feat().
LIWCDic = ReadLIWCDictionary(os.path.join(liwcpath,'liwcdic2007.dic'))
categories = ReadLIWCCategories(os.path.join(liwcpath,'liwccat2007.txt'))
def match(word,LIWCDic=LIWCDic):
    '''
    Looks up the category ids of a word in a LIWC dictionary.

    An exact entry wins. Otherwise every prefix of the word, from the
    shortest to the whole word, is tried with a trailing "*" and the
    first wildcard entry found is used.

    word: the token to look up (compared verbatim, no case folding).
    LIWCDic: dict mapping entries (plain words or "stem*") to lists of
             category ids.
    Returns the list stored for the matching entry, or a new empty list
    when nothing matches.
    '''
    if word in LIWCDic:
        return LIWCDic[word]
    # The upper bound includes len(word) so that the whole word is also tried
    # as a stem: an entry such as "abandon*" has to match "abandon" itself.
    for i in range(1,len(word)+1):
        key = word[:i] + "*"
        if key in LIWCDic:
            return LIWCDic[key]
    return list()
def feat(wrdlist,LIWCDic=LIWCDic,cats=categories):
    '''
    Computes LIWC category frequencies for a list of words.

    wrdlist: sequence of tokens (its length is the normalizer).
    LIWCDic: dict mapping dictionary entries to lists of category ids;
             it is forwarded to match() for every token.
    cats: dict mapping category id to category name.
    Returns a dict keyed by every category name in cats. A category that
    was never hit keeps the integer 0; otherwise the value is the number
    of hits divided by len(wrdlist). Category ids missing from cats are
    ignored.
    '''
    feat_count={cats[acat]:0 for acat in cats}
    # For an empty wrdlist m is 0.0, but the loop body never runs, so no
    # division takes place.
    m = float(len(wrdlist))
    for awrd in wrdlist:
        # Pass the dictionary through; otherwise a caller-supplied LIWCDic
        # would be ignored in favour of match()'s own default.
        for acat in match(awrd,LIWCDic):
            if acat in cats:
                # Word LIWC categories normalized by word count
                feat_count[cats[acat]]+=1./m
    return feat_count
def prepare_lexical_feat(talklist=all_valid_talks,
    featurefile = 'misc/lexical.csv'):
    '''
    Reads, prepares the lexical features and writes in featurefile

    talklist: iterable of talk ids; for each id the pickle
              TED_meta/<id>.pkl under ted_data_path is read. Ids whose
              pickle does not exist are reported and skipped.
    featurefile: path of the output CSV, relative to ted_data_path.

    The CSV has a TalkID column followed by the feature names returned by
    feat() in sorted order; the header is derived from the first talk that
    is processed. If no talk is processed the file is left empty.
    '''
    with open(os.path.join(ted_data_path,featurefile),'wb') as fout:
        writer = None
        for atalk in talklist:
            # Single-argument form: prints exactly what the two-item print
            # statement printed, and stays valid syntax for newer interpreters.
            print('Processing Lexical Features of %s' % (atalk,))
            pklname = os.path.join(ted_data_path,'TED_meta/'+str(atalk)+'.pkl')
            if not os.path.exists(pklname):
                print('Not found %s' % (atalk,))
                continue
            # Pickles are binary data: open them in binary mode and close the
            # handle deterministically instead of leaking it.
            # NOTE(review): unpickling executes arbitrary code; this assumes
            # the TED_meta pickles are trusted, locally produced files.
            with open(pklname,'rb') as fin:
                data = cp.load(fin)
            # 'talk_transcript' is iterated two levels deep (presumably
            # paragraphs of lines) and flattened into one string.
            wrds = word_tokenize(' '.join(aline for apara in \
                data['talk_transcript'] for aline in apara))
            features = feat(wrds)
            # Read the labels for the first time
            if writer is None:
                writer = csv.DictWriter(fout,['TalkID']+\
                    sorted(features.keys()))
                writer.writeheader()
            features['TalkID'] = atalk
            writer.writerow(features)