-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_eml_docs.py
104 lines (84 loc) · 2.96 KB
/
cluster_eml_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
'''
Author: Jaakko Lappalainen, 2013. email: [email protected]
'''
'''
Experimental code that uses MongoDB map-reduce (MR) to summarize text in the
map phase and then group the results in the reduce phase by topic similarity.
Status: not working.
'''
from pymongo import MongoClient
from bson.code import Code
#require "sinatra"
#require "rdf"
#require "sparql/client"
#require "sparql"
#require "uri"
import logging
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Handle on the 'eml_docs' database of the MongoDB server at localhost:27777.
db = MongoClient(host='localhost', port=27777)['eml_docs']
def cls():
    '''Crudely clear the terminal by printing 100 newline characters.'''
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("\n" * 100)


# Clear the screen once when the script starts.
cls()
def getText(raw):
    '''Unwrap nested sequences until a string is reached and return it.

    Repeatedly replaces ``raw`` with ``raw[0]`` until the value is a string.

    Args:
        raw: A string, or an indexable container (arbitrarily nested) whose
            first element eventually leads to a string.

    Returns:
        The first string found by following index ``0`` at every level.

    Raises:
        IndexError, KeyError, TypeError: whatever ``raw[0]`` raises when a
            level is empty, has no key ``0``, or is not indexable.
    '''
    try:
        string_types = (basestring,)  # Python 2: both str and unicode
    except NameError:
        string_types = (str, bytes)  # Python 3
    # Indexing a string yields another string, so comparing the class name to
    # 'unicode' never terminated for byte strings (or on Python 3); stop on
    # any string type instead.
    while not isinstance(raw, string_types):
        raw = raw[0]
    return raw
def listProjectFields(field):
    '''Collect one project field from every document in ``db.eml_docs``.

    Args:
        field: Key looked up in each document's ``['dataset']['project']``.

    Returns:
        A list of ``[value, _id]`` pairs, one for each document in which the
        lookup succeeds. Documents where any key along the path is missing
        are skipped.
    '''
    pairs = []
    for doc in db.eml_docs.find():
        try:
            entry = [doc['dataset']['project'][field], doc['_id']]
        except KeyError:
            # Best-effort scan: not every document carries this field.
            continue
        pairs.append(entry)
    return pairs
def listGeographicDescription():
    '''Collect the geographic description of every document in ``db.eml_docs``.

    Reads ``['dataset']['project']['studyAreaDescription'][0][0]
    ['geographicDescription']`` from each document.

    Returns:
        A list of ``[geographicDescription, _id]`` pairs, one for each
        document in which the lookup succeeds. Documents where a key is
        missing or one of the indexed lists is empty are skipped.
    '''
    pairs = []
    for doc in db.eml_docs.find():
        try:
            area = doc['dataset']['project']['studyAreaDescription'][0][0]
            entry = [area['geographicDescription'], doc['_id']]
        except (KeyError, IndexError):
            # Best-effort scan: an empty list at [0] means "no description",
            # just like a missing key, and must not abort the whole scan.
            continue
        pairs.append(entry)
    return pairs
def listAbstracts():
    '''Collect the project abstract of every document in ``db.eml_docs``.

    Reads ``['dataset']['project']['abstract'][0][0]`` from each document.

    Returns:
        A list of ``[abstract, _id]`` pairs, one for each document in which
        the lookup succeeds. Documents where a key is missing or one of the
        indexed lists is empty are skipped.
    '''
    pairs = []
    for doc in db.eml_docs.find():
        try:
            entry = [doc['dataset']['project']['abstract'][0][0], doc['_id']]
        except (KeyError, IndexError):
            # Best-effort scan: an empty list at [0] means "no abstract",
            # just like a missing key, and must not abort the whole scan.
            continue
        pairs.append(entry)
    return pairs
# Map function (JavaScript source wrapped in a bson Code object).
# For every property name `i` of the document's 'dataset' sub-document it
# calls sum({ 'corpus': i }) and emits the result as the key, with the
# dataset's title as the value.
# NOTE(review): `for (var i in ...)` yields property *names*, so the corpus
# handed to sum() is the key name, not the text stored under it - presumably
# this['dataset'][i] was intended; confirm.
# NOTE(review): assumes a 'sum' module is loadable through require() inside
# MongoDB's server-side JavaScript - TODO confirm; the module docstring says
# this script is not working.
summarizer = Code("function summarizer() {"
" for(var i in this['dataset']) {"
" var sum = require( 'sum' );"
" var summary = sum({ 'corpus': i });"
" emit(summary, this['dataset']['title']);"
" }"
" }")
# Define the function for the nested map reduce.
#clusterize = Code("function clusterize(key, values) {"
# " return mapReduce(compare, countClusters, {'out': test})"
# " }")
# This reducer executes a summarizer for each value emitted. The summarization is done
# using map reduce.
#tagcomparator = Code("function (key, values) {"
# " var total = 0;"
# " for (var i = 0; i < values.length; i++) {"
# " clusterize(key,values);"
# " }"
# " return total;"
# " }")
# This mapper summarizes the input and emits the summary made with sum.js
#compareMap = Code ("function compare() { "
# " var sum = require( 'sum' );"
# " var abstract = sum({ 'corpus': this });"
# " emit(abstract, 1);"
# " }")
# Reduce function (JavaScript source wrapped in a bson Code object): returns
# the number of values it receives for a key.
# NOTE(review): the mapper emits titles as values while this returns a
# number; if MongoDB invokes reduce more than once for the same key (feeding
# earlier results back in), values.length would not be the total count -
# verify against the MongoDB map-reduce requirements for reduce functions.
count = Code ("function (key, values) {"
" return values.length;"
"}")
# Run the map-reduce job with the `summarizer` mapper and `count` reducer,
# store the output in the "MROutput" collection, then dump every output
# document to stdout.
# NOTE(review): the job targets `db.eml_docs.all`, whereas the list* helpers
# in this file read from `db.eml_docs` - confirm which collection is intended.
result = db.eml_docs.all.map_reduce(summarizer, count, "MROutput")
for doc in result.find():
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(doc)