-
Notifications
You must be signed in to change notification settings - Fork 0
/
es_loader.py
executable file
·224 lines (191 loc) · 8.67 KB
/
es_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python3
import argparse
import os
import yaml
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.helpers import streaming_bulk
from requests_aws4auth import AWS4Auth
from botocore.session import Session
from neo4j import GraphDatabase
from bento.common.utils import get_logger, print_config
from icdc_schema import ICDC_Schema, PROPERTIES, ENUM, PROP_ENUM, PROP_TYPE, REQUIRED, DESCRIPTION
from props import Props
logger = get_logger('ESLoader')
class ESLoader:
    """Load Neo4j query results, about-page content and data-model metadata into Elasticsearch.

    An instance owns one Elasticsearch client and borrows a Neo4j driver
    supplied by the caller. The data model is optional: call ``read_model``
    before ``load_model`` to make model indices available.
    """

    def __init__(self, es_host, neo4j_driver):
        """Create the Elasticsearch client for ``es_host``.

        Args:
            es_host: Elasticsearch host. When it contains ``amazonaws.com``
                the client is configured with ``AWS4Auth`` (credentials from
                the botocore session, region ``us-east-1``, service ``es``)
                over SSL with certificate verification.
            neo4j_driver: Neo4j driver used by ``get_data`` to open sessions.
        """
        self.neo4j_driver = neo4j_driver
        # Set by read_model(). Initialised here so load_model() can report
        # "model not loaded" instead of failing with AttributeError.
        self.model = None
        if 'amazonaws.com' in es_host:
            awsauth = AWS4Auth(
                refreshable_credentials=Session().get_credentials(),
                region='us-east-1',
                service='es'
            )
            self.es_client = Elasticsearch(
                hosts=[es_host],
                http_auth=awsauth,
                use_ssl=True,
                verify_certs=True,
                connection_class=RequestsHttpConnection
            )
        else:
            self.es_client = Elasticsearch(hosts=[es_host])

    @staticmethod
    def _require_file(file_name):
        """Raise ``Exception`` unless ``file_name`` is an existing regular file."""
        if not os.path.isfile(file_name):
            raise Exception(f'"{file_name}" is not a file!')

    def create_index(self, index_name, mapping):
        """Create index ``index_name`` with ``mapping`` as its field properties.

        The request is sent with ``ignore=400`` so an HTTP 400 response
        (presumably "index already exists" -- confirm against the client
        docs) is returned to the caller instead of being raised.

        Returns:
            The response of ``indices.create``.
        """
        return self.es_client.indices.create(
            index=index_name,
            body={
                "settings": {
                    "number_of_shards": 1,
                    "index.max_result_window": 100000,
                    "index.mapping.nested_objects.limit": 100000,
                },
                "mappings": {
                    "properties": mapping
                },
            },
            ignore=400,
        )

    def delete_index(self, index_name):
        """Delete index ``index_name``, passing ``ignore_unavailable=True``.

        Returns:
            The response of ``indices.delete``.
        """
        return self.es_client.indices.delete(index=index_name, ignore_unavailable=True)

    def get_data(self, cypher_query, fields):
        """Run ``cypher_query`` in a Neo4j session and yield one document per record.

        Each document is a dict containing exactly the keys in ``fields``,
        read from the record. The generator is consumed by ``bulk_load``.
        """
        with self.neo4j_driver.session() as session:
            for record in session.run(cypher_query):
                yield {key: record[key] for key in fields}

    def recreate_index(self, index_name, mapping):
        """Delete ``index_name`` and create it again with ``mapping``, logging both responses."""
        logger.info(f'Deleting old index "{index_name}"')
        result = self.delete_index(index_name)
        logger.info(result)

        logger.info(f'Creating index "{index_name}"')
        result = self.create_index(index_name, mapping)
        logger.info(result)

    def load(self, index_name, mapping, cypher_query):
        """Recreate ``index_name`` and fill it with the results of ``cypher_query``.

        The keys of ``mapping`` select which record fields are indexed.
        """
        self.recreate_index(index_name, mapping)
        logger.info('Indexing data from Neo4j')
        self.bulk_load(index_name, self.get_data(cypher_query, mapping.keys()))

    def bulk_load(self, index_name, data):
        """Index every document from the iterable ``data`` via ``streaming_bulk``.

        Logs how many of the processed documents were reported as successful.
        """
        logger.info('Indexing data in bulk ...')
        successes = 0
        total = 0
        for ok, _ in streaming_bulk(
                client=self.es_client,
                index=index_name,
                actions=data
        ):
            total += 1
            if ok:
                successes += 1
        logger.info(f"Indexed {successes}/{total} documents")

    def load_about_page(self, index_name, mapping, file_name):
        """Recreate ``index_name`` and index each entry of the YAML file ``file_name``.

        The file is expected to contain a sequence of mappings, each with a
        ``page`` key; the document id is ``page`` followed by that value.

        Raises:
            Exception: if ``file_name`` is not an existing file.
        """
        logger.info('Indexing content from about page')
        self._require_file(file_name)

        self.recreate_index(index_name, mapping)
        with open(file_name) as file_obj:
            about_file = yaml.safe_load(file_obj)
            for page in about_file:
                logger.info(f'Indexing about page "{page["page"]}"')
                self.index_data(index_name, page, f'page{page["page"]}')

    def read_model(self, model_files, prop_file):
        """Build the data model from ``model_files`` and ``prop_file`` and store it on ``self.model``.

        Raises:
            Exception: if any model file or the property file is not an
                existing file.
        """
        for file_name in model_files:
            self._require_file(file_name)
        self._require_file(prop_file)

        self.model = ICDC_Schema(model_files, Props(prop_file))

    def load_model(self, index_name, mapping, subtype):
        """Recreate ``index_name`` and fill it with model documents of kind ``subtype``.

        Logs a warning and does nothing when no model has been read.
        """
        logger.info('Indexing data model')
        if not self.model:
            logger.warning(f'Data model is not loaded, {index_name} will not be loaded!')
            return

        self.recreate_index(index_name, mapping)
        self.bulk_load(index_name, self.get_model_data(subtype))

    def get_model_data(self, subtype):
        """Yield search documents describing the loaded data model.

        Args:
            subtype: ``'node'`` yields one document per node; ``'property'``
                yields one per non-relationship property; ``'value'`` yields
                one per enum value of each non-relationship property. Any
                other value yields nothing.
        """
        for node_name, obj in self.model.nodes.items():
            if subtype == 'node':
                yield {
                    'type': 'node',
                    'node': node_name,
                    'node_name': node_name,
                    'node_kw': node_name
                }
                continue

            for prop_name, prop in obj[PROPERTIES].items():
                # Skip relationship based properties
                if "@relation" in prop[PROP_TYPE]:
                    continue
                if subtype == 'property':
                    yield {
                        'type': 'property',
                        'node': node_name,
                        'node_name': node_name,
                        'property': prop_name,
                        'property_name': prop_name,
                        'property_kw': prop_name,
                        'property_description': prop.get(DESCRIPTION, ''),
                        'property_required': prop.get(REQUIRED, False),
                        'property_type': PROP_ENUM if ENUM in prop else prop[PROP_TYPE]
                    }
                elif subtype == 'value' and ENUM in prop:
                    for value in prop[ENUM]:
                        yield {
                            'type': 'value',
                            "node": node_name,
                            "node_name": node_name,
                            "property": prop_name,
                            "property_name": prop_name,
                            'property_description': prop.get(DESCRIPTION, ''),
                            'property_required': prop.get(REQUIRED, False),
                            'property_type': PROP_ENUM,
                            "value": value,
                            "value_kw": value
                        }

    def index_data(self, index_name, object, id):
        """Index the single document ``object`` into ``index_name`` under ``id``."""
        # Parameter names shadow builtins but are kept for caller compatibility.
        self.es_client.index(index=index_name, body=object, id=id)
def main(args=None):
    """Command-line entry point: load every configured index into Elasticsearch.

    Reads the ``Indices`` list from the indices file and the ``Config``
    mapping from the configuration file, then loads each index according to
    its ``type`` (``neo4j`` when absent, ``about_file`` or ``model``).

    Args:
        args: Optional list of command-line arguments. When empty or
            ``None`` the arguments are taken from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Load data from Neo4j to Elasticsearch')
    parser.add_argument('indices_file',
                        type=argparse.FileType('r'),
                        help='Configuration file for indices, example is in config/es_indices.example.yml')
    parser.add_argument('config_file',
                        type=argparse.FileType('r'),
                        help='Configuration file, example is in config/es_loader.example.yml')
    # parse_args(None) reads sys.argv, which is also what an empty list did before.
    args = parser.parse_args(args or None)

    # argparse.FileType opens the files eagerly; close them once parsed.
    try:
        config = yaml.safe_load(args.config_file)['Config']
        indices = yaml.safe_load(args.indices_file)['Indices']
    finally:
        args.config_file.close()
        args.indices_file.close()
    print_config(logger, config)

    neo4j_driver = GraphDatabase.driver(
        config['neo4j_uri'],
        auth=(config['neo4j_user'], config['neo4j_password']),
        encrypted=False
    )
    # Release the driver's connections even if loading fails part-way.
    try:
        loader = ESLoader(
            es_host=config['es_host'],
            neo4j_driver=neo4j_driver
        )

        model_loaded = False
        if config.get('model_files') and config.get('prop_file'):
            loader.read_model(config['model_files'], config['prop_file'])
            model_loaded = True

        for index in indices:
            index_type = index.get('type', 'neo4j')
            index_name = index['index_name']
            if index_type == 'neo4j':
                loader.load(index_name, index['mapping'], index['cypher_query'])
            elif index_type == 'about_file':
                if 'about_file' in config:
                    loader.load_about_page(index_name, index['mapping'], config['about_file'])
                else:
                    logger.warning(f'"about_file" not set in configuration file, {index_name} will not be loaded!')
            elif index_type == 'model':
                if not model_loaded:
                    logger.warning(f'"model_files" not set in configuration file, {index_name} will not be loaded!')
                elif 'subtype' not in index:
                    # Previously reported as a missing "model_files" setting.
                    logger.warning(f'"subtype" not set for index, {index_name} will not be loaded!')
                else:
                    loader.load_model(index_name, index['mapping'], index['subtype'])
            else:
                logger.error(f'Unknown index type: "{index_type}"')
    finally:
        neo4j_driver.close()
# Run the loader only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()