diff --git a/src/import/article.py b/src/import/article.py index e3e3d41..71570b3 100644 --- a/src/import/article.py +++ b/src/import/article.py @@ -13,24 +13,17 @@ class Article: Represents a single article. """ - def __init__(self, article_id, data): + def __init__(self, article_id, headers, body): self.from_name = None self.from_email = None self.date = None + self.headers = headers self.article_id = article_id - self.location = data['folder'] - self.path = data['path'] - self.newsgroups = data['newsgroups'].split(',') - self.subject = data['subject'] - self.message_id = data['message_id'] - self.x_gateway = data['x_gateway'] - self.lines = data['lines'] - self.xref = data['xref'] - self.body = data['body'] - self.references = data['references'] - - self.set_from(data['from_raw']) - self.set_date(data['date']) + self.body = body + self.references = headers['references'] + + self.set_from(headers['from_raw']) + self.set_date(headers['date']) def set_date(self, date): """ @@ -78,18 +71,18 @@ def to_dict(self): """ return { 'id': self.article_id, - 'path': self.path, - 'folder': self.location, + 'path': self.headers['path'], + 'folder': self.headers['location'], 'from_name': self.from_name, 'from_email': self.from_email, - 'newsgroups': self.newsgroups, - 'subject': self.subject, - 'message_id': self.message_id, + 'newsgroups': self.headers['newsgroups'], + 'subject': self.headers['subject'], + 'message_id': self.headers['subject'], 'date': self.date.isoformat(), 'year': self.date.year, - 'x_gateway': self.x_gateway, - 'lines': self.lines, - 'xref': self.xref, + 'x_gateway': self.headers['x_gateway'], + 'lines': self.headers['lines'], + 'xref': self.headers['xref'], 'references': self.references, 'body': self.body, } @@ -126,7 +119,7 @@ def from_file(path): path_parts = pathstr.split('/') article_id = '-'.join(path_parts[-3:]) - data = { + headers = { 'path': msg['Path'], 'folder': '/'.join(path_parts[-3:-1]), 'from_raw': msg['From'], @@ -138,10 +131,10 @@ def from_file(path): 'lines': msg['Lines'], 'xref': msg['X-Reference'], 'references': msg.get('References', ''), - 'body': body.strip(), } return Article( article_id, - data + headers, + body.strip() ) diff --git a/src/import/test_import.py b/src/import/test_import.py index 287e6bd..2e65630 100755 --- a/src/import/test_import.py +++ b/src/import/test_import.py @@ -139,8 +139,8 @@ def import_zip(file): bulks = list(split_bulk(article_locations, 250)) print(f"Processing {len(bulks)} bulks from {file}") - with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: - executor.map(process_bulk, bulks) + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as bulk_executor: + bulk_executor.map(process_bulk, bulks) print(f"Imported {len(article_locations)} articles from {file}") return len(article_locations) @@ -151,5 +151,5 @@ def import_zip(file): zipfiles = Path("data").rglob("*.zip") - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - executor.map(import_zip, zipfiles) + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as zip_executor: + zip_executor.map(import_zip, zipfiles) diff --git a/src/service/elastic_index.py b/src/service/elastic_index.py index e7eb4a4..6d751a4 100644 --- a/src/service/elastic_index.py +++ b/src/service/elastic_index.py @@ -11,6 +11,10 @@ class Index: + """ + An elasticsearch index of articles. + """ + def __init__(self, config): self.config = config self.client = Elasticsearch([{"host": self.config["url"]}]) @@ -18,14 +22,14 @@ def __init__(self, config): @staticmethod def no_case(str_in): """ - Create query from string, case insensitive. + Create query from string, case-insensitive. :param str_in: :return: """ - str = str_in.strip() + string = str_in.strip() ret_str = "" - if str != "": - for char in str: + if string != "": + for char in string: ret_str = ret_str + "[" + char.upper() + char.lower() + "]" return ret_str + ".*" @@ -44,7 +48,9 @@ def make_matches(searchvalues): elif item["field"] in ["year", "lines"]: range_values = item["values"][0] r_array = range_values.split('-') - must_collection.append({"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}}) + must_collection.append( + {"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}} + ) else: for value in item["values"]: must_collection.append({"match": {item["field"] + ".keyword": value}}) @@ -91,11 +97,10 @@ def get_facet(self, field, amount, facet_filter, search_values): return [{"key": hits["key"], "doc_count": hits["doc_count"]} for hits in response["aggregations"]["names"]["buckets"]] - def get_filter_facet(self, field, amount, facet_filter): + def get_filter_facet(self, field, facet_filter): """ Get a filter facet. :param field: - :param amount: :param facet_filter: :return: """ @@ -129,12 +134,11 @@ def get_filter_facet(self, field, amount, facet_filter): ret_array.append(buffer) return ret_array - def get_nested_facet(self, field, amount, facet_filter): + def get_nested_facet(self, field, amount): """ Get a nested facet. :param field: :param amount: - :param facet_filter: :return: """ ret_array = [] @@ -142,9 +146,33 @@ def get_nested_facet(self, field, amount, facet_filter): response = self.client.search( index="articles", body= - {"size": 0, "aggs": {"nested_terms": {"nested": {"path": path}, "aggs": { - "filter": {"filter": {"regexp": {"$field.raw": "$filter.*"}}, - "aggs": {"names": {"terms": {"field": "$field.raw", "size": amount}}}}}}}} + { + "size": 0, + "aggs": { + "nested_terms": { + "nested": { + "path": path + }, + "aggs": { + "filter": { + "filter": { + "regexp": { + "$field.raw": "$filter.*" + } + }, + "aggs": { + "names": { + "terms": { + "field": "$field.raw", + "size": amount + } + } + } + } + } + } + } + } ) for hits in response["aggregations"]["nested_terms"]["filter"]["names"]["buckets"]: buffer = {"key": hits["key"], "doc_count": hits["doc_count"]} @@ -213,8 +241,8 @@ def browse(self, page, length, search_values): "query": query, "size": length, "from": start, - "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date", - "x_gateway", "lines", "xref", "body", "references"], + "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", + "message_id", "date", "x_gateway", "lines", "xref", "body", "references"], "sort": [ {"date": {"order": "asc"}} ] @@ -265,8 +293,8 @@ def by_message_id(self, message_id): "query": query, "size": 1, "from": 0, - "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date", - "x_gateway", "lines", "xref", "body", "references"], + "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", + "message_id", "date", "x_gateway", "lines", "xref", "body", "references"], "sort": [ {"date": {"order": "asc"}} ] @@ -290,8 +318,9 @@ def get_replies(self, message_id): } response = self.client.search(index="articles", body={ "query": query, - "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date", - "x_gateway", "lines", "xref", "body", "references", "body"], + "_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", + "message_id", "date", "x_gateway", "lines", "xref", "body", "references", + "body"], "sort": [ {"date": {"order": "asc"}} ]