
Commit

linter
jarno-knaw committed Jul 23, 2024
1 parent b7a0f07 commit 7ec3bd2
Showing 3 changed files with 69 additions and 47 deletions.
43 changes: 18 additions & 25 deletions src/import/article.py
@@ -13,24 +13,17 @@ class Article:
Represents a single article.
"""

def __init__(self, article_id, data):
def __init__(self, article_id, headers, body):
self.from_name = None
self.from_email = None
self.date = None
self.headers = headers
self.article_id = article_id
self.location = data['folder']
self.path = data['path']
self.newsgroups = data['newsgroups'].split(',')
self.subject = data['subject']
self.message_id = data['message_id']
self.x_gateway = data['x_gateway']
self.lines = data['lines']
self.xref = data['xref']
self.body = data['body']
self.references = data['references']

self.set_from(data['from_raw'])
self.set_date(data['date'])
self.body = body
self.references = headers['references']

self.set_from(headers['from_raw'])
self.set_date(headers['date'])

def set_date(self, date):
"""
@@ -78,18 +71,18 @@ def to_dict(self):
"""
return {
'id': self.article_id,
'path': self.path,
'folder': self.location,
'path': self.headers['path'],
'folder': self.headers['location'],
'from_name': self.from_name,
'from_email': self.from_email,
'newsgroups': self.newsgroups,
'subject': self.subject,
'message_id': self.message_id,
'newsgroups': self.headers['newsgroups'],
'subject': self.headers['subject'],
'message_id': self.headers['subject'],
'date': self.date.isoformat(),
'year': self.date.year,
'x_gateway': self.x_gateway,
'lines': self.lines,
'xref': self.xref,
'x_gateway': self.headers['x_gateway'],
'lines': self.headers['lines'],
'xref': self.headers['xref'],
'references': self.references,
'body': self.body,
}
@@ -126,7 +119,7 @@ def from_file(path):
path_parts = pathstr.split('/')
article_id = '-'.join(path_parts[-3:])

data = {
headers = {
'path': msg['Path'],
'folder': '/'.join(path_parts[-3:-1]),
'from_raw': msg['From'],
@@ -138,10 +131,10 @@ def from_file(path):
'lines': msg['Lines'],
'xref': msg['X-Reference'],
'references': msg.get('References', ''),
'body': body.strip(),
}

return Article(
article_id,
data
headers,
body.strip()
)
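
For context, a small standalone sketch of the header/body split that from_file now feeds into Article(article_id, headers, body), using only the standard-library email parser. The sample message, its values, and the reduced set of keys are illustrative, not the project's fixtures.

import email

raw = (
    "Path: news.example.org!not-for-mail\n"
    "From: Jane Doe <jane@example.org>\n"
    "Newsgroups: alt.hypertext\n"
    "Subject: Re: linking conventions\n"
    "Message-ID: <123@example.org>\n"
    "Date: Tue, 23 Jul 1996 10:00:00 GMT\n"
    "Lines: 1\n"
    "\n"
    "Body text of the post.\n"
)

msg = email.message_from_string(raw)

# Mirrors the headers dict built in Article.from_file: absent headers read as
# None via msg[...] or as '' via msg.get(..., '').
headers = {
    'path': msg['Path'],
    'from_raw': msg['From'],
    'newsgroups': msg['Newsgroups'],
    'subject': msg['Subject'],
    'message_id': msg['Message-ID'],
    'date': msg['Date'],
    'lines': msg['Lines'],
    'references': msg.get('References', ''),
}
body = msg.get_payload().strip()
print(headers['subject'], '-', body)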
8 changes: 4 additions & 4 deletions src/import/test_import.py
@@ -139,8 +139,8 @@ def import_zip(file):

bulks = list(split_bulk(article_locations, 250))
print(f"Processing {len(bulks)} bulks from {file}")
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
executor.map(process_bulk, bulks)
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as bulk_executor:
bulk_executor.map(process_bulk, bulks)

print(f"Imported {len(article_locations)} articles from {file}")
return len(article_locations)
@@ -151,5 +151,5 @@ def import_zip(file):

zipfiles = Path("data").rglob("*.zip")

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(import_zip, zipfiles)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as zip_executor:
zip_executor.map(import_zip, zipfiles)
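
For reference, a minimal self-contained sketch of the nested ThreadPoolExecutor pattern used in this file, with the inner and outer pools given distinct names so neither shadows the other. The worker counts, inputs, and work functions are placeholders, not the importer's real logic.

import concurrent.futures

def process_bulk(bulk):
    return sum(bulk)  # stand-in for indexing one bulk of articles

def import_zip(file):
    bulks = [[1, 2], [3, 4]]  # stand-in for splitting a zip's articles into bulks
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as bulk_executor:
        list(bulk_executor.map(process_bulk, bulks))
    return len(bulks)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as zip_executor:
    print(list(zip_executor.map(import_zip, ["a.zip", "b.zip"])))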
65 changes: 47 additions & 18 deletions src/service/elastic_index.py
@@ -11,21 +11,25 @@


class Index:
"""
An elasticsearch index of articles.
"""

def __init__(self, config):
self.config = config
self.client = Elasticsearch([{"host": self.config["url"]}])

@staticmethod
def no_case(str_in):
"""
Create query from string, case insensitive.
Create query from string, case-insensitive.
:param str_in:
:return:
"""
str = str_in.strip()
string = str_in.strip()
ret_str = ""
if str != "":
for char in str:
if string != "":
for char in string:
ret_str = ret_str + "[" + char.upper() + char.lower() + "]"
return ret_str + ".*"
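
A quick illustration of the renamed helper above, as a standalone copy, only to make the output shape concrete:

def no_case(str_in):
    # Standalone copy of Index.no_case as it reads after this change.
    string = str_in.strip()
    ret_str = ""
    if string != "":
        for char in string:
            ret_str = ret_str + "[" + char.upper() + char.lower() + "]"
    return ret_str + ".*"

print(no_case("hyper"))  # [Hh][Yy][Pp][Ee][Rr].*
print(no_case("  "))     # .*  (blank input falls through to the bare wildcard)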

@@ -44,7 +48,9 @@ def make_matches(searchvalues):
elif item["field"] in ["year", "lines"]:
range_values = item["values"][0]
r_array = range_values.split('-')
must_collection.append({"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}})
must_collection.append(
{"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}}
)
else:
for value in item["values"]:
must_collection.append({"match": {item["field"] + ".keyword": value}})
@@ -91,11 +97,10 @@ def get_facet(self, field, amount, facet_filter, search_values):
return [{"key": hits["key"], "doc_count": hits["doc_count"]}
for hits in response["aggregations"]["names"]["buckets"]]

def get_filter_facet(self, field, amount, facet_filter):
def get_filter_facet(self, field, facet_filter):
"""
Get a filter facet.
:param field:
:param amount:
:param facet_filter:
:return:
"""
@@ -129,22 +134,45 @@ def get_nested_facet(self, field, amount, facet_filter):
ret_array.append(buffer)
return ret_array

def get_nested_facet(self, field, amount, facet_filter):
def get_nested_facet(self, field, amount):
"""
Get a nested facet.
:param field:
:param amount:
:param facet_filter:
:return:
"""
ret_array = []
path = field.split('.')[0]
response = self.client.search(
index="articles",
body=
{"size": 0, "aggs": {"nested_terms": {"nested": {"path": path}, "aggs": {
"filter": {"filter": {"regexp": {"$field.raw": "$filter.*"}},
"aggs": {"names": {"terms": {"field": "$field.raw", "size": amount}}}}}}}}
{
"size": 0,
"aggs": {
"nested_terms": {
"nested": {
"path": path
},
"aggs": {
"filter": {
"filter": {
"regexp": {
"$field.raw": "$filter.*"
}
},
"aggs": {
"names": {
"terms": {
"field": "$field.raw",
"size": amount
}
}
}
}
}
}
}
}
)
for hits in response["aggregations"]["nested_terms"]["filter"]["names"]["buckets"]:
buffer = {"key": hits["key"], "doc_count": hits["doc_count"]}
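
To make the aggregation shape above concrete, here is a hypothetical request body with example values substituted for the literal "$field.raw" and "$filter.*" strings. The nested path, field name, and prefix regex are assumptions for illustration, not the project's actual mapping.

# Hypothetical body for get_nested_facet("authors.name", 10) with a no_case("jan")
# style prefix filter; the method above would send it via
# self.client.search(index="articles", body=nested_facet_body).
nested_facet_body = {
    "size": 0,
    "aggs": {
        "nested_terms": {
            "nested": {"path": "authors"},
            "aggs": {
                "filter": {
                    "filter": {"regexp": {"authors.name.raw": "[Jj][Aa][Nn].*"}},
                    "aggs": {
                        "names": {"terms": {"field": "authors.name.raw", "size": 10}}
                    },
                },
            },
        },
    },
}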
@@ -213,8 +241,8 @@ def browse(self, page, length, search_values):
"query": query,
"size": length,
"from": start,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references"],
"sort": [
{"date": {"order": "asc"}}
]
@@ -265,8 +293,8 @@ def by_message_id(self, message_id):
"query": query,
"size": 1,
"from": 0,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references"],
"sort": [
{"date": {"order": "asc"}}
]
@@ -290,8 +318,9 @@ def get_replies(self, message_id):
}
response = self.client.search(index="articles", body={
"query": query,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references", "body"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references",
"body"],
"sort": [
{"date": {"order": "asc"}}
]
