From b7a0f070f58cba20c60c087fb14b3f0de1045b4c Mon Sep 17 00:00:00 2001
From: Jarno Bakker <jarno.bakker@di.huc.knaw.nl>
Date: Tue, 23 Jul 2024 13:47:03 +0200
Subject: [PATCH] python linter cleanup

---
 src/import/article.py        | 62 +++++++++++++++++++-----------------
 src/import/test_import.py    | 10 +++---
 src/service/app.py           | 17 ++++++++--
 src/service/elastic_index.py | 62 +++++++++++++++++++++++++++++++-----
 4 files changed, 106 insertions(+), 45 deletions(-)

diff --git a/src/import/article.py b/src/import/article.py
index 3214f40..e3e3d41 100644
--- a/src/import/article.py
+++ b/src/import/article.py
@@ -13,25 +13,24 @@ class Article:
     Represents a single article.
     """
 
-    def __init__(self, articleId, folder, path, from_raw, newsgroups, subject, message_id, date, x_gateway, lines, xref,
-                 body, references):
+    def __init__(self, article_id, data):
         self.from_name = None
         self.from_email = None
         self.date = None
-        self.articleId = articleId
-        self.location = folder
-        self.path = path
-        self.newsgroups = newsgroups.split(',')
-        self.subject = subject
-        self.message_id = message_id
-        self.x_gateway = x_gateway
-        self.lines = lines
-        self.xref = xref
-        self.body = body
-        self.references = references
-
-        self.set_from(from_raw)
-        self.set_date(date)
+        self.article_id = article_id
+        self.location = data['folder']
+        self.path = data['path']
+        self.newsgroups = data['newsgroups'].split(',')
+        self.subject = data['subject']
+        self.message_id = data['message_id']
+        self.x_gateway = data['x_gateway']
+        self.lines = data['lines']
+        self.xref = data['xref']
+        self.body = data['body']
+        self.references = data['references']
+
+        self.set_from(data['from_raw'])
+        self.set_date(data['date'])
 
     def set_date(self, date):
         """
@@ -78,7 +77,7 @@ def to_dict(self):
         :return:
         """
         return {
-            'id': self.articleId,
+            'id': self.article_id,
             'path': self.path,
             'folder': self.location,
             'from_name': self.from_name,
@@ -126,18 +125,23 @@ def from_file(path):
         pathstr = str(path)
         path_parts = pathstr.split('/')
         article_id = '-'.join(path_parts[-3:])
+
+        data = {
+            'path': msg['Path'],
+            'folder': '/'.join(path_parts[-3:-1]),
+            'from_raw': msg['From'],
+            'newsgroups': msg.get('Newsgroups', ''),
+            'subject': msg['Subject'],
+            'message_id': msg['Message-ID'],
+            'date': msg['Date'],
+            'x_gateway': msg.get('X-Gateway', ''),
+            'lines': msg['Lines'],
+            'xref': msg.get('Xref', ''),
+            'references': msg.get('References', ''),
+            'body': body.strip(),
+        }
+
         return Article(
             article_id,
-            '/'.join(path_parts[-3:-1]),
-            msg['Path'],
-            msg['From'],
-            msg.get('Newsgroups', ''),
-            msg['Subject'],
-            msg['Message-ID'],
-            msg['Date'],
-            msg.get('X-Gateway', ''),
-            msg['Lines'],
-            msg.get('Xref', ''),
-            body.strip(),
-            msg.get('References', '')
+            data
         )
diff --git a/src/import/test_import.py b/src/import/test_import.py
index f193027..287e6bd 100755
--- a/src/import/test_import.py
+++ b/src/import/test_import.py
@@ -4,15 +4,16 @@
 This is a test import script which imports data from zipped
 Usenet news files.
 """
 
-from article import Article
 import os
 import sys
 from pathlib import Path
-from elasticsearch import Elasticsearch, helpers
 import tempfile
 import zipfile
 import concurrent.futures
+from article import Article
+from elasticsearch import Elasticsearch, helpers
+from elasticsearch.helpers import BulkIndexError
 import yaml
 
 
@@ -63,7 +64,6 @@ def create_mapping(name):
 
     client.indices.create(
         index=name,
-        ignore=400,
         body=settings,
     )
 
@@ -118,8 +118,8 @@ def process_bulk(filenames):
         for article in articles
     ]
     try:
-        res = helpers.bulk(client, bulk)
-    except BaseException as e:
+        helpers.bulk(client, bulk)
+    except BulkIndexError as e:
         print("Error!")
         print(e.__class__.__name__)
         print(e)
diff --git a/src/service/app.py b/src/service/app.py
index 0097725..54f197e 100644
--- a/src/service/app.py
+++ b/src/service/app.py
@@ -1,3 +1,8 @@
+"""
+The Flask application, with routes for handling the
+back-end of the application.
+"""
+
 import os
 from elastic_index import Index
 from flask import Flask, request, jsonify
@@ -35,12 +40,13 @@ def catch_all():
     return app.send_static_file("index.html")
 
 
-@app.route("/detail/<id>")
-def detail(id):
+@app.route("/detail/<article_id>")
+def detail(article_id):
     """
     Return the front-end, pages are handled by React
     :return:
     """
+    app.logger.info("Requesting page for id: %s", article_id)
     return app.send_static_file("index.html")
 
 
@@ -66,7 +72,12 @@ def get_facet():
     :return:
     """
     struc = request.get_json()
-    ret_struc = index.get_facet(struc["name"], struc["amount"], struc["filter"], struc["searchvalues"])
+    ret_struc = index.get_facet(
+        struc["name"],
+        struc["amount"],
+        struc["filter"],
+        struc["searchvalues"]
+    )
     return jsonify(ret_struc)
 
 
diff --git a/src/service/elastic_index.py b/src/service/elastic_index.py
index 28c234b..e7eb4a4 100644
--- a/src/service/elastic_index.py
+++ b/src/service/elastic_index.py
@@ -1,8 +1,11 @@
-import json
+"""
+elastic_index.py
+This includes class Index for dealing with Elasticsearch.
+Contains methods for finding articles.
+"""
 
 import yaml
 from elasticsearch import Elasticsearch
-import string
 import math
 from werkzeug.exceptions import NotFound
 
@@ -12,7 +15,13 @@ def __init__(self, config):
         self.config = config
         self.client = Elasticsearch([{"host": self.config["url"]}])
 
-    def no_case(selfself, str_in):
+    @staticmethod
+    def no_case(str_in):
+        """
+        Create query from string, case insensitive.
+        :param str_in:
+        :return:
+        """
         str = str_in.strip()
         ret_str = ""
         if str != "":
@@ -22,6 +31,11 @@ def no_case(selfself, str_in):
 
     @staticmethod
     def make_matches(searchvalues):
+        """
+        Create match queries.
+        :param searchvalues:
+        :return:
+        """
         must_collection = []
         for item in searchvalues:
             if item["field"] == "FREE_TEXT":
@@ -37,6 +51,14 @@ def make_matches(searchvalues):
         return must_collection
 
     def get_facet(self, field, amount, facet_filter, search_values):
+        """
+        Get a facet.
+        :param field:
+        :param amount:
+        :param facet_filter:
+        :param search_values:
+        :return:
+        """
         terms = {
             "field": field + ".keyword",
             "size": amount,
@@ -46,8 +68,6 @@ def get_facet(self, field, amount, facet_filter, search_values):
         }
 
         if facet_filter:
-            # filtered_filter = facet_filter.translate(str.maketrans('', '', string.punctuation))
-            # filtered_filter = ''.join([f"[{char.upper()}{char.lower()}]" for char in filtered_filter])
             filtered_filter = ''.join([f"[{char.upper()}{char.lower()}]" for char in facet_filter])
             terms["include"] = f'.*{filtered_filter}.*'
 
@@ -72,6 +92,13 @@ def get_facet(self, field, amount, facet_filter, search_values):
                 for hits in response["aggregations"]["names"]["buckets"]]
 
     def get_filter_facet(self, field, amount, facet_filter):
+        """
+        Get a filter facet.
+        :param field:
+        :param amount:
+        :param facet_filter:
+        :return:
+        """
         ret_array = []
         response = self.client.search(
             index="articles",
@@ -103,6 +130,13 @@ def get_filter_facet(self, field, amount, facet_filter):
         return ret_array
 
     def get_nested_facet(self, field, amount, facet_filter):
+        """
+        Get a nested facet.
+        :param field:
+        :param amount:
+        :param facet_filter:
+        :return:
+        """
         ret_array = []
         path = field.split('.')[0]
         response = self.client.search(
@@ -154,6 +188,13 @@ def get_min_max(self, fields):
         return tmp
 
     def browse(self, page, length, search_values):
+        """
+        Search for articles.
+        :param page:
+        :param length:
+        :param search_values:
+        :return:
+        """
         int_page = int(page)
         start = (int_page - 1) * length
 
@@ -184,6 +225,11 @@ def browse(self, page, length, search_values):
                 "items": [item["_source"] for item in response["hits"]["hits"]]}
 
     def get_facets(self):
+        """
+        Get all facets. Parses the configuration YAML file for determining
+        what facets are available.
+        :return:
+        """
         with open("fields.yaml", 'r') as stream:
             data = yaml.safe_load(stream)
         tmp = {}
@@ -252,13 +298,13 @@ def get_replies(self, message_id):
         })
         return response["hits"]['hits']
 
-    def by_id(self, id):
+    def by_id(self, article_id):
         """
         Get an article by id
-        :param id:
+        :param article_id:
         :return:
         """
-        res = self.client.get(index='articles', id=id)
+        res = self.client.get(index='articles', id=article_id)
         if not res['found']:
             raise NotFound('Article not found')