
Commit

linter
jarno-knaw committed Jul 23, 2024
1 parent b7a0f07 commit 7ec3bd2
Showing 3 changed files with 69 additions and 47 deletions.
43 changes: 18 additions & 25 deletions src/import/article.py
@@ -13,24 +13,17 @@ class Article:
Represents a single article.
"""

def __init__(self, article_id, data):
def __init__(self, article_id, headers, body):
self.from_name = None
self.from_email = None
self.date = None
self.headers = headers
self.article_id = article_id
self.location = data['folder']
self.path = data['path']
self.newsgroups = data['newsgroups'].split(',')
self.subject = data['subject']
self.message_id = data['message_id']
self.x_gateway = data['x_gateway']
self.lines = data['lines']
self.xref = data['xref']
self.body = data['body']
self.references = data['references']

self.set_from(data['from_raw'])
self.set_date(data['date'])
self.body = body
self.references = headers['references']

self.set_from(headers['from_raw'])
self.set_date(headers['date'])

def set_date(self, date):
"""
@@ -78,18 +71,18 @@ def to_dict(self):
"""
return {
'id': self.article_id,
'path': self.path,
'folder': self.location,
'path': self.headers['path'],
'folder': self.headers['location'],
'from_name': self.from_name,
'from_email': self.from_email,
'newsgroups': self.newsgroups,
'subject': self.subject,
'message_id': self.message_id,
'newsgroups': self.headers['newsgroups'],
'subject': self.headers['subject'],
'message_id': self.headers['subject'],
'date': self.date.isoformat(),
'year': self.date.year,
'x_gateway': self.x_gateway,
'lines': self.lines,
'xref': self.xref,
'x_gateway': self.headers['x_gateway'],
'lines': self.headers['lines'],
'xref': self.headers['xref'],
'references': self.references,
'body': self.body,
}
@@ -126,7 +119,7 @@ def from_file(path):
path_parts = pathstr.split('/')
article_id = '-'.join(path_parts[-3:])

data = {
headers = {
'path': msg['Path'],
'folder': '/'.join(path_parts[-3:-1]),
'from_raw': msg['From'],
@@ -138,10 +131,10 @@ def from_file(path):
'lines': msg['Lines'],
'xref': msg['X-Reference'],
'references': msg.get('References', ''),
'body': body.strip(),
}

return Article(
article_id,
data
headers,
body.strip()
)
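
For context, a small standalone sketch of the header/body split that from_file now feeds into Article(article_id, headers, body), using only the standard-library email parser. The sample message, its values, and the reduced set of keys are illustrative, not the project's fixtures.

import email

raw = (
    "Path: news.example.org!not-for-mail\n"
    "From: Jane Doe <jane@example.org>\n"
    "Newsgroups: alt.hypertext\n"
    "Subject: Re: linking conventions\n"
    "Message-ID: <123@example.org>\n"
    "Date: Tue, 23 Jul 1996 10:00:00 GMT\n"
    "Lines: 1\n"
    "\n"
    "Body text of the post.\n"
)

msg = email.message_from_string(raw)

# Mirrors the headers dict built in Article.from_file: absent headers read as
# None via msg[...] or as '' via msg.get(..., '').
headers = {
    'path': msg['Path'],
    'from_raw': msg['From'],
    'newsgroups': msg['Newsgroups'],
    'subject': msg['Subject'],
    'message_id': msg['Message-ID'],
    'date': msg['Date'],
    'lines': msg['Lines'],
    'references': msg.get('References', ''),
}
body = msg.get_payload().strip()
print(headers['subject'], '-', body)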
8 changes: 4 additions & 4 deletions src/import/test_import.py
@@ -139,8 +139,8 @@ def import_zip(file):

bulks = list(split_bulk(article_locations, 250))
print(f"Processing {len(bulks)} bulks from {file}")
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
executor.map(process_bulk, bulks)
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as bulk_executor:
bulk_executor.map(process_bulk, bulks)

print(f"Imported {len(article_locations)} articles from {file}")
return len(article_locations)
@@ -151,5 +151,5 @@ def import_zip(file):

zipfiles = Path("data").rglob("*.zip")

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(import_zip, zipfiles)
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as zip_executor:
zip_executor.map(import_zip, zipfiles)
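
For reference, a minimal self-contained sketch of the nested ThreadPoolExecutor pattern used in this file, with the inner and outer pools given distinct names so neither shadows the other. The worker counts, inputs, and work functions are placeholders, not the importer's real logic.

import concurrent.futures

def process_bulk(bulk):
    return sum(bulk)  # stand-in for indexing one bulk of articles

def import_zip(file):
    bulks = [[1, 2], [3, 4]]  # stand-in for splitting a zip's articles into bulks
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as bulk_executor:
        list(bulk_executor.map(process_bulk, bulks))
    return len(bulks)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as zip_executor:
    print(list(zip_executor.map(import_zip, ["a.zip", "b.zip"])))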
65 changes: 47 additions & 18 deletions src/service/elastic_index.py
@@ -11,21 +11,25 @@


class Index:
"""
An elasticsearch index of articles.
"""

def __init__(self, config):
self.config = config
self.client = Elasticsearch([{"host": self.config["url"]}])

@staticmethod
def no_case(str_in):
"""
Create query from string, case insensitive.
Create query from string, case-insensitive.
:param str_in:
:return:
"""
str = str_in.strip()
string = str_in.strip()
ret_str = ""
if str != "":
for char in str:
if string != "":
for char in string:
ret_str = ret_str + "[" + char.upper() + char.lower() + "]"
return ret_str + ".*"
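
A quick illustration of the renamed helper above, as a standalone copy, only to make the output shape concrete:

def no_case(str_in):
    # Standalone copy of Index.no_case as it reads after this change.
    string = str_in.strip()
    ret_str = ""
    if string != "":
        for char in string:
            ret_str = ret_str + "[" + char.upper() + char.lower() + "]"
    return ret_str + ".*"

print(no_case("hyper"))  # [Hh][Yy][Pp][Ee][Rr].*
print(no_case("  "))     # .*  (blank input falls through to the bare wildcard)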

@@ -44,7 +48,9 @@ def make_matches(searchvalues):
elif item["field"] in ["year", "lines"]:
range_values = item["values"][0]
r_array = range_values.split('-')
must_collection.append({"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}})
must_collection.append(
{"range": {item["field"]: {"gte": r_array[0], "lte": r_array[1]}}}
)
else:
for value in item["values"]:
must_collection.append({"match": {item["field"] + ".keyword": value}})
@@ -91,11 +97,10 @@ def get_facet(self, field, amount, facet_filter, search_values):
return [{"key": hits["key"], "doc_count": hits["doc_count"]}
for hits in response["aggregations"]["names"]["buckets"]]

def get_filter_facet(self, field, amount, facet_filter):
def get_filter_facet(self, field, facet_filter):
"""
Get a filter facet.
:param field:
:param amount:
:param facet_filter:
:return:
"""
@@ -129,22 +134,45 @@ def get_nested_facet(self, field, amount, facet_filter):
ret_array.append(buffer)
return ret_array

def get_nested_facet(self, field, amount, facet_filter):
def get_nested_facet(self, field, amount):
"""
Get a nested facet.
:param field:
:param amount:
:param facet_filter:
:return:
"""
ret_array = []
path = field.split('.')[0]
response = self.client.search(
index="articles",
body=
{"size": 0, "aggs": {"nested_terms": {"nested": {"path": path}, "aggs": {
"filter": {"filter": {"regexp": {"$field.raw": "$filter.*"}},
"aggs": {"names": {"terms": {"field": "$field.raw", "size": amount}}}}}}}}
{
"size": 0,
"aggs": {
"nested_terms": {
"nested": {
"path": path
},
"aggs": {
"filter": {
"filter": {
"regexp": {
"$field.raw": "$filter.*"
}
},
"aggs": {
"names": {
"terms": {
"field": "$field.raw",
"size": amount
}
}
}
}
}
}
}
}
)
for hits in response["aggregations"]["nested_terms"]["filter"]["names"]["buckets"]:
buffer = {"key": hits["key"], "doc_count": hits["doc_count"]}
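
To make the aggregation shape above concrete, here is a hypothetical request body with example values substituted for the literal "$field.raw" and "$filter.*" strings. The nested path, field name, and prefix regex are assumptions for illustration, not the project's actual mapping.

# Hypothetical body for get_nested_facet("authors.name", 10) with a no_case("jan")
# style prefix filter; the method above would send it via
# self.client.search(index="articles", body=nested_facet_body).
nested_facet_body = {
    "size": 0,
    "aggs": {
        "nested_terms": {
            "nested": {"path": "authors"},
            "aggs": {
                "filter": {
                    "filter": {"regexp": {"authors.name.raw": "[Jj][Aa][Nn].*"}},
                    "aggs": {
                        "names": {"terms": {"field": "authors.name.raw", "size": 10}}
                    },
                },
            },
        },
    },
}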
@@ -213,8 +241,8 @@ def browse(self, page, length, search_values):
"query": query,
"size": length,
"from": start,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references"],
"sort": [
{"date": {"order": "asc"}}
]
@@ -265,8 +293,8 @@ def by_message_id(self, message_id):
"query": query,
"size": 1,
"from": 0,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references"],
"sort": [
{"date": {"order": "asc"}}
]
@@ -290,8 +318,9 @@ def get_replies(self, message_id):
}
response = self.client.search(index="articles", body={
"query": query,
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject", "message_id", "date",
"x_gateway", "lines", "xref", "body", "references", "body"],
"_source": ["id", "path", "from_name", "from_email", "newsgroups", "subject",
"message_id", "date", "x_gateway", "lines", "xref", "body", "references",
"body"],
"sort": [
{"date": {"order": "asc"}}
]
