Skip to content

Commit

Permalink
Fixes for legislation importer and other fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
melish committed Mar 15, 2024
1 parent 053d1d8 commit 40796c4
Show file tree
Hide file tree
Showing 26 changed files with 34,133 additions and 7,739 deletions.
5 changes: 3 additions & 2 deletions docker/reprocess_from_db.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/usr/bin/env bash

$PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import legislation --reindex
$PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import legislation --update-text
# outdated, only for manual execution
# $PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import legislation --reindex
# $PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import legislation --update-text

$PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import literature --reindex
$PYTHONPATH/python $ECOLEX_HOME/ecolex/manage.py import literature --update-text
Expand Down
164 changes: 71 additions & 93 deletions ecolex/legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,9 @@
from bs4 import BeautifulSoup
from pysolr import SolrError
from django.conf import settings
from django.utils import timezone
from django.template.defaultfilters import slugify

from ecolex.management.commands.logging import LOG_DICT
from ecolex.management.commands.legislation import LegislationImporter
from ecolex.management.definitions import LEGISLATION
from ecolex.management.utils import EcolexSolr, clean_text_date
from ecolex.models import DocumentText
Expand All @@ -34,12 +32,18 @@
"Long_Title_of_text": "legLongTitle",
"Serial_Imprint": "legSource",

"Date_of_original_Text": "legOriginalYear",
"Date_of_original_Text": "_legOriginalDate",
"Date_of_Text": "legDate",
"Date_of_Consolidation": "_legDateOfConsolidation", # not stored

"Entry_into_Force": "legEntryIntoForce",

"country_ISO": "legCountry_iso",
"country": "_countryCodeAlt",
"organization": "_organization_en",
"organization_fr": "_organization_fr",
"organization_es": "_organization_es",

"Territorial_Subdivision": "legTerritorialSubdivision",
"Sub_file_code": "legSubject_code",
"basin_en": "legBasin_en",
Expand All @@ -49,6 +53,8 @@
"Type_of_Text": "legTypeCode",

"Related_Web_Site": "legRelatedWebSite",
"link_to_full_text": "legLinkToFullText", # TODO: handle multiple files

"Record_Language": "legLanguage_code",
"Doc_Language": "legLanguage_en",

Expand All @@ -66,11 +72,11 @@

MULTIVALUED_FIELDS = [
"legLanguage_en",
"legSubject_code",
"legKeyword_code",
"legBasin_en", "legBasin_fr", "legBasin_es",
"legImplement", "legAmends", "legRepeals", "legImplementTreaty",
"legCitesTreaty",
"legSubject_code",
"legImplement", "legAmends", "legRepeals",
"legImplementTreaty", "legCitesTreaty",
]

LANGUAGE_FIELDS = ["legLanguage_en", "legLanguage_fr", "legLanguage_es"]
Expand All @@ -92,16 +98,16 @@ def harvest_file(upfile):
legislations = []
count_ignored = 0

with open(settings.SOLR_IMPORT["common"]["subjects_xml"], encoding="utf-8") as f:
with open(settings.SOLR_IMPORT["common"]["fao_subjects_xml"], encoding="utf-8") as f:
bs = BeautifulSoup(f.read(), "xml")
subjects = {subject.Classification_Sec_Area.string: subject
for subject in bs.findAll("dictionary_term")}

with open(settings.SOLR_IMPORT["common"]["keywords_xml"], encoding="utf-8") as f:
with open(settings.SOLR_IMPORT["common"]["fao_keywords_xml"], encoding="utf-8") as f:
bs = BeautifulSoup(f.read(), "xml")
keywords = {keyword.Code.string: keyword for keyword in bs.findAll("dictionary_term")}

with open(settings.SOLR_IMPORT["common"]["leg_regions_json"], encoding="utf-8") as f:
with open(settings.SOLR_IMPORT["common"]["fao_regions_json"], encoding="utf-8") as f:
json_regions = json.load(f)

with open(settings.SOLR_IMPORT["common"]["fao_countries_json"], encoding="utf-8") as f:
Expand Down Expand Up @@ -183,68 +189,66 @@ def harvest_file(upfile):
_set_language_fields(legislation, "legKeyword_", keywords)

# overwrite countries with names from the dictionary
iso_country = legislation.get("legCountry_iso")
iso_country = (
legislation.get("legCountry_iso") or
legislation.get("_countryCodeAlt")
)
if iso_country:
fao_country = json_countries.get(iso_country)
if fao_country:
legislation["legCountry_en"] = fao_country.get("en")
legislation["legCountry_es"] = fao_country.get("es")
legislation["legCountry_fr"] = fao_country.get("fr")

region = json_regions.get(fao_country.get("en"))
region = json_regions.get(iso_country)
if region:
legislation["legGeoArea_en"] = region.get("en", [])
legislation["legGeoArea_fr"] = region.get("fr", [])
legislation["legGeoArea_es"] = region.get("es", [])
else:
logger.warning(f"No regions for country {iso_country}")
else:
logger.warning(f"Country not found: {iso_country}")
else:
# exception for the European Union
if legislation.get("_organization_en") == "European Union":
legislation["legCountry_en"] = "European Union"
legislation["legCountry_fr"] = "Union européenne"
legislation["legCountry_es"] = "Unión Europea"
legislation["legGeoArea_en"] = "European Union Countries"
legislation["legGeoArea_fr"] = "Países de la Unión Europea"
legislation["legGeoArea_es"] = "Pays de l'Union Européenne"

legDate = legislation.get("legDate") or legislation.get("_legDateOfConsolidation")
if "_legDateOfConsolidation" in legislation:
del legislation["_legDateOfConsolidation"]

if legDate:
try:
_, solr_format, dateValue = clean_text_date(legDate)
if not dateValue:
if "legDate" in legislation:
del legislation["legDate"]
else:
legislation["legYear"] = dateValue.strftime("%Y")
legislation["legDate"] = solr_format
except Exception as e:
logger.warn(
f"Error parsing legDate {legDate} for {legislation.get('legId')}"
)
_, solr_format, dateValue = clean_text_date(legDate)
if not dateValue or dateValue.year < 1700:
if "legDate" in legislation:
# if check is for record with invalid _legDateOfConsolidation and no legDate
del legislation["legDate"]
else:
legislation["legYear"] = dateValue.strftime("%Y")
legislation["legDate"] = solr_format

if "legOriginalYear" in legislation:
try:
_, solr_format, dateValue = clean_text_date(legislation["legOriginalYear"])
if not dateValue:
del legislation["legOriginalYear"]
else:
legislation["legOriginalYear"] = dateValue.strftime("%Y")
except Exception:
logger.warn(
f"Error parsing legOriginalYear {legislation.get('legOriginalYear')} "
f"for {legislation.get('legId')}"
)
del legislation["legOriginalYear"]

filenames = get_content(document.findall("link_to_full_text"))
url_values = []
for filename in filenames:
if "_legOriginalDate" in legislation:
_, solr_format, dateValue = clean_text_date(legislation["_legOriginalDate"])
if dateValue:
legislation["legOriginalYear"] = dateValue.strftime("%Y")

# XML may contain multiple files, but in ECOLEX it's single valued
if "legLinkToFullText" in legislation:
filename = legislation["legLinkToFullText"]
extension = filename.rsplit(".")[-1].lower()
url = settings.FULL_TEXT_URLS.get(extension)
if url:
url_values.append(f"{url}{filename}")
legislation["legLinkToFullText"] = f"{url}{filename}"
else:
logger.error(f"URL not found for {filename} {legislation.get('legId')}")

if (REPEALED.upper() in
get_content(document.findall(REPEALED))):
legislation["legStatus"] = REPEALED
import ipdb; ipdb.set_trace()
else:
legislation["legStatus"] = IN_FORCE

Expand All @@ -260,66 +264,49 @@ def harvest_file(upfile):
slug = title + " " + legislation.get("legId")
legislation["slug"] = slugify(slug)

for url_value in url_values:
legislation_copy = legislation.copy()
legislation_copy["legLinkToFullText"] = url_value
legislations.append(legislation_copy)
# remove internal attributes
legislations.append({
key: value
for key, value in legislation.items()
if not key.startswith("_")
})

logger.info(f"[Legislation] Harvest file finished.")
add_legislations(legislations, count_ignored)


def add_legislations(legislations, count_ignored):
solr = EcolexSolr()
leg_result = None
count_updated = 0
count_new = 0
config = settings.SOLR_IMPORT
importer_config = config["common"]
importer_config.update(config["legislation"])
importer = LegislationImporter(importer_config)
local_time = timezone.now()

for legislation in legislations:
leg_id = legislation.get("legId")
logger.info(f"[Legislation] {leg_id}")
docs = DocumentText.objects.filter(doc_id=leg_id,
updated_datetime__lt=local_time).order_by("updated_datetime")

if not docs:
doc = DocumentText.objects.create(doc_id=leg_id)
else:
doc = docs[0]

logger.info(f"[Legislation] Adding {leg_id}")
doc, _ = DocumentText.objects.get_or_create(
doc_id=leg_id,
url=legislation.get("legLinkToFullText")
)
doc.doc_type = LEGISLATION
doc.status = DocumentText.INDEXED
legislation["updatedDate"] = (datetime.now()
.strftime("%Y-%m-%dT%H:%M:%SZ"))
try:
leg_result = solr.search(LEGISLATION, leg_id)
if leg_result:
legislation["id"] = leg_result["id"]
except SolrError as e:
logger.error(f"Error importing legislation {leg_id}")
if settings.DEBUG:
logger.exception(e)

doc.parsed_data = json.dumps(legislation)
if (doc.url != legislation.get("legLinkToFullText")):
doc.url = legislation.get("legLinkToFullText")
# will not re-parse if same url and doc_size
doc.doc_size = None

if leg_result:
count_updated += index_and_log(solr, legislation, doc)
else:
count_new += index_and_log(solr, legislation, doc)

try:
leg_existing = solr.search(LEGISLATION, leg_id)
if leg_existing:
legislation["id"] = leg_existing["id"]
solr.add(legislation)
# full-text extraction is done separately
# see LegislationImporter.update_full_text
doc.save()
importer.update_full_text_one(doc)
if leg_existing:
count_updated += 1
else:
count_new += 1
except KeyboardInterrupt:
raise
except Exception as e:
logger.error(f"Error importing legislation {leg_id}")
if settings.DEBUG:
logger.exception(e)

Expand Down Expand Up @@ -387,12 +374,3 @@ def _set_values_from_dict(data, field, local_dict):
new_values["en"].append(val_en)
for field in fields:
data[field] = new_values[field[-2:]]


def index_and_log(solr, legislation, doc):
faolex_enabled = getattr(settings, "FAOLEX_ENABLED", False)
if faolex_enabled:
if not solr.add(legislation):
return 0
doc.parsed_data = ""
return 1
38 changes: 38 additions & 0 deletions ecolex/management/commands/format_countries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import json
import os
from bs4 import BeautifulSoup

from django.core.management.base import BaseCommand
from django.conf import settings

class Command(BaseCommand):
help = "Format countries XML and store data in JSON"

def load_countries(self):
countries_dict = {}
xml_file = os.path.join(settings.CONFIG_DIR, 'fao_countries_ter.xml')
with open(xml_file, encoding="utf-8") as f_in:
bs = BeautifulSoup(f_in.read(), "xml")
for country in bs.findAll("dictionary_term"):
code = (
country.Country_ISO3_Code.string
if country.Country_ISO3_Code
else country.Country_ISO_Code.string
)
countries_dict[code] = {
"en": country.Name_en_US.string,
"fr": country.Name_fr_FR.string,
"es": country.Name_es_ES.string,
}
return countries_dict

def handle(self, *args, **kwargs):
countries_dict = self.load_countries()
json_file = os.path.join(settings.CONFIG_DIR, 'fao_countries.json')
with open(json_file, "w", encoding="utf-8") as f_out:
json.dump(
dict(sorted(countries_dict.items())),
f_out,
indent=2,
ensure_ascii=False,
)
Loading

0 comments on commit 40796c4

Please sign in to comment.