Split CCC refs from content into own element type

nossbigg · Jun 15, 2019 · 8034f8c · 8034f8c
1 parent 98c8540
commit 8034f8c
Show file tree

Hide file tree

Showing 4 changed files with 142 additions and 9 deletions.
diff --git a/src/parse.py b/src/parse.py
@@ -1,9 +1,12 @@
 from scrapers.tocScraper import readTocFromDisk
 from scrapers.pageScraper import readPagesFromDisk
 from scrapers.abbreviationsScraper import readAbbreviationsFromDisk
+
 from parsers.tocParser import parseToc
 from parsers.pageParser import parsePages
+from parsers.specificPagesFixer import fixSpecificPagesHtml
 from parsers.abbreviationsParser import parseAbbreviations
+
 from exporters.jsonExporter import exportStoreAsJson
 from exporters.jsonMetaGenerator import generate_store_meta
 
@@ -14,7 +17,7 @@
 bible_refs, other_refs = parseAbbreviations(abbreviations_html)
 ccc_refs = {'bible': bible_refs, 'other': other_refs}
 
-pages_html_dict = readPagesFromDisk()
+pages_html_dict = fixSpecificPagesHtml(readPagesFromDisk())
 page_nodes_dict = parsePages(pages_html_dict)
 
 meta = generate_store_meta()

diff --git a/src/parsers/contentsParser.py b/src/parsers/contentsParser.py
@@ -3,15 +3,16 @@
 PageContent = namedtuple('PageContent', 'text')
 Paragraph = namedtuple('Paragraph', 'elements attrs')
 
-cccReferencedLineMatcher = re.compile('^[0-9]+ ')
+cccReferenceLineMatcher = re.compile('(^[0-9]+) (.*)')
 
 
 def extractStructuredContents(raw_nodes):
-    result = []
+    paragraphs = []
     for n in raw_nodes:
-        result = result + processElement(n)
+        paragraphs = paragraphs + processElement(n)
 
-    return result
+    paragraphs = [transformCCCReferenceLine(p) for p in paragraphs]
+    return paragraphs
 
 
 def processElement(node):
@@ -65,6 +66,45 @@ def processParagraphChild(node, attrs):
     return []
 
 
+def transformCCCReferenceLine(paragraph):
+    """Split a leading CCC reference number out of a paragraph.
+
+    Anything for which hasCCCReferenceLine() is falsy is returned
+    unchanged. Otherwise a new Paragraph is built whose elements start
+    with the two elements produced by splitCCCReferenceFromTextElement()
+    for the original first element, followed by the original remaining
+    elements. The paragraph's attrs are carried over as-is.
+    """
+    if hasCCCReferenceLine(paragraph):
+        head, tail = paragraph.elements[0], paragraph.elements[1:]
+        ref_element, text_element = splitCCCReferenceFromTextElement(head)
+        return Paragraph([ref_element, text_element] + tail, paragraph.attrs)
+
+    return paragraph
+
+
+def hasCCCReferenceLine(paragraph):
+    """Return True if `paragraph` opens with a CCC reference number.
+
+    Only Paragraph instances whose first element has a 'text' entry that
+    matches cccReferenceLineMatcher qualify; everything else yields False.
+    """
+    if not isinstance(paragraph, Paragraph):
+        return False
+
+    # A paragraph without elements has no first element to inspect;
+    # indexing [0] unguarded would raise IndexError.
+    if not paragraph.elements:
+        return False
+
+    first_element = paragraph.elements[0]
+
+    if 'text' not in first_element:
+        return False
+
+    # Return a real bool instead of leaking the match object (or None).
+    return cccReferenceLineMatcher.match(first_element['text']) is not None
+
+
+def splitCCCReferenceFromTextElement(element):
+    """Split a text element that starts with a CCC reference number.
+
+    Returns a (ccc_ref_element, text_element) tuple: the first is built
+    from the leading number converted to int, the second from the text
+    that follows the separating space. Both factory calls receive the
+    original element's attrs. The element's text must match
+    cccReferenceLineMatcher.
+    """
+    text = element['text']
+    text_match = cccReferenceLineMatcher.match(text)
+    element_attrs = element['attrs']
+
+    ccc_ref_element = createCCCRefElement(
+        int(text_match.group(1)), element_attrs)
+
+    # Slice the remainder off the original string rather than using
+    # group(2): '.' does not match '\n' without re.DOTALL, so group(2)
+    # would silently drop everything after the first line break.
+    remaining_text = text[text_match.end(1) + 1:]
+    new_text_element = createTextElement(remaining_text, element_attrs)
+
+    return ccc_ref_element, new_text_element
+
+
 def unwrapChildren(node, attrs):
     result = []
     for n in node.children:
@@ -91,6 +131,14 @@ def createSpacerElement():
     return {'type': 'spacer'}
 
 
+def createTextElement(text, attrs):
+    """Build a 'text' element dict holding `text` and its `attrs`."""
+    element = {'type': 'text'}
+    element['text'] = text
+    element['attrs'] = attrs
+    return element
+
+
+def createCCCRefElement(ref_number, attrs):
+    """Build a 'ref-ccc' element dict for the given reference number.
+
+    NOTE(review): `attrs` is accepted but not stored in the element --
+    confirm whether that is intentional.
+    """
+    element = {'type': 'ref-ccc'}
+    element['ref_number'] = ref_number
+    return element
+
+
 def createParagraph(node, children):
     attrs = {}
     if isIndentedParagraph(node):
@@ -110,7 +158,3 @@ def isIndentedParagraph(node):
 def isEmptyOutput(node_text):
     text = node_text.replace('\n', "").strip()
     return len(text) == 0
-
-
-def isCCCReferenceLine(node_text):
-    return cccReferencedLineMatcher.match(node_text)
diff --git a/src/parsers/specificPagesFixer.py b/src/parsers/specificPagesFixer.py
@@ -0,0 +1,65 @@
+def fixSpecificPagesHtml(pages_dict):
+    """Return a new dict of pages with the known-bad ones repaired.
+
+    Keys present in the module-level `pages_to_fix` table have their HTML
+    passed through the associated fixer function; every other page is
+    copied over untouched. The input dict itself is not modified.
+    """
+    return {
+        toc_ref: (pages_to_fix[toc_ref](html)
+                  if toc_ref in pages_to_fix else html)
+        for toc_ref, html in pages_dict.items()
+    }
+
+
+def fix_for_ref_2077(page_html):
+    """Repair the out-of-order "2077" marker and return the fixed HTML.
+
+    Applies to:
+    PART THREE: LIFE IN CHRIST
+    SECTION TWO THE TEN COMMANDMENTS
+    IN BRIEF
+    http://www.vatican.va/archive/ENG0015/__P79.HTM
+    """
+    replacements = (
+        # Drop the stray "2077 The" that trails the text of ref 2076 ...
+        ("2076 By his life and by his\n"
+         "preaching Jesus attested to the permanent validity of the Decalogue. 2077 The",
+         "2076 By his life and by his\n"
+         "preaching Jesus attested to the permanent validity of the Decalogue."),
+        # ... and put it back in front of the sentence it belongs to.
+        ("gift of the Decalogue is bestowed from within\n"
+         "the covenant concluded by God with his people. God's",
+         "2077 The gift of the Decalogue is bestowed from within\n"
+         "the covenant concluded by God with his people. God's"),
+    )
+
+    fixed_html = page_html
+    for broken, repaired in replacements:
+        fixed_html = fixed_html.replace(broken, repaired)
+
+    return fixed_html
+
+
+def fix_for_ref_2436(page_html):
+    """Move the "2436" marker into its own paragraph and return the HTML.
+
+    Applies to:
+    PART THREE: LIFE IN CHRIST
+    SECTION TWO THE TEN COMMANDMENTS
+    CHAPTER TWO YOU SHALL LOVE YOUR NEIGHBOR AS YOURSELF
+    Article 7 THE SEVENTH COMMANDMENT
+    IV. Economic Activity and Social Justice
+    http://www.vatican.va/archive/ENG0015/__P8D.HTM
+    """
+    # Passage that directly precedes the "2436" marker; it is the anchor
+    # for the replacement and is emitted unchanged.
+    strike_passage = (
+        "Recourse to a strike is morally legitimate when it cannot be avoided, or at\n"
+        "least when it is necessary to obtain a proportionate benefit. It becomes\n"
+        "morally unacceptable when accompanied by violence, or when objectives are\n"
+        "included that are not directly linked to working conditions or are contrary to\n"
+        "the common good."
+    )
+
+    # The marker follows a <br> inside the same paragraph; close that
+    # paragraph and open a new one that starts with the marker.
+    joined_by_break = strike_passage + " <br>\n2436 "
+    split_paragraphs = strike_passage + "</p>\n\n" + "<p class=MsoNormal>2436\n"
+
+    return page_html.replace(joined_by_break, split_paragraphs)
+
+
+# Maps a page's TOC reference key to the function that repairs that
+# page's raw HTML; looked up by fixSpecificPagesHtml().
+pages_to_fix = {
+    'toc-279': fix_for_ref_2077,
+    'toc-319': fix_for_ref_2436,
+}
diff --git a/src/validators/validators.py b/src/validators/validators.py
@@ -0,0 +1,21 @@
+from parsers.contentsParser import Paragraph
+
+
+def validate_has_all_ccc_refs(page_nodes_dict):
+    """Check that every CCC reference number from 1 to 2865 is present.
+
+    Walks each Paragraph of each page in `page_nodes_dict` and collects
+    the `ref_number` of every element whose type is 'ref-ccc'. Entries in
+    a page's `paragraphs` that are not Paragraph instances are skipped.
+
+    Returns True only if no number in the expected range is missing.
+    """
+    expected_num_ccc_refs = 2865
+    found_refs = set()
+
+    for page in page_nodes_dict.values():
+        for paragraph in page.paragraphs:
+            if not isinstance(paragraph, Paragraph):
+                continue
+            for element in paragraph.elements:
+                if element['type'] == 'ref-ccc':
+                    found_refs.add(element['ref_number'])
+
+    # range() excludes its stop value, so the stop must be expected + 1
+    # for the last reference (2865) to be checked as well.
+    missing_refs = [
+        ref_number
+        for ref_number in range(1, expected_num_ccc_refs + 1)
+        if ref_number not in found_refs
+    ]
+
+    return not missing_refs