From 8034f8c5e05983d95cf89d9130854995a6e6fe42 Mon Sep 17 00:00:00 2001
From: nossbigg
Date: Sat, 15 Jun 2019 16:02:32 +0800
Subject: [PATCH] Split CCC refs from content into own element type
---
src/parse.py | 5 ++-
src/parsers/contentsParser.py | 60 ++++++++++++++++++++++++----
src/parsers/specificPagesFixer.py | 65 +++++++++++++++++++++++++++++++
src/validators/validators.py | 21 ++++++++++
4 files changed, 142 insertions(+), 9 deletions(-)
create mode 100644 src/parsers/specificPagesFixer.py
create mode 100644 src/validators/validators.py
diff --git a/src/parse.py b/src/parse.py
index bf8b4ff..f21e17f 100644
--- a/src/parse.py
+++ b/src/parse.py
@@ -1,9 +1,12 @@
from scrapers.tocScraper import readTocFromDisk
from scrapers.pageScraper import readPagesFromDisk
from scrapers.abbreviationsScraper import readAbbreviationsFromDisk
+
from parsers.tocParser import parseToc
from parsers.pageParser import parsePages
+from parsers.specificPagesFixer import fixSpecificPagesHtml
from parsers.abbreviationsParser import parseAbbreviations
+
from exporters.jsonExporter import exportStoreAsJson
from exporters.jsonMetaGenerator import generate_store_meta
@@ -14,7 +17,7 @@
bible_refs, other_refs = parseAbbreviations(abbreviations_html)
ccc_refs = {'bible': bible_refs, 'other': other_refs}
-pages_html_dict = readPagesFromDisk()
+pages_html_dict = fixSpecificPagesHtml(readPagesFromDisk())
page_nodes_dict = parsePages(pages_html_dict)
meta = generate_store_meta()
diff --git a/src/parsers/contentsParser.py b/src/parsers/contentsParser.py
index a0624e4..32cdcbb 100644
--- a/src/parsers/contentsParser.py
+++ b/src/parsers/contentsParser.py
@@ -3,15 +3,16 @@
PageContent = namedtuple('PageContent', 'text')
Paragraph = namedtuple('Paragraph', 'elements attrs')
-cccReferencedLineMatcher = re.compile('^[0-9]+ ')
+# Group 1 captures the leading run of digits; group 2 captures what follows
+# the single space after it. '.' does not match a newline here (no re.DOTALL),
+# so group 2 ends at the first line break of the matched text.
+cccReferenceLineMatcher = re.compile('(^[0-9]+) (.*)')
def extractStructuredContents(raw_nodes):
+ # Concatenates the lists returned by processElement() for every raw node,
+ # then passes each resulting item through transformCCCReferenceLine().
- result = []
+ paragraphs = []
for n in raw_nodes:
- result = result + processElement(n)
+ paragraphs = paragraphs + processElement(n)
- return result
+ paragraphs = [transformCCCReferenceLine(p) for p in paragraphs]
+ return paragraphs
def processElement(node):
@@ -65,6 +66,45 @@ def processParagraphChild(node, attrs):
return []
+def transformCCCReferenceLine(paragraph):
+    """Split a leading CCC reference number out of a paragraph.
+
+    Items for which hasCCCReferenceLine() is falsy are returned unchanged.
+    Otherwise a new Paragraph is returned whose elements are the ref element
+    and the text element produced by splitCCCReferenceFromTextElement() for
+    the first element, followed by the remaining original elements. The
+    paragraph's attrs are carried over as-is.
+    """
+    if hasCCCReferenceLine(paragraph):
+        head = paragraph.elements[0]
+        tail = paragraph.elements[1:]
+        ref_element, text_element = splitCCCReferenceFromTextElement(head)
+        return Paragraph([ref_element, text_element] + tail, paragraph.attrs)
+
+    return paragraph
+
+
+
+def hasCCCReferenceLine(paragraph):
+    """Return True when a paragraph starts with a CCC reference number.
+
+    Returns False for anything that is not a Paragraph, for a Paragraph with
+    no elements, and for a first element without a 'text' key. Otherwise the
+    result says whether that text matches cccReferenceLineMatcher.
+    """
+    if not isinstance(paragraph, Paragraph):
+        return False
+
+    # An empty paragraph has no first element to inspect; indexing it would
+    # raise IndexError.
+    if not paragraph.elements:
+        return False
+
+    first_element = paragraph.elements[0]
+
+    if 'text' not in first_element:
+        return False
+
+    # Return a real bool rather than the Match object (or None).
+    return cccReferenceLineMatcher.match(first_element['text']) is not None
+
+
+def splitCCCReferenceFromTextElement(element):
+    """Split a text element into a CCC ref element and the remaining text.
+
+    The element's 'text' must match cccReferenceLineMatcher; the caller in
+    this file checks that with hasCCCReferenceLine() first. The element's
+    'attrs' are passed to both element factories.
+
+    Returns:
+        A (ccc_ref_element, new_text_element) tuple.
+    """
+    text = element['text']
+    text_match = cccReferenceLineMatcher.match(text)
+    element_attrs = element['attrs']
+
+    # Take the remainder from the original text instead of group(2): '.' does
+    # not match newlines, so group(2) stops at the first line break and the
+    # rest of a multi-line paragraph would be silently dropped. The character
+    # right after group 1 is the separating space, hence the + 1.
+    remaining_text = text[text_match.end(1) + 1:]
+
+    ccc_ref_element = createCCCRefElement(
+        int(text_match.group(1)), element_attrs)
+    new_text_element = createTextElement(remaining_text, element_attrs)
+
+    return ccc_ref_element, new_text_element
+
+
def unwrapChildren(node, attrs):
result = []
for n in node.children:
@@ -91,6 +131,14 @@ def createSpacerElement():
return {'type': 'spacer'}
+def createTextElement(text, attrs):
+    """Build a 'text' element dict holding the given text and attrs."""
+    element = {'type': 'text'}
+    element['text'] = text
+    element['attrs'] = attrs
+    return element
+
+
+def createCCCRefElement(ref_number, attrs):
+    """Build a 'ref-ccc' element dict for the given reference number.
+
+    attrs is accepted but is not stored on the returned element.
+    """
+    return dict(type='ref-ccc', ref_number=ref_number)
+
+
def createParagraph(node, children):
attrs = {}
if isIndentedParagraph(node):
@@ -110,7 +158,3 @@ def isIndentedParagraph(node):
def isEmptyOutput(node_text):
+ # True when nothing is left after removing newlines and stripping the ends.
text = node_text.replace('\n', "").strip()
return len(text) == 0
-
-
-def isCCCReferenceLine(node_text):
- return cccReferencedLineMatcher.match(node_text)
diff --git a/src/parsers/specificPagesFixer.py b/src/parsers/specificPagesFixer.py
new file mode 100644
index 0000000..9fe5308
--- /dev/null
+++ b/src/parsers/specificPagesFixer.py
@@ -0,0 +1,65 @@
+def fixSpecificPagesHtml(pages_dict):
+    """Return a new dict of page HTML with page-specific fixers applied.
+
+    Pages whose key has an entry in pages_to_fix are passed through that
+    fixer function; every other page's HTML is copied over unchanged.
+    """
+    return {
+        page_toc_ref: (
+            pages_to_fix[page_toc_ref](page_html)
+            if page_toc_ref in pages_to_fix
+            else page_html
+        )
+        for page_toc_ref, page_html in pages_dict.items()
+    }
+
+
+def fix_for_ref_2077(page_html):
+    """Reposition the misplaced "2077 The" marker on one specific page.
+
+    Fix for
+    PART THREE: LIFE IN CHRIST
+    SECTION TWO THE TEN COMMANDMENTS
+    IN BRIEF
+    http://www.vatican.va/archive/ENG0015/__P79.HTM
+
+    Returns the HTML with both replacements applied; text that does not
+    contain the expected fragments comes back unchanged.
+    """
+    replacements = (
+        # Drop the "2077 The" that trails the end of paragraph 2076 ...
+        ("2076 By his life and by his\n"
+         "preaching Jesus attested to the permanent validity of the Decalogue. 2077 The",
+         "2076 By his life and by his\n"
+         "preaching Jesus attested to the permanent validity of the Decalogue."),
+        # ... and put it in front of the sentence it belongs to.
+        ("gift of the Decalogue is bestowed from within\n"
+         "the covenant concluded by God with his people. God's",
+         "2077 The gift of the Decalogue is bestowed from within\n"
+         "the covenant concluded by God with his people. God's"),
+    )
+
+    fixed_html = page_html
+    for old_text, new_text in replacements:
+        fixed_html = fixed_html.replace(old_text, new_text)
+
+    return fixed_html
+
+
+def fix_for_ref_2436(page_html):
+ # Fix for
+ # PART THREE: LIFE IN CHRIST
+ # SECTION TWO THE TEN COMMANDMENTS
+ # CHAPTER TWO YOU SHALL LOVE YOUR NEIGHBOR AS YOURSELF
+ # Article 7 THE SEVENTH COMMANDMENT
+ # IV. Economic Activity and Social Justice
+ # http://www.vatican.va/archive/ENG0015/__P8D.HTM
+ p = page_html
+
+ # Move Ref 2436 into its own paragraph
+ # NOTE(review): both "the common good." literals below are split across two
+ # physical lines, which is not valid inside a "..." string. Text (possibly an
+ # HTML tag) may have been lost from this copy of the patch -- confirm the
+ # exact search/replacement strings against the repository before applying.
+ # str.replace() returns the HTML unchanged when the search text is absent.
+ p = p.replace("Recourse to a strike is morally legitimate when it cannot be avoided, or at\n"
+ "least when it is necessary to obtain a proportionate benefit. It becomes\n"
+ "morally unacceptable when accompanied by violence, or when objectives are\n"
+ "included that are not directly linked to working conditions or are contrary to\n"
+ "the common good.
\n2436 ",
+ "Recourse to a strike is morally legitimate when it cannot be avoided, or at\n"
+ "least when it is necessary to obtain a proportionate benefit. It becomes\n"
+ "morally unacceptable when accompanied by violence, or when objectives are\n"
+ "included that are not directly linked to working conditions or are contrary to\n"
+ "the common good.
\n\n"
+ "2436\n")
+
+ return p
+
+
+# Lookup used by fixSpecificPagesHtml(): page key -> function that takes the
+# page's HTML and returns the corrected HTML.
+pages_to_fix = {
+    'toc-279': fix_for_ref_2077,
+    'toc-319': fix_for_ref_2436,
+}
diff --git a/src/validators/validators.py b/src/validators/validators.py
new file mode 100644
index 0000000..984423a
--- /dev/null
+++ b/src/validators/validators.py
@@ -0,0 +1,21 @@
+from parsers.contentsParser import Paragraph
+
+
+def validate_has_all_ccc_refs(page_nodes_dict, expected_num_ccc_refs=2865):
+    """Check that every CCC reference number appears in the parsed pages.
+
+    Args:
+        page_nodes_dict: mapping whose values expose a `paragraphs` iterable.
+        expected_num_ccc_refs: highest reference number expected, inclusive.
+
+    Returns:
+        True when every number from 1 through expected_num_ccc_refs was seen
+        as the 'ref_number' of a 'ref-ccc' element inside a Paragraph,
+        otherwise False.
+    """
+    found_refs = set()
+
+    for page in page_nodes_dict.values():
+        for paragraph in page.paragraphs:
+            # Items that are not Paragraph instances are skipped.
+            if not isinstance(paragraph, Paragraph):
+                continue
+            for element in paragraph.elements:
+                if element['type'] == 'ref-ccc':
+                    found_refs.add(element['ref_number'])
+
+    # range() excludes its stop value, so use + 1 to include the final
+    # reference number in the check.
+    missing_refs = [
+        ref_number
+        for ref_number in range(1, expected_num_ccc_refs + 1)
+        if ref_number not in found_refs
+    ]
+
+    return not missing_refs