From 8034f8c5e05983d95cf89d9130854995a6e6fe42 Mon Sep 17 00:00:00 2001 From: nossbigg Date: Sat, 15 Jun 2019 16:02:32 +0800 Subject: [PATCH] Split CCC refs from content into own element type --- src/parse.py | 5 ++- src/parsers/contentsParser.py | 60 ++++++++++++++++++++++++---- src/parsers/specificPagesFixer.py | 65 +++++++++++++++++++++++++++++++ src/validators/validators.py | 21 ++++++++++ 4 files changed, 142 insertions(+), 9 deletions(-) create mode 100644 src/parsers/specificPagesFixer.py create mode 100644 src/validators/validators.py diff --git a/src/parse.py b/src/parse.py index bf8b4ff..f21e17f 100644 --- a/src/parse.py +++ b/src/parse.py @@ -1,9 +1,12 @@ from scrapers.tocScraper import readTocFromDisk from scrapers.pageScraper import readPagesFromDisk from scrapers.abbreviationsScraper import readAbbreviationsFromDisk + from parsers.tocParser import parseToc from parsers.pageParser import parsePages +from parsers.specificPagesFixer import fixSpecificPagesHtml from parsers.abbreviationsParser import parseAbbreviations + from exporters.jsonExporter import exportStoreAsJson from exporters.jsonMetaGenerator import generate_store_meta @@ -14,7 +17,7 @@ bible_refs, other_refs = parseAbbreviations(abbreviations_html) ccc_refs = {'bible': bible_refs, 'other': other_refs} -pages_html_dict = readPagesFromDisk() +pages_html_dict = fixSpecificPagesHtml(readPagesFromDisk()) page_nodes_dict = parsePages(pages_html_dict) meta = generate_store_meta() diff --git a/src/parsers/contentsParser.py b/src/parsers/contentsParser.py index a0624e4..32cdcbb 100644 --- a/src/parsers/contentsParser.py +++ b/src/parsers/contentsParser.py @@ -3,15 +3,16 @@ PageContent = namedtuple('PageContent', 'text') Paragraph = namedtuple('Paragraph', 'elements attrs') -cccReferencedLineMatcher = re.compile('^[0-9]+ ') +cccReferenceLineMatcher = re.compile('(^[0-9]+) (.*)') def extractStructuredContents(raw_nodes): - result = [] + paragraphs = [] for n in raw_nodes: - result = result + processElement(n) + paragraphs = paragraphs + processElement(n) - return result + paragraphs = [transformCCCReferenceLine(p) for p in paragraphs] + return paragraphs def processElement(node): @@ -65,6 +66,45 @@ def processParagraphChild(node, attrs): return [] +def transformCCCReferenceLine(paragraph): + if not hasCCCReferenceLine(paragraph): + return paragraph + + elements = paragraph.elements + first_element = elements[0] + rest_elements = elements[1:] + + ccc_ref_element, new_text_element = splitCCCReferenceFromTextElement( + first_element) + + new_elements = [ccc_ref_element, new_text_element] + rest_elements + new_paragrah = Paragraph(new_elements, paragraph.attrs) + return new_paragrah + + +def hasCCCReferenceLine(paragraph): + if not isinstance(paragraph, Paragraph): + return False + + first_element = paragraph.elements[0] + + if 'text' not in first_element: + return False + + return cccReferenceLineMatcher.match(first_element['text']) + + +def splitCCCReferenceFromTextElement(element): + text_match = cccReferenceLineMatcher.match(element['text']) + element_attrs = element['attrs'] + + ccc_ref_element = createCCCRefElement( + int(text_match.group(1)), element_attrs) + new_text_element = createTextElement(text_match.group(2), element_attrs) + + return ccc_ref_element, new_text_element + + def unwrapChildren(node, attrs): result = [] for n in node.children: @@ -91,6 +131,14 @@ def createSpacerElement(): return {'type': 'spacer'} +def createTextElement(text, attrs): + return {'type': 'text', 'text': text, 'attrs': attrs} + + +def createCCCRefElement(ref_number, attrs): + return {'type': 'ref-ccc', 'ref_number': ref_number} + + def createParagraph(node, children): attrs = {} if isIndentedParagraph(node): @@ -110,7 +158,3 @@ def isIndentedParagraph(node): def isEmptyOutput(node_text): text = node_text.replace('\n', "").strip() return len(text) == 0 - - -def isCCCReferenceLine(node_text): - return cccReferencedLineMatcher.match(node_text) diff --git a/src/parsers/specificPagesFixer.py b/src/parsers/specificPagesFixer.py new file mode 100644 index 0000000..9fe5308 --- /dev/null +++ b/src/parsers/specificPagesFixer.py @@ -0,0 +1,65 @@ +def fixSpecificPagesHtml(pages_dict): + with_fixed_pages = {} + + for page_toc_ref, page_html in pages_dict.items(): + if page_toc_ref not in pages_to_fix: + with_fixed_pages[page_toc_ref] = page_html + continue + + fixer_function = pages_to_fix[page_toc_ref] + with_fixed_pages[page_toc_ref] = fixer_function(page_html) + + return with_fixed_pages + + +def fix_for_ref_2077(page_html): + # Fix for + # PART THREE: LIFE IN CHRIST + # SECTION TWO THE TEN COMMANDMENTS + # IN BRIEF + # http://www.vatican.va/archive/ENG0015/__P79.HTM + p = page_html + + # Fix Ref 2077 weird order + p = p.replace("2076 By his life and by his\n" + "preaching Jesus attested to the permanent validity of the Decalogue. 2077 The", + "2076 By his life and by his\n" + "preaching Jesus attested to the permanent validity of the Decalogue.") + p = p.replace("gift of the Decalogue is bestowed from within\n" + "the covenant concluded by God with his people. God's", + "2077 The gift of the Decalogue is bestowed from within\n" + "the covenant concluded by God with his people. God's") + + return p + + +def fix_for_ref_2436(page_html): + # Fix for + # PART THREE: LIFE IN CHRIST + # SECTION TWO THE TEN COMMANDMENTS + # CHAPTER TWO YOU SHALL LOVE YOUR NEIGHBOR AS YOURSELF + # Article 7 THE SEVENTH COMMANDMENT + # IV. Economic Activity and Social Justice + # http://www.vatican.va/archive/ENG0015/__P8D.HTM + p = page_html + + # Move Ref 2436 into its own paragraph + p = p.replace("Recourse to a strike is morally legitimate when it cannot be avoided, or at\n" + "least when it is necessary to obtain a proportionate benefit. It becomes\n" + "morally unacceptable when accompanied by violence, or when objectives are\n" + "included that are not directly linked to working conditions or are contrary to\n" + "the common good.
\n2436 ", + "Recourse to a strike is morally legitimate when it cannot be avoided, or at\n" + "least when it is necessary to obtain a proportionate benefit. It becomes\n" + "morally unacceptable when accompanied by violence, or when objectives are\n" + "included that are not directly linked to working conditions or are contrary to\n" + "the common good.

\n\n" + "

2436\n") + + return p + + +pages_to_fix = { + 'toc-279': fix_for_ref_2077, + 'toc-319': fix_for_ref_2436 +} diff --git a/src/validators/validators.py b/src/validators/validators.py new file mode 100644 index 0000000..984423a --- /dev/null +++ b/src/validators/validators.py @@ -0,0 +1,21 @@ +from parsers.contentsParser import Paragraph + + +def validate_has_all_ccc_refs(page_nodes_dict): + ccc_refs = {} + + for page in page_nodes_dict.values(): + for paragraph in page.paragraphs: + if isinstance(paragraph, Paragraph): + for element in paragraph.elements: + if element['type'] == 'ref-ccc': + ccc_refs[element['ref_number']] = '' + + expected_num_ccc_refs = 2865 + missing_refs = [] + + for i in range(1, expected_num_ccc_refs): + if i not in ccc_refs: + missing_refs.append(i) + + return len(missing_refs) == 0