Skip to content

Commit

Permalink
Split CCC refs from content into own element type
Browse files Browse the repository at this point in the history
  • Loading branch information
nossbigg committed Jun 15, 2019
1 parent 98c8540 commit 8034f8c
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 9 deletions.
5 changes: 4 additions & 1 deletion src/parse.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from scrapers.tocScraper import readTocFromDisk
from scrapers.pageScraper import readPagesFromDisk
from scrapers.abbreviationsScraper import readAbbreviationsFromDisk

from parsers.tocParser import parseToc
from parsers.pageParser import parsePages
from parsers.specificPagesFixer import fixSpecificPagesHtml
from parsers.abbreviationsParser import parseAbbreviations

from exporters.jsonExporter import exportStoreAsJson
from exporters.jsonMetaGenerator import generate_store_meta

Expand All @@ -14,7 +17,7 @@
bible_refs, other_refs = parseAbbreviations(abbreviations_html)
ccc_refs = {'bible': bible_refs, 'other': other_refs}

pages_html_dict = readPagesFromDisk()
pages_html_dict = fixSpecificPagesHtml(readPagesFromDisk())
page_nodes_dict = parsePages(pages_html_dict)

meta = generate_store_meta()
Expand Down
60 changes: 52 additions & 8 deletions src/parsers/contentsParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
# Simple record with a single field: text.
PageContent = namedtuple('PageContent', 'text')
# A paragraph record: its sequence of child elements plus paragraph attrs.
Paragraph = namedtuple('Paragraph', 'elements attrs')

# Matches text that starts with one or more digits followed by a space.
cccReferencedLineMatcher = re.compile('^[0-9]+ ')
# Same leading "<digits> " shape, but captures the digits (group 1) and the
# text after the space (group 2). Without re.DOTALL, '.' stops at a newline.
cccReferenceLineMatcher = re.compile('(^[0-9]+) (.*)')


def extractStructuredContents(raw_nodes):
    """Flatten raw nodes into structured items and split out CCC references.

    Each node in ``raw_nodes`` is handed to ``processElement``; the items it
    returns are collected in order, and every collected item is then passed
    through ``transformCCCReferenceLine``.

    Returns the list of transformed items (empty for empty input).
    """
    paragraphs = []
    for n in raw_nodes:
        # extend() grows the list in place; ``paragraphs + ...`` would copy
        # the whole accumulated list on every iteration.
        paragraphs.extend(processElement(n))

    return [transformCCCReferenceLine(p) for p in paragraphs]


def processElement(node):
Expand Down Expand Up @@ -65,6 +66,45 @@ def processParagraphChild(node, attrs):
return []


def transformCCCReferenceLine(paragraph):
    """Split a leading CCC reference out of a paragraph's first element.

    Items for which ``hasCCCReferenceLine`` is falsy are returned untouched.
    Otherwise a new ``Paragraph`` is built in which the first element is
    replaced by a CCC reference element followed by the remaining text
    element; the other elements and the paragraph attrs are carried over.
    """
    if hasCCCReferenceLine(paragraph):
        head = paragraph.elements[0]
        tail = paragraph.elements[1:]
        ref_element, text_element = splitCCCReferenceFromTextElement(head)
        return Paragraph([ref_element, text_element] + tail, paragraph.attrs)

    return paragraph


def hasCCCReferenceLine(paragraph):
    """Return True when a paragraph starts with a CCC reference number.

    The check passes only if ``paragraph`` is a ``Paragraph``, it has at
    least one element, that first element carries a ``'text'`` entry, and
    the text matches ``cccReferenceLineMatcher``.

    Always returns a bool.
    """
    if not isinstance(paragraph, Paragraph):
        return False

    # A paragraph without elements has no first element to inspect;
    # indexing it would raise IndexError.
    if not paragraph.elements:
        return False

    first_element = paragraph.elements[0]
    if 'text' not in first_element:
        return False

    return cccReferenceLineMatcher.match(first_element['text']) is not None


def splitCCCReferenceFromTextElement(element):
    """Split a "<number> <text>" text element into a ref and a text element.

    ``element['text']`` must match ``cccReferenceLineMatcher``; callers are
    expected to have checked this (see ``hasCCCReferenceLine``).

    Returns a ``(ccc_ref_element, new_text_element)`` tuple. The reference
    element holds the leading number as an int; the text element holds
    everything after the separating space and reuses the original attrs.
    """
    text = element['text']
    text_match = cccReferenceLineMatcher.match(text)
    element_attrs = element['attrs']

    ref_number = int(text_match.group(1))
    # Slice past "<number> " rather than using group(2): '.' does not match
    # a newline, so group(2) would drop every line after the first one.
    remaining_text = text[text_match.end(1) + 1:]

    ccc_ref_element = createCCCRefElement(ref_number, element_attrs)
    new_text_element = createTextElement(remaining_text, element_attrs)

    return ccc_ref_element, new_text_element


def unwrapChildren(node, attrs):
result = []
for n in node.children:
Expand All @@ -91,6 +131,14 @@ def createSpacerElement():
return {'type': 'spacer'}


def createTextElement(text, attrs):
    """Build a ``'text'`` element dict carrying the given text and attrs."""
    element = {'type': 'text'}
    element['text'] = text
    element['attrs'] = attrs
    return element


def createCCCRefElement(ref_number, attrs):
    """Build a ``'ref-ccc'`` element dict for the given reference number.

    ``attrs`` is accepted but is not included in the returned element.
    """
    element = {'type': 'ref-ccc'}
    element['ref_number'] = ref_number
    return element


def createParagraph(node, children):
attrs = {}
if isIndentedParagraph(node):
Expand All @@ -110,7 +158,3 @@ def isIndentedParagraph(node):
def isEmptyOutput(node_text):
    """Return True when nothing is left after removing newlines and
    stripping surrounding whitespace from ``node_text``."""
    without_newlines = node_text.replace('\n', "")
    return not without_newlines.strip()


def isCCCReferenceLine(node_text):
    """Match ``node_text`` against ``cccReferencedLineMatcher``.

    Returns the match object on success and ``None`` otherwise, so the
    result is meant to be used for its truthiness.
    """
    match = cccReferencedLineMatcher.match(node_text)
    return match
65 changes: 65 additions & 0 deletions src/parsers/specificPagesFixer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
def fixSpecificPagesHtml(pages_dict):
    """Return a new dict of pages with page-specific HTML fixes applied.

    Pages whose key is registered in ``pages_to_fix`` are run through the
    registered fixer function; every other page is copied over unchanged.
    The input dict itself is not modified.
    """
    return {
        toc_ref: (pages_to_fix[toc_ref](html)
                  if toc_ref in pages_to_fix else html)
        for toc_ref, html in pages_dict.items()
    }


def fix_for_ref_2077(page_html):
    """Move the misplaced "2077 The" marker to the sentence it belongs to.

    Fix for
    PART THREE: LIFE IN CHRIST
    SECTION TWO THE TEN COMMANDMENTS
    IN BRIEF
    http://www.vatican.va/archive/ENG0015/__P79.HTM

    Returns the page HTML with both replacements applied; text that does
    not contain the targeted passages is returned unchanged.
    """
    decalogue_sentence = (
        "2076 By his life and by his\n"
        "preaching Jesus attested to the permanent validity of the Decalogue."
    )
    gift_sentence = (
        "gift of the Decalogue is bestowed from within\n"
        "the covenant concluded by God with his people. God's"
    )

    replacements = (
        # Drop the "2077 The" that trails the end of paragraph 2076 ...
        (decalogue_sentence + " 2077 The", decalogue_sentence),
        # ... and put it in front of the sentence that follows.
        (gift_sentence, "2077 The " + gift_sentence),
    )

    fixed = page_html
    for old, new in replacements:
        fixed = fixed.replace(old, new)
    return fixed


def fix_for_ref_2436(page_html):
    """Move Ref 2436 into its own paragraph.

    Fix for
    PART THREE: LIFE IN CHRIST
    SECTION TWO THE TEN COMMANDMENTS
    CHAPTER TWO YOU SHALL LOVE YOUR NEIGHBOR AS YOURSELF
    Article 7 THE SEVENTH COMMANDMENT
    IV. Economic Activity and Social Justice
    http://www.vatican.va/archive/ENG0015/__P8D.HTM

    The "<br>" that joins the preceding passage to "2436" is replaced by a
    paragraph close followed by a new ``<p class=MsoNormal>`` opening.
    Text without the targeted passage is returned unchanged.
    """
    strike_passage = (
        "Recourse to a strike is morally legitimate when it cannot be avoided, or at\n"
        "least when it is necessary to obtain a proportionate benefit. It becomes\n"
        "morally unacceptable when accompanied by violence, or when objectives are\n"
        "included that are not directly linked to working conditions or are contrary to\n"
        "the common good."
    )
    joined_by_br = strike_passage + " <br>\n2436 "
    own_paragraph = strike_passage + "</p>\n\n" + "<p class=MsoNormal>2436\n"

    return page_html.replace(joined_by_br, own_paragraph)


# Registry of page-specific fixers: maps a page key (as used in the dict
# handed to fixSpecificPagesHtml) to the function that repairs that page.
pages_to_fix = {
    'toc-279': fix_for_ref_2077,
    'toc-319': fix_for_ref_2436,
}
21 changes: 21 additions & 0 deletions src/validators/validators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from parsers.contentsParser import Paragraph


def validate_has_all_ccc_refs(page_nodes_dict, expected_num_ccc_refs=2865):
    """Check that every CCC reference number 1..expected_num_ccc_refs exists.

    Walks every page in ``page_nodes_dict``, looks at each ``Paragraph`` in
    ``page.paragraphs`` (other item types are skipped) and collects the
    ``ref_number`` of every element whose type is ``'ref-ccc'``.

    Args:
        page_nodes_dict: mapping whose values expose a ``paragraphs`` list.
        expected_num_ccc_refs: highest reference number that must be
            present (inclusive).

    Returns:
        True when no number in the inclusive range is missing, else False.
    """
    found_refs = set()

    for page in page_nodes_dict.values():
        for paragraph in page.paragraphs:
            if not isinstance(paragraph, Paragraph):
                continue
            for element in paragraph.elements:
                if element['type'] == 'ref-ccc':
                    found_refs.add(element['ref_number'])

    # range() excludes its stop value, so the +1 is required for the last
    # reference number to be checked as well.
    return all(i in found_refs
               for i in range(1, expected_num_ccc_refs + 1))

0 comments on commit 8034f8c

Please sign in to comment.