-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split CCC refs from content into own element type
- Loading branch information
Showing 4 changed files with 142 additions and 9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
def fixSpecificPagesHtml(pages_dict):
    """Return a new dict of page HTML with the registered page fixers applied.

    Each key of ``pages_dict`` is looked up in the module-level
    ``pages_to_fix`` table. When a fixer is registered for that key, the
    page's HTML is passed through it; otherwise the HTML is carried over
    unchanged. ``pages_dict`` itself is not modified.
    """
    return {
        toc_ref: pages_to_fix[toc_ref](html) if toc_ref in pages_to_fix else html
        for toc_ref, html in pages_dict.items()
    }
|
||
|
||
def fix_for_ref_2077(page_html):
    """Move the "2077 The" marker from the end of ref 2076 to the start of ref 2077.

    Applies to the page:
        PART THREE: LIFE IN CHRIST
        SECTION TWO THE TEN COMMANDMENTS
        IN BRIEF
        http://www.vatican.va/archive/ENG0015/__P79.HTM

    Returns the patched HTML; text that does not contain the expected
    fragments is returned unchanged.
    """
    end_of_2076 = (
        "2076 By his life and by his\n"
        "preaching Jesus attested to the permanent validity of the Decalogue."
    )
    start_of_2077 = (
        "gift of the Decalogue is bestowed from within\n"
        "the covenant concluded by God with his people. God's"
    )

    # Step 1: drop the stray "2077 The" that trails the text of ref 2076.
    without_stray_marker = page_html.replace(end_of_2076 + " 2077 The", end_of_2076)

    # Step 2: put "2077 The" back in front of the text it actually introduces.
    return without_stray_marker.replace(start_of_2077, "2077 The " + start_of_2077)
|
||
|
||
def fix_for_ref_2436(page_html):
    """Split ref 2436 out of the preceding paragraph into its own paragraph.

    Applies to the page:
        PART THREE: LIFE IN CHRIST
        SECTION TWO THE TEN COMMANDMENTS
        CHAPTER TWO YOU SHALL LOVE YOUR NEIGHBOR AS YOURSELF
        Article 7 THE SEVENTH COMMANDMENT
        IV. Economic Activity and Social Justice
        http://www.vatican.va/archive/ENG0015/__P8D.HTM

    Returns the patched HTML; text that does not contain the expected
    fragment is returned unchanged.
    """
    # Closing sentences of the preceding paragraph, used to anchor the match.
    strike_sentences = (
        "Recourse to a strike is morally legitimate when it cannot be avoided, or at\n"
        "least when it is necessary to obtain a proportionate benefit. It becomes\n"
        "morally unacceptable when accompanied by violence, or when objectives are\n"
        "included that are not directly linked to working conditions or are contrary to\n"
        "the common good."
    )

    # The source separates 2436 with only a <br>; close the paragraph and
    # open a new one so that 2436 starts its own <p>.
    joined_with_br = strike_sentences + " <br>\n2436 "
    split_into_paragraphs = strike_sentences + "</p>\n\n" + "<p class=MsoNormal>2436\n"

    return page_html.replace(joined_with_br, split_into_paragraphs)
|
||
|
||
# Dispatch table: TOC reference of a page -> function that patches that
# page's HTML. Pages whose reference is not listed here need no fixing.
pages_to_fix = {
    "toc-279": fix_for_ref_2077,
    "toc-319": fix_for_ref_2436,
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from parsers.contentsParser import Paragraph | ||
|
||
|
||
def validate_has_all_ccc_refs(page_nodes_dict):
    """Check that every CCC reference number from 1 to 2865 was parsed.

    Walks every page in ``page_nodes_dict``, looks only at entries of
    ``page.paragraphs`` that are ``Paragraph`` instances, and collects the
    ``ref_number`` of each element whose ``type`` is ``'ref-ccc'``.

    Args:
        page_nodes_dict: mapping whose values expose a ``paragraphs``
            iterable; ``Paragraph`` items expose ``elements``, an iterable
            of dicts with ``'type'`` and (for CCC refs) ``'ref_number'``.

    Returns:
        True when every number in 1..2865 (inclusive) was found, else False.
    """
    expected_num_ccc_refs = 2865

    found_refs = set()
    for page in page_nodes_dict.values():
        for paragraph in page.paragraphs:
            if not isinstance(paragraph, Paragraph):
                continue
            for element in paragraph.elements:
                if element['type'] == 'ref-ccc':
                    found_refs.add(element['ref_number'])

    # range() excludes its stop value, so use "+ 1" to make sure the last
    # reference (2865) is checked as well.
    # NOTE(review): assumes 'ref_number' is stored as an int - confirm in the parser.
    return all(
        ref_number in found_refs
        for ref_number in range(1, expected_num_ccc_refs + 1)
    )