forked from paul-hammant/tbd
-
Notifications
You must be signed in to change notification settings - Fork 5
/
footer_refs.py
72 lines (58 loc) · 2.46 KB
/
footer_refs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from bs4 import BeautifulSoup
import sys
import json
# Load the chapter HTML named on the command line.
with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), "html.parser")
# Replace ref anchors to chapter footer references.
ftr = 0  # running footnote number
refs = "<h2>Web references inline in this chapter</h2><p style=\"font-size: 70%\">"
for span in soup.find_all("span", { "class" : "rref" }):
    ftr += 1
    try:
        # Collect the anchor's target URL for the footer list ...
        refs += str(ftr) + ": " + span.a['href'] + "<br/>"
    except TypeError:
        # span.a is None (no <a> child) -- show the offending span, then fail.
        print(str(span))
        raise
    # ... and swap the anchor itself for a numbered superscript marker.
    new_tag = soup.new_tag('sup')
    new_tag.string = "[" + str(ftr) + "]"
    span.a.replace_with(new_tag)
refs += "</p>"
# Emit the inline-links section only when at least one rref was numbered.
if ftr > 0:
    refs_fragment = BeautifulSoup(refs, "html.parser")
    anchor_h2 = soup.find("h2", {"id": "references-elsewhere"})
    if anchor_h2 is None:
        # No "references elsewhere" heading: append at the end of the article div.
        container = soup.body.article.div
        container.insert(len(container.contents), refs_fragment)
    else:
        # Slot the section immediately before the existing heading.
        position = anchor_h2.parent.contents.index(anchor_h2)
        anchor_h2.parent.insert(position, refs_fragment)
# Replace anchors to other TBD pages with chapter references.
for a in soup.find_all("a"):
    href = a['href'].replace('../','')
    if ":" not in href:  # relative link => another TBD page (not http:, mailto:, ...)
        if "#" not in href:
            # Whole-chapter link: read the chapter number from the page's JSON sidecar.
            with open(href.replace(".html", ".json")) as jf:
                sub_chapters = json.loads(jf.read())
            a.replace_with(BeautifulSoup("<span><i>"+a.text+"</i><sup>[ch: "+sub_chapters['ch']+"]</sup></span>", "html.parser"))
        else:
            # Fragment link: resolve the h2 id against the sidecar's 'h2s' map.
            json_path = href.replace(".html", ".json")  # renamed from 'file' to avoid shadowing the builtin
            json_path = json_path[0:json_path.index("#")]  # drop the fragment part
            subchap = href[href.index("#")+1:]
            with open(json_path) as jf:
                sub_chapters = json.loads(jf.read())
            try:
                a.replace_with(BeautifulSoup("<span><i>"+a.text+"</i><sup>[ch: " + sub_chapters['h2s'][subchap] + "]</sup></span>", "html.parser"))
            except KeyError:
                # Unknown fragment id -- show the offending anchor, then fail.
                print(str(a) + " -->" + subchap + "<")
                raise
with open(sys.argv[1], 'wb') as out:
    out.write(str(soup))
# All remaining anchors become emphasized text plus the raw URL in brackets.
# Requires a reload of soup for some reason.
with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), "html.parser")
for a in soup.find_all("a"):
    a.replace_with(BeautifulSoup("<span><strong>"+a.text+"</strong> ["+a['href']+"]</span>", "html.parser"))
with open(sys.argv[1], 'wb') as out:
    out.write(str(soup))
# Reload soup again for some reason.
with open(sys.argv[1]) as f:
    soup = BeautifulSoup(f.read(), "html.parser")
with open(sys.argv[1].replace(".html", ".json")) as jf:
    sub_chapters = json.loads(jf.read())
# Prefix each content <h2> with "<chapter>.<n>: ", skipping the generated
# reference/promotion headings so numbering tracks the chapter body only.
h2_ix = 0
for h2 in soup.find_all('h2'):  # find_all: modern spelling of the deprecated findAll
    if 'references-elsewhere' not in str(h2) and 'Books promoting' not in str(h2) and 'Reports promoting' not in str(h2):
        h2_ix += 1
        h2.string = sub_chapters['ch'] + "." + str(h2_ix) + ": " + h2.text
with open(sys.argv[1], 'wb') as out:
    out.write(str(soup))