-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathusfm2html.py
executable file
·130 lines (117 loc) · 5.83 KB
/
usfm2html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python3
#usfm2html.py
#version 0.5
# by John Wood -- for Tech Advance
# This script reads a USFM 2 file and outputs a "pretty"
# HTML version of the file.
# It is intended that the HTML will then be further tweaked
# in LibreOffice or something similar, but of course you're
# free to do as you please.
#
# It will not check for an existing HTML file before writing; any existing file of the same name is overwritten.
# I will decide later if that is a bug or a feature.
#
# At this point it will only work on a file. Later it may work
# on a folder/directory
# It also does not work with wildcards for multiple files (i.e. *.usfm)
# Change in v. 0.5:
# Added code to ignore the Strong's numbers, etc., in USFM 3 files
# Change in v. 0.4:
# Removed <!--NewPage--> comment because it caused problems in LibreOffice, and didn't seem to work as intended.
# Changes in v. 0.3:
# Correctly deals with UTF-8 files that otherwise may not display correctly.
# Changes in v. 0.2:
# Adding/changing code to deal with Bahasa USFM files
# Usage python3 usfm2html.py <path to USFM file>
# The output of this command can be run through html2ps and then ps2pdf in order to produce pdf output.
# Please note that any errors in the USFM will likely produce formatting errors in the HTML, and thus in the PDF.
#Import necessary python components
import os # file system commands
import re # regular expressions
import sys # command line arguments
import shutil # high level file operations
# Matches a ".usfm" or ".sfm" extension (any case) at the very end of a path.
# The dot is escaped and the pattern is anchored so that directory names such
# as "usfm/" are never mistaken for (or rewritten as) the extension.
_USFM_SUFFIX = re.compile(r'\.u?sfm$', re.IGNORECASE)

# Lines starting with one of these markers are dropped from the output.
# "\id" also covers "\ide" and "\mt" covers "\mt1", "\mt2", ... (prefix match).
# "\b" (blank line) must not be followed by a word character, otherwise lines
# starting with "\bk" or "\bd" would be thrown away together with their text.
_DROPPED_MARKERS = re.compile(r'\\(?:toc2|toc3|id|rem|mt|b(?!\w))')

# "\h <name>" carries the book name that is repeated in every chapter heading.
_BOOK_HEADING = re.compile(r'\\h (.+)')

# "\c <number>" starts a chapter; its replacement needs the current book name.
_CHAPTER = re.compile(r'\\c (\d+)')

# A line without any marker is wrapped as if it were a chapter title.
_WHOLE_LINE = re.compile(r'(.+)')


def _compile_rules(rules):
    """Compile (pattern, replacement) pairs once so the per-line loop is cheap."""
    return tuple((re.compile(pattern), replacement) for pattern, replacement in rules)


# Substitutions applied, in this order, BEFORE the chapter heading is built.
_RULES_BEFORE_CHAPTER = _compile_rules((
    (r'\\v (\d+?) ', '<span class="verse_number"><sup>\\1 </sup></span>'),  # verse marking
    (r'\\toc1 (.+)', '<h2 class="fancy_name">\\1</h2>'),  # fancy name of book
    (r'\\h (.+)', '<h1 class="book_name">\\1</h1>'),  # book name
))

# Substitutions applied, in this order, AFTER the chapter heading is built.
# The order matters: e.g. "\fqa*" must be handled before "\fqa", and the
# combined "\f + \ft" / "\f + \fr" openers before the bare "\ft".
_RULES_AFTER_CHAPTER = _compile_rules((
    (r'\\s (.+)', '<h4 class="section">\\1</h4>'),  # section headings from .sfm files
    (r'\\q1', '<br /> '),  # quote level 1
    (r'\\q2', '<br /> '),  # quote level 2
    (r'\\q', '<br />'),  # remaining quote tags become line breaks
    (r'\\p|\\m', '</p>'),  # close paragraph marks
    (r'\\s5', '<p>'),  # open paragraphs
    (r'\\f\*', '</sup></span>'),  # close footnotes
    (r'\\f \+ \\ft', '<span class="footnote"><sup>'),  # start footnotes
    (r'\\fqa\*', '</em>'),  # close quote in footnote
    (r'\\fqa', '<em>'),  # open quote in footnote
    (r'\\f \+ \\fr', '<span class="footnote"><sup><strong>'),  # open reference in footnote
    (r'\\fk', '</strong><em>'),  # close reference in footnote
    (r'\\ft', '</strong></em>'),  # footnote text after a reference/keyword
    (r'\\li', '<li>'),  # list items
    (r'\\bk\*', '</span>'),  # inline book titles close
    (r'\\bk', '<span class="book_title">'),  # inline book titles open
    (r'\\w(.*?)(\|.+?\\w\*)', '\\1'),  # remove Strong's numbers, etc. (USFM 3)
))

# Everything written before the converted text. There is deliberately no
# trailing newline, so the first converted line follows "<body>" directly.
_HTML_PREAMBLE = '\n'.join([
    '<!DOCTYPE html>',
    '<html >',
    ' <head>',
    ' <meta charset="UTF-8">',
    ' <title></title>',
    ' <style type="text/css">',
    ' .verse_number {color:darkgrey}',
    ' h1, h2, h3, .section {text-align: center}',
    ' .footnote {color:grey;}',
    ' .chapter {page-break-before: always}',
    ' .chapitre {font-style: italic}',
    ' .book_title {text-decoration:underline}',
    ' </style>',
    ' </head>',
    '<body>'])


def html_path_for(usfm_path):
    """Return the output path for *usfm_path*, or None if it is not a USFM file.

    Only a trailing ".usfm" / ".sfm" extension (case-insensitive) is
    recognised, and only that extension is replaced by ".html"; the rest of
    the path is left untouched.
    """
    if not _USFM_SUFFIX.search(usfm_path):
        return None
    return _USFM_SUFFIX.sub('.html', usfm_path)


def convert_line(line, book_name="Unknown"):
    """Convert one USFM line (already stripped of trailing whitespace) to HTML.

    Args:
        line: the USFM source line.
        book_name: current book name, inserted into chapter headings.

    Returns:
        The HTML text for the line, or None when the line starts with a
        marker that is intentionally left out of the output.
    """
    if _DROPPED_MARKERS.match(line):
        return None
    if '\\' not in line:
        # Lines that have no tags are printed as though they are chapters.
        # Nothing else can match afterwards: every rule requires a backslash.
        return _WHOLE_LINE.sub('<h3 class="chapitre">\\1</h3>', line)

    for pattern, replacement in _RULES_BEFORE_CHAPTER:
        line = pattern.sub(replacement, line)

    # A function replacement is used so that a backslash in the book name is
    # inserted literally instead of being parsed as a regex template escape.
    line = _CHAPTER.sub(
        lambda found: ('<pagebreak/><?page-break><h3 class="chapter">'
                       + book_name + ' Chapter ' + found.group(1) + '</h3>'),
        line)

    for pattern, replacement in _RULES_AFTER_CHAPTER:
        line = pattern.sub(replacement, line)
    return line


def convert_usfm(source_path, target_path):
    """Read the USFM file *source_path* and write its HTML form to *target_path*.

    The input is decoded as UTF-8 (a leading byte-order mark is skipped so the
    first marker is still recognised) and the output is written as UTF-8 to
    agree with the charset declared in the generated HTML. An existing target
    file is overwritten. The input is opened first, so a missing input raises
    before any output file is created.
    """
    # Chapter headings may be needed before a "\h" line has been seen, so the
    # book name starts out with a placeholder value.
    book_name = "Unknown"
    with open(source_path, encoding="utf-8-sig") as source, \
            open(target_path, "w", encoding="utf-8") as target:
        target.write(_HTML_PREAMBLE)
        for raw_line in source:
            line = raw_line.rstrip()
            heading = _BOOK_HEADING.match(line)
            if heading:
                book_name = heading.group(1)
            html = convert_line(line, book_name)
            if html is not None:
                target.write(html + "\n")
        target.write("</body></html>")


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list without the program name; defaults to
            sys.argv[1:].

    Returns:
        0 on success, 1 when the arguments are unusable (wrong count or the
        file does not look like a USFM file).
    """
    arguments = sys.argv[1:] if argv is None else list(argv)
    print("usfm2html: Making your output prettier since 2018\n")

    # Exactly one argument is required; otherwise fail with a usage remark.
    if len(arguments) != 1:
        if not arguments:
            print("usfm2html.py script to convert USFM 2 scripture to pretty HTML")
            print("Usage: python3 usfm2html.py <path to USFM file>")
        else:
            print("usfm2html.py currently only handles one file at a time")
            print("If you are using a bash shell, try")
            print(" for n in $(ls *.usfm); do usfm2html $n; done")
        return 1

    source_path = arguments[0]
    target_path = html_path_for(source_path)
    if target_path is None:
        print("Not a USFM file as far as I can tell.")
        return 1

    print("Converting "+source_path+" to "+target_path+"\n")
    convert_usfm(source_path, target_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())