-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathhtml-embed
executable file
·205 lines (169 loc) · 7 KB
/
html-embed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
import itertools as it, operator as op, functools as ft
from urllib.parse import unquote as url_unquote
from html.parser import HTMLParser
import os, sys, contextlib, pathlib, collections, logging
import tempfile, stat, html, mimetypes, base64
class LogMessage:
    """Log message whose str.format()-style formatting is deferred.

    The logging module calls str() on the message object only when a record
    is actually emitted, so the formatting cost is paid lazily.

    fmt: format template; any non-string object (e.g. an exception
      instance) is converted with str() before use.
    a: positional arguments for str.format().
    k: keyword arguments for str.format().
    """

    def __init__(self, fmt, a, k):
        self.fmt, self.a, self.k = fmt, a, k

    def __str__(self):
        # __str__ must return a real string, so coerce non-string messages
        # instead of handing them back (or calling .format on them) as-is.
        fmt = self.fmt if isinstance(self.fmt, str) else str(self.fmt)
        if self.a or self.k:
            return fmt.format(*self.a, **self.k)
        # No arguments at all: leave the template untouched, so that
        # literal braces in it do not have to be escaped.
        return fmt
class LogStyleAdapter(logging.LoggerAdapter):
    """Logger adapter that formats messages with str.format() placeholders.

    Messages are wrapped into LogMessage objects together with their
    positional and keyword arguments, so "{}"-style templates can be used
    instead of the "%s"-style ones expected by the logging module.
    """

    def __init__(self, logger, extra=None):
        # A missing or empty "extra" is replaced by a fresh empty dict
        if not extra:
            extra = {}
        super().__init__(logger, extra)

    def log(self, level, msg, *args, **kws):
        """Log msg at level, with args/kws used as str.format() arguments.

        The "exc_info" keyword, if present, is passed on to the underlying
        logger instead of being used for formatting.
        """
        if not self.isEnabledFor(level):
            return
        log_kws = {}
        if 'exc_info' in kws:
            log_kws['exc_info'] = kws.pop('exc_info')
        msg, kws = self.process(msg, kws)
        record_msg = LogMessage(msg, args, kws)
        # No %-style args are passed along: formatting happens in LogMessage
        self.logger._log(level, record_msg, (), **log_kws)
def get_logger(name):
    """Return a LogStyleAdapter wrapping the stdlib logger called name."""
    return LogStyleAdapter(logging.getLogger(name))
@contextlib.contextmanager
def safe_replacement(path, *open_args, mode=None, **open_kws):
    """Context manager to replace the file at path via a temporary file.

    Yields a tempfile.NamedTemporaryFile created next to path (same
    directory, name prefixed with "<basename>."), which is renamed over
    path once the managed block finishes without an exception.
    On any failure the temporary file is removed and path is left as-is.

    open_args, open_kws: passed to tempfile.NamedTemporaryFile;
      "delete", "dir" and "prefix" keywords are always overridden.
    mode: permission bits for the new file. If not specified, bits of
      the existing path are reused when it can be lstat()-ed.
    """
    if mode is None:
        try:
            mode = stat.S_IMODE(os.lstat(path).st_mode)
        except OSError:
            pass  # nothing to copy permission bits from
    dst_dir, dst_name = os.path.split(path)
    open_kws['delete'] = False
    open_kws['dir'] = dst_dir
    open_kws['prefix'] = dst_name + '.'
    with tempfile.NamedTemporaryFile(*open_args, **open_kws) as tmp:
        tmp_path = tmp.name
        try:
            if mode is not None:
                os.fchmod(tmp.fileno(), mode)
            yield tmp
            if not tmp.closed:
                tmp.flush()
            os.rename(tmp_path, path)
        finally:
            # After a successful rename the temporary name is gone
            # already, hence errors from this cleanup are ignored.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
@contextlib.contextmanager
def cleanup_on_err(file_obj):
    """Context manager that discards file_obj if the managed block fails.

    On any exception raised inside the block, the file named by
    file_obj.name is removed (errors from removal are ignored), file_obj
    is closed, and the exception is re-raised.
    On success, nothing is done to file_obj.
    """
    try:
        yield
    except BaseException:
        try:
            os.unlink(file_obj.name)
        except OSError:
            pass
        file_obj.close()
        raise
class EmbedTagFinder(HTMLParser):
    """HTML parser that records tags referencing files which can be embedded.

    Recognized tags are <script src=...>, <link rel="stylesheet" href=...>
    and <img src=...>. After feed() and close(), subst_list has one dict
    per such tag, with the following keys:
      tag: tag name.
      start: (line, column) of the opening "<", as returned by getpos().
      end: [line, column] of the first character past the closing ">"
        of the element (end tag for paired tags, the tag itself otherwise).
      path: referenced path, with html entities and url-quoting decoded.
      attrs: dict of the remaining tag attributes.
    """

    def __init__(self):
        super().__init__()
        # (dict, key) pair - where next updatepos() call should store position
        self.store_pos = None
        # subst_open: tag name -> info on last matching tag not closed yet
        self.subst_open, self.subst_list = {}, []

    def decode_path(self, path):
        """Return path from tag attribute with entities/url-quoting decoded."""
        return url_unquote(html.unescape(path))

    def updatepos(self, i, k):
        # Assumption here is that html.parser.HTMLParser calls
        # "self.updatepos(i, k)" right after parsing a tag and calling
        # its handle_starttag/handle_endtag handler, as it is in the
        # Python-3.5.2 stdlib module.
        res = super().updatepos(i, k)
        if self.store_pos:
            (dst, key), self.store_pos = self.store_pos, None
            # At this point getpos() already refers to the character right
            # after closing ">", so it is exclusive end of the tag as-is.
            # Adding anything to it would swallow a character after the tag.
            dst[key] = list(self.getpos())
        return res

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'script' and attrs.get('src'):
            path = attrs.pop('src')
        elif tag == 'link' and attrs.get('rel') == 'stylesheet' and attrs.get('href'):
            path = attrs.pop('href')
        elif tag == 'img' and attrs.get('src'):
            path = attrs.pop('src')
        else:
            return
        if tag in self.subst_open:  # standalone tags like "<link ...>"
            # Previous tag of this kind never got an end tag,
            # so it ends where its own start tag did.
            self.handle_endtag(tag, end=self.subst_open[tag]['next'])
        info = dict(path=self.decode_path(path), pos=self.getpos(), attrs=attrs)
        self.subst_open[tag] = info
        # Position right after this start tag will be stored as info['next']
        self.store_pos = info, 'next'

    def handle_endtag(self, tag, end=None):
        subst_info = self.subst_open.pop(tag, None)
        if not subst_info:
            return
        subst = dict(
            tag=tag, start=subst_info['pos'], end=end,
            path=subst_info['path'], attrs=subst_info['attrs'])
        self.subst_list.append(subst)
        if not end:
            # Real end tag - its end position is picked up in updatepos()
            self.store_pos = subst, 'end'

    def close(self):
        super().close()
        # Flush tags that were never closed, ending them at their start tag
        for tag, s in list(self.subst_open.items()):
            self.handle_endtag(tag, end=s['next'])
def dump_fat_html(src_path, dst):
    """Write copy of an html file to dst, with linked js/css/img embedded.

    src_path: path to html file, or to a directory with "index.html" in it.
      Paths of linked files are resolved relative to the directory of that
      html file, with any leading "/" stripped from them.
    dst: text-mode file-like object, only write() and flush() are used.

    <script src=...> and <link rel="stylesheet" href=...> elements are
    replaced by inline <script>/<style> ones with contents of the file,
    <img src=...> gets its src replaced by a base64 "data:" url.

    Uses module-level "log" logger, which is set up by main().
    Raises OSError if html file or any of the linked files cannot be read.
    """
    src_path = pathlib.Path(src_path)
    if src_path.is_dir():
        src_path, src_dir = src_path / 'index.html', src_path
    else:
        src_dir = src_path.parent

    # Whole document is processed as one decoded string, because parser
    # reports positions as (line, column) in characters of that text,
    # while tell()/seek() positions of a text file are opaque values
    # and are not character counts (e.g. with non-ascii chars or "\r\n").
    with open(bytes(src_path), 'r') as src:
        text = src.read()

    log.debug('Parsing html file...')
    proc = EmbedTagFinder()
    proc.feed(text)
    proc.close()

    # Offset of the first character of each "\n"-terminated line in text
    line_starts = [0]
    for line in text.split('\n')[:-1]:
        line_starts.append(line_starts[-1] + len(line) + 1)

    def offset(pos):
        # Convert parser's (1-based line, 0-based column) to offset in text
        line, col = pos
        return min(line_starts[line - 1] + col, len(text))

    # Resolve all start/end pos to offset int, get sorted (by pos) list of replacements
    subst_list = proc.subst_list
    for s in subst_list:
        # End is resolved via its own line, as element can span several lines
        s.update(a=offset(s['start']), b=offset(s['end']))
    subst_list = sorted(subst_list, key=op.itemgetter('a'))

    # Check for overlaps - can't handle that case, and shouldn't be necessary here
    b0 = None
    for s in subst_list:
        if b0 is not None:
            assert b0 <= s['a'], [s['a'], b0]
        b0 = s['b']

    # Cache all the substitute data
    # XXX: can be kinda large, easy to optimize
    log.debug('Gathering replacement data...')
    for s in subst_list:
        tag = s['tag']
        p = (src_dir / s['path'].lstrip('/')).resolve()
        log.debug('Embedding file (tag: {}): {}', tag, p)
        s_src = text[s['a']:s['b']]
        # Keep any whitespace at the end of replaced span in the output
        s['src'] = s_src.rstrip()
        s['b'] -= len(s_src) - len(s['src'])
        if tag in ['script', 'link']:
            if tag == 'link':
                tag = 'style'
            with open(bytes(p), 'r') as inc:
                inc_text = inc.read()
            if not inc_text.endswith('\n'):
                inc_text += '\n'
            s['subst'] = '<{}>\n{}</{}>'.format(tag, inc_text, tag)
        elif tag == 'img':
            attr_str = ' '.join(
                (k if v is None else '{}="{}"'.format(k, html.escape(v, quote=True)))
                for k, v in s['attrs'].items())
            if attr_str:
                attr_str = ' ' + attr_str
            mime, enc = mimetypes.guess_type(str(p))
            if not mime:
                # Unknown file type - use generic type instead of "None"
                mime = 'application/octet-stream'
            with open(bytes(p), 'rb') as inc:
                img = inc.read()
            img = 'data:{};base64,{}'.format(
                mime, base64.standard_b64encode(img).decode())
            s['subst'] = '<img{} src="{}">'.format(attr_str, img)
        else:
            raise ValueError(tag)
        log.debug(
            'Embedded file (tag: {}, name: {}), replacing {} chars with {}',
            tag, p.name, len(s['src']), len(s['subst']))

    # Copy contents, inserting substitutions
    log.debug('Assembling new html...')
    pos = 0
    for s in subst_list:
        dst.write(text[pos:s['a']])
        dst.write(s['subst'])
        pos = s['b']
    dst.write(text[pos:])
    dst.flush()
    log.debug('Finished')
def main(args=None):
    """Command-line entry point.

    args: list of command-line arguments to parse,
      sys.argv[1:] is used if not specified.

    Sets up the module-level "log" logger, then writes html file with
    embedded resources either to the specified destination path
    (replacing it only after successful run) or to stdout.
    """
    global log
    import argparse
    parser = argparse.ArgumentParser(
        description='Create "fat" html file with all the linked stuff (js, css, img) embedded in it.')
    parser.add_argument('path',
        help='HTML file to operate on. Paths to linked files are assumed to be relative to it.')
    parser.add_argument('dst_path', nargs='?',
        help='Path to output resulting file to. If not specified, stdout will be used.')
    parser.add_argument('--debug', action='store_true', help='Verbose operation mode.')
    opts = parser.parse_args(sys.argv[1:] if args is None else args)

    logging.basicConfig(level=logging.DEBUG if opts.debug else logging.WARNING)
    log = get_logger('main')

    if opts.dst_path:
        # Output goes to a temporary file, moved into place on success
        with safe_replacement(opts.dst_path, 'a+') as tmp:
            dump_fat_html(opts.path, tmp)
    else:
        dump_fat_html(opts.path, sys.stdout)
# Run as a script: use return value of main() as the process exit status
if __name__ == '__main__':
    sys.exit(main())