#!/usr/bin/env python2
"""
wwz.py - Serve web content directly from a zip file.

Invoking Python with -S gives a slight speedup, although most of the latency
appears to be on the Dreamhost side.
"""
from __future__ import print_function
import cgi
import os
import re
import sys
import time
import threading
import traceback

from email.utils import formatdate  # for HTTP header

# NOTE: this is not the full 'zipfile' module (in Python), but it has the
# functionality we need (in C).  We only want to extract zip files, and this
# is faster.
import zipimport

# Performance note: with 40K files in a 61 MB zip file, this is even slower
# than zipimport!  ~700 ms vs. ~450 ms.
#
# import zipfile
# with zipfile.ZipFile(wwz_abs_path) as z:
#   body = z.read(rel_path)
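#
# For comparison, a minimal sketch of the zipimport path this app takes
# below; get_data() raises IOError when the member doesn't exist:
#
#   z = zipimport.zipimporter(wwz_abs_path)
#   body = z.get_data(rel_path)   # rel_path like 'spam/eggs/index.html'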

# To find out if a zip file is cached, you have to join request.log and
# trace.log.  Will this show up in Apache logs?  Or is that generated by
# mod_fcgi?
#   UNIQUE_ID Wf-SE0Wj2GQAADYR0E8AAAAF
# TODO: use unique_id to join with access.log.
# (pid, request_counter) can be used to join request.log and trace.log.

# Schema notes:
# - timestamp: should it have a unix-timestamp type?  Automatically seconds
#   as a float?  Then it could be printed automatically.
# - request_counter: int
# - everything else is string
REQUEST_LOG_SCHEMA = [
    ('unique_id', 'string'),
    ('request_counter', 'integer'),
    ('thread_name', 'string'),
    ('timestamp', 'double'),
    ('request_uri', 'string'),
]

TRACE_SCHEMA = [
    ('unique_id', 'string'),
    ('request_counter', 'integer'),
    ('event_name', 'string'),
    ('timestamp', 'double'),
]
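
# Example request.log row (TSV, after the header row that TabularLogFile
# writes); the unique_id and URL values here are hypothetical:
#
#   Wf-SE0Wj2GQAADYR0E8AAAAF  3  Thread-2  1509822100.5  /dir/foo.wwz/a.html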

class LogFile(object):
  """Interface that the app uses."""

  def Append(self, row):
    pass

  def Flush(self):
    pass


class NoLogFile(LogFile):
  pass


# No locking, because we assume that records are less than 4096 bytes, and
# appends of that size are atomic:
# http://www.notthewizard.com/2014/06/17/are-files-appends-really-atomic/

class TabularLogFile(LogFile):
  """
  Manages:
  - schema (data integrity)
  - file flushing policy
  - encoding (TSV)
  """

  def __init__(self, schema, path):
    self.f = open(path, 'w')  # not append
    header = [name for name, _ in schema]
    # TODO: Write .schema.csv?
    self.f.write('\t'.join(header))
    self.f.write('\n')

  def Append(self, row):
    self.f.write('\t'.join(str(cell) for cell in row))
    self.f.write('\n')

  def Flush(self):
    self.f.flush()
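
# A minimal usage sketch (this mirrors how main() below wires it up; the
# path is hypothetical):
#
#   request_log = TabularLogFile(REQUEST_LOG_SCHEMA, '/tmp/x.request.log')
#   request_log.Append(('-', 1, 'MainThread', time.time(), '/foo.wwz/'))
#   request_log.Flush()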

class RequestTracer(object):

  def __init__(self):
    self.events = []
    self.start_time = time.time()

  def Event(self, msg):
    """Record a timestamp and string."""
    ts = time.time() - self.start_time
    self.events.append((ts * 1000, msg))  # milliseconds

  def GetEvents(self):
    return self.events


def log(msg, *args):
  """Print to stderr.  Shows up in error.log."""
  if args:
    msg = msg % args
  print(msg, file=sys.stderr)

HTML_UTF8 = ('Content-Type', 'text/html; charset=utf-8')


def _HtmlHeader(title, css_url):
  # Note: quote=True so a quote in css_url can't break out of the href
  # attribute.
  return """
<!DOCTYPE html>
<html>
<head>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>%s</title>
  <link rel="stylesheet" type="text/css" href="%s" />
</head>
<body>
""" % (cgi.escape(title), cgi.escape(css_url, quote=True))


def _HtmlFooter():
  return """
</body>
</html>
"""

def Ok(start_response, headers, body):
  start_response('200 OK', headers)
  return [body]


def BadRequest(start_response, msg, *args):
  """
  Usage: return BadRequest(start_response, 'message %r', arg)
  """
  if args:
    msg = msg % args
  start_response('400 Bad Request', [HTML_UTF8])
  body = """\
<h1>wwz: 400 Bad Request</h1>
<p>%s</p>
""" % cgi.escape(msg)
  return [body]


def NotFound(start_response, msg, *args):
  """
  Usage: return NotFound(start_response, 'message %r', arg)
  """
  if args:
    msg = msg % args
  start_response('404 Not Found', [HTML_UTF8])
  body = """\
<h1>wwz: 404 Not Found</h1>
<p>%s</p>
""" % cgi.escape(msg)
  return [body]


# Don't print an unsanitized request path into the Location header, which
# would allow header injection.  flup doesn't appear to take care of this!
#
# Be conservative.
REDIRECT_RE = re.compile(r'^[a-zA-Z0-9_./-]*$')
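# For example, 'spam/eggs/' and '_tmp/soil' match, while anything containing
# spaces, '%', '?', or CR/LF is rejected, so a crafted path can't smuggle
# extra response headers through the Location value.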

def Redirect(start_response, location):
  """
  Usage: return Redirect(start_response, 'http://example.com')
  """
  start_response('302 Found', [HTML_UTF8, ('Location', location)])
  body = """\
<h1>wwz: 302 Found</h1>
<p>%s</p>
""" % cgi.escape(location)
  return [body]


DEBUG = False
#DEBUG = True

def _MakeListing(page_data, rel_paths, dir_prefix):
  dirs = set()
  files = []

  assert dir_prefix == '' or dir_prefix.endswith('/'), dir_prefix

  for rel_path in rel_paths:
    if rel_path == dir_prefix:
      continue  # don't list yourself
    if not rel_path.startswith(dir_prefix):
      continue  # not under this dir

    if rel_path == dir_prefix + 'index.html':
      page_data['index_html'] = True

    zip_rel_path = rel_path[len(dir_prefix):]

    # Here we assume that dirs end with /, but files don't.  That appears to
    # be true in zips.
    slash1 = zip_rel_path.find('/')
    if slash1 == -1:
      # foo -> file is foo
      files.append(zip_rel_path)
    else:
      # Note: we can have a rel_path _tmp/soil/, but NOT _tmp/
      dir_name = zip_rel_path[:slash1+1]  # include /
      dirs.add(dir_name)

  page_data['files'].extend(sorted(files))
  page_data['dirs'].extend(sorted(dirs))

def _MakeCrumb2(crumb2, wwz_name, dir_prefix):
  parts = [p for p in dir_prefix.split('/') if p]
  anchors = [wwz_name] + parts
  urls = [None] * len(anchors)

  n_inside = len(anchors)
  for i in xrange(n_inside - 1):
    dots = ['..'] * (n_inside - i - 1)
    urls[i] = '/'.join(dots) + '/-wwz-index'

  crumb2['anchors'] = anchors
  crumb2['urls'] = urls
  return n_inside
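
# Worked example (derived from the loop above): with wwz_name 'foo.wwz' and
# dir_prefix 'spam/eggs/', anchors is ['foo.wwz', 'spam', 'eggs'] and urls is
# ['../../-wwz-index', '../-wwz-index', None] -- relative links that climb
# back up to each enclosing directory's index page.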

def _MakeCrumb1(crumb1, n_inside, http_host, wwz_base_url):
  #
  # Now go even further back
  #
  parts = [p for p in wwz_base_url.split('/') if p]
  parts.pop()  # remove .wwz

  anchors = [http_host] + parts
  urls = [None] * len(anchors)

  n_before = len(anchors)
  for i in xrange(n_before):
    dots = ['..'] * (n_inside + n_before - i - 1)
    urls[i] = '/'.join(dots) + '/'  # use web server index

  crumb1['anchors'] = anchors
  crumb1['urls'] = urls

def _Breadcrumb(crumb, last_slash=False):
  yield '<div class="breadcrumb">\n'

  i = 0
  for anchor, link in zip(crumb['anchors'], crumb['urls']):
    if i != 0:
      yield '/\n'  # separator
    if link is None:
      yield '<span>%s</span>\n' % cgi.escape(anchor)
    else:
      yield '<a href="%s">%s</a>\n' % (cgi.escape(link, quote=True),
                                       cgi.escape(anchor))
    i += 1

  if last_slash:
    yield '/\n'
  yield '</div>\n\n'


def _EntriesHtml(heading, entries, url_suffix=''):
  yield '<h1>%s</h1>\n' % cgi.escape(heading)

  if len(entries):
    for entry in entries:
      escaped = cgi.escape(entry, quote=True)
      yield '<a href="%s">%s</a> <br/>\n' % (escaped + url_suffix, escaped)
  else:
    yield '<p><i>(no entries)</i></p>\n'
  yield '\n'

class App(object):

  def __init__(self, request_log, trace_log, log_dir, pid):
    self.traces = []
    self.request_log = request_log
    self.trace_log = trace_log
    self.log_dir = log_dir

    # path -> zipimporter instance.  The zip files are assumed to be
    # immutable: if you mutate one, you have to restart the FastCGI process.
    # Protected by zip_files_lock, since multiple threads may access this
    # state.
    self.zip_files = {}
    self.zip_files_lock = threading.Lock()

    # for monitoring
    self.pid = pid
    self.request_counter = 0

  def StatusPage(self, environ, start_response):
    """Serve the status page so we can monitor this process.

    Note: we could also have a JSON status page.
    """
    start_response('200 OK', [HTML_UTF8])

    title = 'Status of wwz process %d' % self.pid
    yield _HtmlHeader(title, '-wwz-css')

    yield '''
<div style="text-align: right">
  <a href="..">Up</a> | <a href="%s">wwz Index</a>
</div>
''' % '-wwz-index'

    yield '<h1>%s</h1>\n' % title

    # By default, I'm seeing a thread pool of 5.  Does more concurrency help?
    th = threading.current_thread()
    yield '<p>thread ID = %d</p>' % th.ident
    yield '<p>thread name = %s</p>' % cgi.escape(th.getName())
    yield '<p>current time = %s</p>' % time.time()
    yield '<p>num requests = %d</p>' % self.request_counter

    yield '<h3>zip files open</h3>'
    for name in self.zip_files:  # is this thread safe?
      yield '<p>%s</p>' % cgi.escape(name)

    yield '<h3>traces</h3>'
    for trace in self.traces:  # is this thread safe?
      yield '<p><pre>'
      for ts, event in trace:  # ts is in milliseconds
        yield '%.2f %s\n' % (ts, cgi.escape(event))
      yield '</pre></p>'

    yield '<h3>FastCGI Environment</h3>'
    yield '<table>'
    for k, v in sorted(environ.items()):
      yield '<tr><td>%s</td><td><code>%s</code></td></tr>\n' % (
          cgi.escape(str(k)), cgi.escape(str(v)))
    yield '</table>'

    yield '<hr/>\n'
    yield _HtmlFooter()

  def IndexListing(self, start_response, http_host, wwz_base_url,
                   wwz_abs_path, rel_path, dir_prefix, last_modified):
    """
    wwz_base_url: /dir/foo.wwz
    wwz_abs_path: /home/andy/dir/foo.wwz
    """
    # 2024-05: Use the zipfile module, not zipimport, because it can list
    # files.
    import zipfile
    z = zipfile.ZipFile(wwz_abs_path)

    start_response('200 OK', [HTML_UTF8, last_modified])

    if DEBUG:
      log('rel_path = %r', rel_path)
      log('dir_prefix = %r', dir_prefix)

    # Suppose we have these request paths:
    #   dir/foo.wwz/spam/eggs/-wwz-index
    #   dir/foo.wwz/-wwz-index
    #
    # Then in both cases:
    #   wwz_base_url = /dir/foo.wwz
    #   wwz_abs_path = ~/www/dir/foo.wwz
    #
    # rel_path =
    #   spam/eggs/-wwz-index
    #   -wwz-index
    # dir_prefix =
    #   spam/eggs/
    #   ''
    wwz_name = os.path.basename(wwz_abs_path)
    title = '%s : %s' % (cgi.escape(wwz_name), cgi.escape(dir_prefix))
    yield _HtmlHeader(title, wwz_base_url + '/-wwz-css')

    yield '''
<div style="text-align: right">
  <a href="%s">wwz Status</a>
</div>
''' % (wwz_base_url + '/-wwz-status')

    page_data = {
        'files': [], 'dirs': [],
        # breadcrumb inside wwz
        'crumb2': {'anchors': [], 'urls': []},
        # then a breadcrumb UP TO wwz
        'crumb1': {'anchors': [], 'urls': []},
        # is there an index.html for this dir?
        'index_html': False,
    }
    _MakeListing(page_data, z.namelist(), dir_prefix)
    n_inside = _MakeCrumb2(page_data['crumb2'], wwz_name, dir_prefix)
    _MakeCrumb1(page_data['crumb1'], n_inside, http_host, wwz_base_url)

    if DEBUG:
      from pprint import pformat
      log('%s', pformat(page_data))
      log('')

    for chunk in _Breadcrumb(page_data['crumb1'], last_slash=True):
      yield chunk
    yield '<hr/>\n'
    for chunk in _Breadcrumb(page_data['crumb2']):
      yield chunk

    for chunk in _EntriesHtml('Files', page_data['files']):
      yield chunk
    for chunk in _EntriesHtml('Dirs', page_data['dirs'],
                              url_suffix='-wwz-index'):
      yield chunk

    if page_data['index_html']:
      yield '<hr />\n'
      yield '<p><a href=".">View index.html</a></p>\n'

    yield _HtmlFooter()

  def _LogException(self, unique_id, request_uri, exc_type, e, tb):
    # For now, create a file for each exception.  Use a simple name and a
    # simple format.  Eventually it might be nice to revive my simple UDP
    # server.
    out_path = os.path.join(self.log_dir, 'exception.%s.txt' % time.time())
    with open(out_path, 'w') as f:
      f.write(unique_id)
      f.write('\n')
      f.write(request_uri)
      f.write('\n')
      f.write('---\n')
      f.write(str(exc_type))
      f.write('\n')
      f.write('---\n')
      f.write(str(e))
      f.write('\n')
      f.write('---\n')
      traceback.print_tb(tb, None, f)  # no limit
      f.write('\n')

  def __call__(self, environ, start_response):
    """Wrap the real request in tracing."""
    # UNIQUE_ID comes from mod_unique_id, for joining logs
    unique_id = environ.get('UNIQUE_ID', '-')
    request_uri = environ.get('REQUEST_URI', '-')

    try:
      tracer = RequestTracer()

      self.request_counter += 1
      request_counter = self.request_counter  # copy into this thread for later

      th = threading.current_thread()  # new thread for every request
      entry = (unique_id, request_counter, th.getName(), time.time(),
               request_uri)
      self.request_log.Append(entry)

      try:
        for chunk in self.Respond(environ, start_response, tracer):
          yield chunk
      finally:
        # Make sure we don't lose any requests, since there are early
        # returns.  Flush to disk afterward.
        for ts, name in tracer.GetEvents():
          # Field order must match TRACE_SCHEMA: event_name, then timestamp.
          entry = (unique_id, request_counter, name, ts)
          self.trace_log.Append(entry)
        self.trace_log.Flush()
        self.request_log.Flush()

    except Exception:
      exc_type, e, tb = sys.exc_info()
      self._LogException(unique_id, request_uri, exc_type, e, tb)
      # NOTE: The WSGI server will catch this.  But it might be better to
      # let it restart!  That would clear the error that happens when the
      # zip file is updated.  I think I have to patch flup then.
      raise

  def Respond(self, environ, start_response, tracer):
    """Produce the HTTP response.  Called from multiple threads.

    Example:
      Given the rewrite rule in .htaccess, and the URL
        http://chubot.org/wwz-test/foo.wwz/a/b/c
      we get these CGI vars:
        PATH_INFO = /a/b/c
        REQUEST_URI = /wwz-test/foo.wwz/a/b/c
        DOCUMENT_ROOT = /home/chubot/chubot.org
    """
    request_uri = environ['REQUEST_URI']

    # PATH_INFO may be unset if you visit http://example.com/cgi-bin/wwz.py
    # with no trailing path.
    path_info = environ.get('PATH_INFO', '')
    if not path_info:
      chunks = list(self.StatusPage(environ, start_response))
      tracer.Event('StatusPage-end')
      return chunks

    doc_root = environ['DOCUMENT_ROOT']

    if DEBUG:
      log('REQUEST_URI = %r', request_uri)
      log('PATH_INFO = %r', path_info)
      log('DOCUMENT_ROOT = %r', doc_root)

    n = len(path_info)
    wwz_base_url = request_uri[:-n]  # /dir/foo.wwz
    wwz_abs_path = os.path.join(doc_root, wwz_base_url[1:])
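    # Worked example, using the values from the docstring above:
    #   len(PATH_INFO) = len('/a/b/c') = 6
    #   wwz_base_url = REQUEST_URI minus its last 6 chars = '/wwz-test/foo.wwz'
    #   wwz_abs_path = '/home/chubot/chubot.org/wwz-test/foo.wwz'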

    # Use the timestamp on the whole .zip file as the Last-Modified header.
    # If ANY file in the .zip is modified, consider the whole thing modified.
    # I think that is fine.
    try:
      mtime = os.path.getmtime(wwz_abs_path)
    except OSError as e:
      return NotFound(start_response, "Couldn't open wwz path %r",
                      wwz_abs_path)

    # https://stackoverflow.com/questions/225086/rfc-1123-date-representation-in-python
    last_modified = (
        'Last-Modified', formatdate(mtime, localtime=False, usegmt=True))

    rel_path = path_info[1:]  # remove leading /

    if rel_path == '-wwz-css':
      with open('wwz.css') as f:
        body = f.read()
      headers = [('Content-Type', 'text/css')]
      return Ok(start_response, headers, body)

    if rel_path == '-wwz-status':
      return list(self.StatusPage(environ, start_response))

    if rel_path == '-wwz-index' or rel_path.endswith('/-wwz-index'):
      dir_prefix = rel_path[:-len('-wwz-index')]
      return list(self.IndexListing(
          start_response, environ.get('HTTP_HOST', 'HOST'),
          wwz_base_url, wwz_abs_path,
          rel_path, dir_prefix, last_modified))

    tracer.Event('zip-begin')

    # NOTE: We are doing coarse-grained locking here.  Technically, we could
    # try not to lock when reading the zip file, but it's more complex.  We
    # don't know if two cold hits in a row go to the same zip file, and we
    # don't want to concurrently create duplicate objects.
    with self.zip_files_lock:
      try:
        z = self.zip_files[wwz_abs_path]
      except KeyError:
        tracer.Event('open-zip')
        try:
          z = zipimport.zipimporter(wwz_abs_path)
        except zipimport.ZipImportError as e:
          return NotFound(start_response, "Couldn't open wwz path %r",
                          wwz_abs_path)
        self.zip_files[wwz_abs_path] = z
        tracer.Event('cached-zip')

    tracer.Event('zip-end')

    is_binary = False

    # The zipimporter has directory entries, but we don't want to serve
    # empty files!
    if rel_path == '' or rel_path.endswith('/'):
      index_html = rel_path + 'index.html'
      try:
        body = z.get_data(index_html)
      except IOError as e:
        # No index.html - redirect to -wwz-index (RELATIVE URL)
        if REDIRECT_RE.match(rel_path):
          return Redirect(start_response, '-wwz-index')
        else:
          return BadRequest(start_response, 'Invalid path %r' % rel_path)

      headers = [HTML_UTF8, last_modified]
      return Ok(start_response, headers, body)

    # It's a file.  Guess the Content-Type from the extension.
    if rel_path.endswith('.html'):
      content_type = 'text/html'
    elif rel_path.endswith('.css'):
      content_type = 'text/css'
    elif rel_path.endswith('.js'):
      content_type = 'application/javascript'
    elif rel_path.endswith('.json'):
      content_type = 'application/json'
    elif rel_path.endswith('.png'):
      content_type = 'image/png'
      is_binary = True
    elif rel_path.endswith('.tar'):  # for _release/oil.tar
      # https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
      content_type = 'application/x-tar'
      is_binary = True
    else:
      content_type = 'text/plain'  # default

    try:
      body = z.get_data(rel_path)
    except IOError as e:
      return NotFound(start_response, 'Path %r not found in wwz archive',
                      rel_path)

    tracer.Event('data-read')

    if not is_binary:
      content_type = '%s; charset=utf-8' % content_type

    # Dreamhost does send ETag, so a semi-unique hash gets preserved.
    # TODO: Bake an md5sum into the .zip metadata?  Does this make the
    # browser send conditional GETs?  Do crawlers ever use this?
    #print 'ETag: %s' % hash(rel_path)

    headers = [('Content-Type', content_type), last_modified]

    chunks = Ok(start_response, headers, body)
    tracer.Event('request-end')
    return chunks

def main(argv):
  log_dir = argv[1]  # for exceptions

  pid = os.getpid()
  timestamp = time.strftime('%Y-%m-%d__%H-%M-%S')

  log_requests = os.getenv('WWZ_REQUEST_LOG')
  if log_requests:
    path1 = os.path.join(log_dir, '%s.%d.request.log' % (timestamp, pid))
    request_log = TabularLogFile(REQUEST_LOG_SCHEMA, path1)
  else:
    request_log = NoLogFile()

  trace = os.getenv('WWZ_TRACE_LOG')
  if trace:
    path2 = os.path.join(log_dir, '%s.%d.trace.log' % (timestamp, pid))
    trace_log = TabularLogFile(TRACE_SCHEMA, path2)
  else:
    trace_log = NoLogFile()

  # Global instance shared by all threads.
  app = App(request_log, trace_log, log_dir, pid)

  if os.getenv('FASTCGI'):
    from flup.server.fcgi import WSGIServer
    # OLD MODULE.  I tested this and it has the same 1.0 second delay, which
    # might be client DNS or Dreamhost.
    #from fcgi import WSGIServer

    # NOTE: debug=True shows tracebacks.
    WSGIServer(app, debug=True).run()
    #WSGIServer(app).run()
  else:
    from wsgiref.handlers import CGIHandler
    CGIHandler().run(app)


if __name__ == '__main__':
  main(sys.argv)
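
# A minimal invocation sketch (env var names are from main() above; the log
# directory path is hypothetical):
#
#   $ WWZ_REQUEST_LOG=1 WWZ_TRACE_LOG=1 FASTCGI=1 ./wwz.py ~/wwz-logs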