-
Notifications
You must be signed in to change notification settings - Fork 55
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Paged JSON support, 'highlight' and 'snippet' URL parameters #61
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,12 @@ | ||
#!/usr/bin/env python | ||
#{{{ debug | ||
# debug | ||
from __future__ import print_function | ||
import sys | ||
|
||
def eprint(*args, **kwargs): | ||
print(*args, file=sys.stderr, **kwargs) | ||
#}}} | ||
#{{{ imports | ||
import os | ||
import bottle | ||
|
@@ -7,13 +15,20 @@ | |
import datetime | ||
import glob | ||
import hashlib | ||
import json | ||
import csv | ||
import StringIO | ||
import ConfigParser | ||
import string | ||
import shlex | ||
import urllib | ||
|
||
# use ujson if available (faster than the built-in json) | ||
try: | ||
import ujson as json | ||
except ImportError: | ||
import json | ||
print("ujson module not found, using (slower) built-in json module instead") | ||
|
||
# import recoll and rclextract | ||
try: | ||
from recoll import recoll | ||
|
@@ -34,7 +49,7 @@ | |
'context': 30, | ||
'stem': 1, | ||
'timefmt': '%c', | ||
'dirdepth': 3, | ||
'dirdepth': 2, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The web interface can get unbearably slow with a deep tree; setting the default depth to 2 solves this issue. |
||
'maxchars': 500, | ||
'maxresults': 0, | ||
'perpage': 25, | ||
|
@@ -100,15 +115,26 @@ def normalise_filename(fn): | |
else: | ||
out += "_" | ||
return out | ||
|
||
def get_topdirs(db): | ||
rclconf = rclconfig.RclConfig(os.path.dirname(db)) | ||
return rclconf.getConfParam('topdirs') | ||
#}}} | ||
#{{{ get_config | ||
def get_config(): | ||
config = {} | ||
# get useful things from recoll.conf | ||
rclconf = rclconfig.RclConfig() | ||
config['confdir'] = rclconf.getConfDir() | ||
config['dirs'] = [os.path.expanduser(d) for d in | ||
shlex.split(rclconf.getConfParam('topdirs'))] | ||
config['extradbs'] = [] | ||
if 'RECOLL_EXTRA_DBS' in os.environ: | ||
config['extradbs'] = os.environ.get('RECOLL_EXTRA_DBS').split(':') | ||
config['dirs']={} | ||
for dir in [os.path.expanduser(d) for d in | ||
shlex.split(rclconf.getConfParam('topdirs'))]: | ||
config['dirs'][dir] = os.path.join(config['confdir'], 'xapiandb') | ||
# global options as set by the default recoll config are also used for extra databases | ||
# when searching the entire set | ||
config['stemlang'] = rclconf.getConfParam('indexstemminglanguages') | ||
# get config from cookies or defaults | ||
for k, v in DEFAULTS.items(): | ||
|
@@ -119,22 +145,27 @@ def get_config(): | |
ncf = [f for f in cf if f in FIELDS] | ||
config['csvfields'] = ' '.join(ncf) | ||
config['fields'] = ' '.join(FIELDS) | ||
# get additional databases | ||
for e in config['extradbs']: | ||
for t in [os.path.expanduser(d) for d in | ||
shlex.split(get_topdirs(e))]: | ||
config['dirs'][t] = e | ||
# get mountpoints | ||
config['mounts'] = {} | ||
for d in config['dirs']: | ||
for d,db in config['dirs'].items(): | ||
name = 'mount_%s' % urllib.quote(d,'') | ||
config['mounts'][d] = select([bottle.request.get_cookie(name), 'file://%s' % d], [None, '']) | ||
return config | ||
#}}} | ||
#{{{ get_dirs | ||
def get_dirs(tops, depth): | ||
def get_dirs(dirs, depth): | ||
v = [] | ||
for top in tops: | ||
dirs = [top] | ||
for dir,d in dirs.items(): | ||
dirs = [dir] | ||
for d in range(1, depth+1): | ||
dirs = dirs + glob.glob(top + '/*' * d) | ||
dirs = dirs + glob.glob(dir + '/*' * d) | ||
dirs = filter(lambda f: os.path.isdir(f), dirs) | ||
top_path = top.rsplit('/', 1)[0] | ||
top_path = dir.rsplit('/', 1)[0] | ||
dirs = [w.replace(top_path+'/', '', 1) for w in dirs] | ||
v = v + dirs | ||
return ['<all>'] + v | ||
|
@@ -149,6 +180,8 @@ def get_query(): | |
'sort': select([bottle.request.query.get('sort'), SORTS[0][0]]), | ||
'ascending': int(select([bottle.request.query.get('ascending'), 0])), | ||
'page': int(select([bottle.request.query.get('page'), 0])), | ||
'highlight': int(select([bottle.request.query.get('highlight'), 1])), | ||
'snippets': int(select([bottle.request.query.get('snippets'), 1])), | ||
} | ||
return query | ||
#}}} | ||
|
@@ -164,7 +197,25 @@ def query_to_recoll_string(q): | |
#{{{ recoll_initsearch | ||
def recoll_initsearch(q): | ||
config = get_config() | ||
db = recoll.connect(config['confdir']) | ||
""" The reason for this somewhat elaborate scheme is to keep the | ||
set size as small as possible by searching only those databases | ||
with matching topdirs """ | ||
if q['dir'] == '<all>': | ||
db = recoll.connect(config['confdir'], config['extradbs']) | ||
else: | ||
dbs=[] | ||
for d,db in config['dirs'].items(): | ||
if os.path.commonprefix([os.path.basename(d),q['dir']]) == q['dir']: | ||
dbs.append(db) | ||
if len(dbs) == 0: | ||
# should not happen, using non-existing q['dir']? | ||
db = recoll.connect(config['confdir'],config['extradbs']) | ||
elif len(dbs) == 1: | ||
# only one db (most common situation) | ||
db = recoll.connect(os.path.dirname(dbs[0])) | ||
else: | ||
# more than one db with matching topdir, use them all | ||
db = recoll.connect(dbs[0],dbs[1:]) | ||
db.setAbstractParams(config['maxchars'], config['context']) | ||
query = db.query() | ||
query.sortby(q['sort'], q['ascending']) | ||
|
@@ -183,9 +234,10 @@ def endMatch(self): | |
return '</span>' | ||
#}}} | ||
#{{{ recoll_search | ||
def recoll_search(q, dosnippets=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Options are communicated through |
||
def recoll_search(q): | ||
config = get_config() | ||
tstart = datetime.datetime.now() | ||
highlighter = HlMeths() | ||
results = [] | ||
query = recoll_initsearch(q) | ||
nres = query.rowcount | ||
|
@@ -199,31 +251,33 @@ def recoll_search(q, dosnippets=True): | |
q['page'] = 1 | ||
offset = (q['page'] - 1) * config['perpage'] | ||
|
||
if query.rowcount > 0: | ||
if query.rowcount > 0 and offset < query.rowcount: | ||
if type(query.next) == int: | ||
query.next = offset | ||
else: | ||
query.scroll(offset, mode='absolute') | ||
|
||
highlighter = HlMeths() | ||
for i in range(config['perpage']): | ||
try: | ||
doc = query.fetchone() | ||
except: | ||
break | ||
d = {} | ||
for f in FIELDS: | ||
v = getattr(doc, f) | ||
if v is not None: | ||
d[f] = v.encode('utf-8') | ||
else: | ||
d[f] = '' | ||
d['label'] = select([d['title'], d['filename'], '?'], [None, '']) | ||
d['sha'] = hashlib.sha1(d['url']+d['ipath']).hexdigest() | ||
d['time'] = timestr(d['mtime'], config['timefmt']) | ||
if dosnippets: | ||
d['snippet'] = query.makedocabstract(doc, highlighter).encode('utf-8') | ||
results.append(d) | ||
for i in range(config['perpage']): | ||
try: | ||
doc = query.fetchone() | ||
except: | ||
break | ||
d = {} | ||
for f in FIELDS: | ||
v = getattr(doc, f) | ||
if v is not None: | ||
d[f] = v.encode('utf-8') | ||
else: | ||
d[f] = '' | ||
d['label'] = select([d['title'], d['filename'], '?'], [None, '']) | ||
d['sha'] = hashlib.sha1(d['url']+d['ipath']).hexdigest() | ||
d['time'] = timestr(d['mtime'], config['timefmt']) | ||
if q['snippets']: | ||
if q['highlight']: | ||
d['snippet'] = query.makedocabstract(doc, highlighter).encode('utf-8') | ||
else: | ||
d['snippet'] = query.makedocabstract(doc).encode('utf-8') | ||
results.append(d) | ||
tend = datetime.datetime.now() | ||
return results, nres, tend - tstart | ||
#}}} | ||
|
@@ -315,24 +369,24 @@ def edit(resnum): | |
@bottle.route('/json') | ||
def get_json(): | ||
query = get_query() | ||
query['page'] = 0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This allows the generation of paged JSON, set |
||
qs = query_to_recoll_string(query) | ||
bottle.response.headers['Content-Type'] = 'application/json' | ||
bottle.response.headers['Content-Disposition'] = 'attachment; filename=recoll-%s.json' % normalise_filename(qs) | ||
res, nres, timer = recoll_search(query) | ||
|
||
return json.dumps({ 'query': query, 'results': res }) | ||
return json.dumps({ 'query': query, 'nres': nres, 'results': res }) | ||
#}}} | ||
#{{{ csv | ||
@bottle.route('/csv') | ||
def get_csv(): | ||
config = get_config() | ||
query = get_query() | ||
query['page'] = 0 | ||
query['snippets'] = 0 | ||
qs = query_to_recoll_string(query) | ||
bottle.response.headers['Content-Type'] = 'text/csv' | ||
bottle.response.headers['Content-Disposition'] = 'attachment; filename=recoll-%s.csv' % normalise_filename(qs) | ||
res, nres, timer = recoll_search(query, False) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Options are communicated through |
||
res, nres, timer = recoll_search(query) | ||
si = StringIO.StringIO() | ||
cw = csv.writer(si) | ||
fields = config['csvfields'].split() | ||
|
@@ -355,7 +409,7 @@ def set(): | |
config = get_config() | ||
for k, v in DEFAULTS.items(): | ||
bottle.response.set_cookie(k, str(bottle.request.query.get(k)), max_age=3153600000, expires=315360000) | ||
for d in config['dirs']: | ||
for d,db in config['dirs'].items(): | ||
cookie_name = 'mount_%s' % urllib.quote(d, '') | ||
bottle.response.set_cookie(cookie_name, str(bottle.request.query.get('mount_%s' % d)), max_age=3153600000, expires=315360000) | ||
bottle.redirect('./') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ujson is quite a bit faster, but there seem to be some concerns about correctness