Paged JSON support, 'highlight' and 'snippet' URL parameters #61

Open: wants to merge 3 commits into base: master
4 changes: 2 additions & 2 deletions views/results.tpl
@@ -7,8 +7,8 @@
</div>
%if len(res) > 0:
<div id="downloads">
<a href="./json?{{query_string}}">JSON</a>
<a href="./csv?{{query_string}}">CSV</a>
<a href="./json?{{query_string}}&page=0">JSON</a>
<a href="./csv?{{query_string}}&page=0">CSV</a>
</div>
%end
<br style="clear: both">
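The explicit page=0 keeps these download links unpaged now that /json honours the page parameter (see the author comment on get_json below).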
126 changes: 90 additions & 36 deletions webui.py
@@ -1,4 +1,12 @@
#!/usr/bin/env python
+#{{{ debug
+# debug
+from __future__ import print_function
+import sys
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+#}}}
#{{{ imports
import os
import bottle
@@ -7,13 +15,20 @@
import datetime
import glob
import hashlib
-import json
import csv
import StringIO
import ConfigParser
import string
import shlex
import urllib

Author comment: ujson is quite a bit faster, but there seem to be some concerns about its correctness.

+# use ujson if available (faster than the built-in json)
+try:
+    import ujson as json
+except ImportError:
+    import json
+    print("ujson module not found, using (slower) built-in json module instead")

# import recoll and rclextract
try:
from recoll import recoll
@@ -34,7 +49,7 @@
'context': 30,
'stem': 1,
'timefmt': '%c',
-'dirdepth': 3,
+'dirdepth': 2,
Author comment: the web interface can get unbearably slow with a deep tree; setting the default depth to 2 solves this issue (see the sketch after this hunk).

'maxchars': 500,
'maxresults': 0,
'perpage': 25,
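To see why the depth default matters: get_dirs issues one glob per level, so the number of directories enumerated (and the filesystem work) grows quickly with depth. A minimal sketch of that enumeration, where /some/topdir is a placeholder for one of your recoll topdirs:

import glob
import os

top = '/some/topdir'  # placeholder path
for depth in (1, 2, 3):
    dirs = []
    for d in range(1, depth + 1):
        dirs += glob.glob(top + '/*' * d)
    dirs = [f for f in dirs if os.path.isdir(f)]
    print(depth, len(dirs))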
@@ -100,15 +115,26 @@ def normalise_filename(fn):
else:
out += "_"
return out

+def get_topdirs(db):
+    rclconf = rclconfig.RclConfig(os.path.dirname(db))
+    return rclconf.getConfParam('topdirs')
#}}}
#{{{ get_config
def get_config():
config = {}
# get useful things from recoll.conf
rclconf = rclconfig.RclConfig()
config['confdir'] = rclconf.getConfDir()
-config['dirs'] = [os.path.expanduser(d) for d in
-                  shlex.split(rclconf.getConfParam('topdirs'))]
+config['extradbs'] = []
+if 'RECOLL_EXTRA_DBS' in os.environ:
+    config['extradbs'] = os.environ.get('RECOLL_EXTRA_DBS').split(':')
+config['dirs'] = {}
+for dir in [os.path.expanduser(d) for d in
+            shlex.split(rclconf.getConfParam('topdirs'))]:
+    config['dirs'][dir] = os.path.join(config['confdir'], 'xapiandb')
+# global options as set by the default recoll config are also used for extra databases
+# when searching the entire set
config['stemlang'] = rclconf.getConfParam('indexstemminglanguages')
# get config from cookies or defaults
for k, v in DEFAULTS.items():
@@ -119,22 +145,27 @@ def get_config():
ncf = [f for f in cf if f in FIELDS]
config['csvfields'] = ' '.join(ncf)
config['fields'] = ' '.join(FIELDS)
+# get additional databases
+for e in config['extradbs']:
+    for t in [os.path.expanduser(d) for d in
+              shlex.split(get_topdirs(e))]:
+        config['dirs'][t] = e
# get mountpoints
config['mounts'] = {}
-for d in config['dirs']:
+for d,db in config['dirs'].items():
name = 'mount_%s' % urllib.quote(d,'')
config['mounts'][d] = select([bottle.request.get_cookie(name), 'file://%s' % d], [None, ''])
return config
#}}}
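The extra databases come in through the environment as a colon-separated list of xapiandb paths, each expected to live inside a recoll config directory (a sketch; the paths are made up):

# before starting the webui, e.g.:
#   RECOLL_EXTRA_DBS=/mnt/archive/.recoll/xapiandb:/srv/docs/.recoll/xapiandb python webui.py
import os

extradbs = []
if 'RECOLL_EXTRA_DBS' in os.environ:
    extradbs = os.environ['RECOLL_EXTRA_DBS'].split(':')
print(extradbs)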
#{{{ get_dirs
-def get_dirs(tops, depth):
+def get_dirs(dirs, depth):
    v = []
-    for top in tops:
-        dirs = [top]
+    for dir,d in dirs.items():
+        dirs = [dir]
        for d in range(1, depth+1):
-            dirs = dirs + glob.glob(top + '/*' * d)
+            dirs = dirs + glob.glob(dir + '/*' * d)
        dirs = filter(lambda f: os.path.isdir(f), dirs)
-        top_path = top.rsplit('/', 1)[0]
+        top_path = dir.rsplit('/', 1)[0]
        dirs = [w.replace(top_path+'/', '', 1) for w in dirs]
        v = v + dirs
    return ['<all>'] + v
@@ -149,6 +180,8 @@ def get_query():
'sort': select([bottle.request.query.get('sort'), SORTS[0][0]]),
'ascending': int(select([bottle.request.query.get('ascending'), 0])),
'page': int(select([bottle.request.query.get('page'), 0])),
+'highlight': int(select([bottle.request.query.get('highlight'), 1])),
+'snippets': int(select([bottle.request.query.get('snippets'), 1])),
}
return query
#}}}
@@ -164,7 +197,25 @@ def query_to_recoll_string(q):
#{{{ recoll_initsearch
def recoll_initsearch(q):
config = get_config()
-    db = recoll.connect(config['confdir'])
+    """ The reason for this somewhat elaborate scheme is to keep the
+    set size as small as possible by searching only those databases
+    with matching topdirs """
+    if q['dir'] == '<all>':
+        db = recoll.connect(config['confdir'], config['extradbs'])
+    else:
+        dbs = []
+        for d,db in config['dirs'].items():
+            if os.path.commonprefix([os.path.basename(d), q['dir']]) == q['dir']:
+                dbs.append(db)
+        if len(dbs) == 0:
+            # should not happen, using non-existing q['dir']?
+            db = recoll.connect(config['confdir'], config['extradbs'])
+        elif len(dbs) == 1:
+            # only one db (most common situation)
+            db = recoll.connect(os.path.dirname(dbs[0]))
+        else:
+            # more than one db with matching topdir, use them all
+            db = recoll.connect(dbs[0], dbs[1:])
db.setAbstractParams(config['maxchars'], config['context'])
query = db.query()
query.sortby(q['sort'], q['ascending'])
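The topdir matching used above, in isolation (a minimal sketch; the paths and query are made up):

import os

dirs = {'/home/me/docs': '/home/me/.recoll/xapiandb',
        '/mnt/archive': '/mnt/archive/.recoll/xapiandb'}
qdir = 'docs'
dbs = [db for d, db in dirs.items()
       if os.path.commonprefix([os.path.basename(d), qdir]) == qdir]
print(dbs)  # -> ['/home/me/.recoll/xapiandb']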
@@ -183,9 +234,10 @@ def endMatch(self):
return '</span>'
#}}}
#{{{ recoll_search
-def recoll_search(q, dosnippets=True):
Author comment: options are communicated through q, so there is no need for the dosnippets parameter.

+def recoll_search(q):
config = get_config()
tstart = datetime.datetime.now()
+highlighter = HlMeths()
results = []
query = recoll_initsearch(q)
nres = query.rowcount
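Since the options now travel inside q, a caller just flips the relevant keys before searching; get_csv below does exactly this (a sketch, valid inside a bottle request context):

q = get_query()    # now includes 'page', 'highlight' and 'snippets'
q['snippets'] = 0  # skip snippet generation, as the CSV export does
res, nres, timer = recoll_search(q)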
@@ -199,31 +251,33 @@ def recoll_search(q, dosnippets=True):
q['page'] = 1
offset = (q['page'] - 1) * config['perpage']

-if query.rowcount > 0:
+if query.rowcount > 0 and offset < query.rowcount:
if type(query.next) == int:
query.next = offset
else:
query.scroll(offset, mode='absolute')

-    highlighter = HlMeths()
-    for i in range(config['perpage']):
-        try:
-            doc = query.fetchone()
-        except:
-            break
-        d = {}
-        for f in FIELDS:
-            v = getattr(doc, f)
-            if v is not None:
-                d[f] = v.encode('utf-8')
-            else:
-                d[f] = ''
-        d['label'] = select([d['title'], d['filename'], '?'], [None, ''])
-        d['sha'] = hashlib.sha1(d['url']+d['ipath']).hexdigest()
-        d['time'] = timestr(d['mtime'], config['timefmt'])
-        if dosnippets:
-            d['snippet'] = query.makedocabstract(doc, highlighter).encode('utf-8')
-        results.append(d)
+    for i in range(config['perpage']):
+        try:
+            doc = query.fetchone()
+        except:
+            break
+        d = {}
+        for f in FIELDS:
+            v = getattr(doc, f)
+            if v is not None:
+                d[f] = v.encode('utf-8')
+            else:
+                d[f] = ''
+        d['label'] = select([d['title'], d['filename'], '?'], [None, ''])
+        d['sha'] = hashlib.sha1(d['url']+d['ipath']).hexdigest()
+        d['time'] = timestr(d['mtime'], config['timefmt'])
+        if q['snippets']:
+            if q['highlight']:
+                d['snippet'] = query.makedocabstract(doc, highlighter).encode('utf-8')
+            else:
+                d['snippet'] = query.makedocabstract(doc).encode('utf-8')
+        results.append(d)
tend = datetime.datetime.now()
return results, nres, tend - tstart
#}}}
@@ -315,24 +369,24 @@ def edit(resnum):
@bottle.route('/json')
def get_json():
query = get_query()
-query['page'] = 0
Author comment: this allows the generation of paged JSON; set page=0 or omit the page parameter to get an unpaged result.

qs = query_to_recoll_string(query)
bottle.response.headers['Content-Type'] = 'application/json'
bottle.response.headers['Content-Disposition'] = 'attachment; filename=recoll-%s.json' % normalise_filename(qs)
res, nres, timer = recoll_search(query)

-return json.dumps({ 'query': query, 'results': res })
+return json.dumps({ 'query': query, 'nres': nres, 'results': res })
#}}}
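A client can then walk the pages until the reported total is exhausted (a minimal sketch using urllib; host, port and the query string are made up, and 25 is the default perpage):

import json
import urllib

base = 'http://localhost:8080/json?query=%s' % urllib.quote('some search')
page = 1
while True:
    data = json.loads(urllib.urlopen(base + '&page=%d' % page).read())
    for r in data['results']:
        print(r['label'])
    if page * 25 >= data['nres']:
        break
    page += 1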
#{{{ csv
@bottle.route('/csv')
def get_csv():
config = get_config()
query = get_query()
query['page'] = 0
+query['snippets'] = 0
qs = query_to_recoll_string(query)
bottle.response.headers['Content-Type'] = 'text/csv'
bottle.response.headers['Content-Disposition'] = 'attachment; filename=recoll-%s.csv' % normalise_filename(qs)
-res, nres, timer = recoll_search(query, False)
Author comment: options are communicated through query, so there is no need for the extra parameter.

+res, nres, timer = recoll_search(query)
si = StringIO.StringIO()
cw = csv.writer(si)
fields = config['csvfields'].split()
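For reference, this StringIO/csv.writer pairing builds the whole attachment body in memory before it is returned; the same pattern in isolation (made-up rows; on Python 3 this would be io.StringIO):

import csv
import StringIO  # Python 2, matching webui.py

si = StringIO.StringIO()
cw = csv.writer(si)
cw.writerow(['doc.pdf', 'A title'])  # illustrative row
print(si.getvalue())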
@@ -355,7 +409,7 @@ def set():
config = get_config()
for k, v in DEFAULTS.items():
bottle.response.set_cookie(k, str(bottle.request.query.get(k)), max_age=3153600000, expires=315360000)
-for d in config['dirs']:
+for d,db in config['dirs'].items():
cookie_name = 'mount_%s' % urllib.quote(d, '')
bottle.response.set_cookie(cookie_name, str(bottle.request.query.get('mount_%s' % d)), max_age=3153600000, expires=315360000)
bottle.redirect('./')