Skip to content

Commit

Permalink
Add paginated workspace
Browse files Browse the repository at this point in the history
  • Loading branch information
vinayak-mehta committed Nov 20, 2018
1 parent 76f1785 commit c33c41c
Show file tree
Hide file tree
Showing 9 changed files with 385 additions and 282 deletions.
15 changes: 9 additions & 6 deletions excalibur/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,18 @@ class File(Base):

file_id = Column(String(ID_LEN), primary_key=True)
uploaded_at = Column(DateTime)
page_number = Column(Integer)
pages = Column(String(STR_LEN))
total_pages = Column(Integer)
extract_pages = Column(Text)
filename = Column(String(STR_LEN))
filepath = Column(String(STR_LEN))
has_image = Column(Boolean, default=False)
imagename = Column(String(STR_LEN))
imagepath = Column(String(STR_LEN))
file_dimensions = Column(Text)
image_dimensions = Column(Text)
filenames = Column(Text)
filepaths = Column(Text)
imagenames = Column(Text)
imagepaths = Column(Text)
filedims = Column(Text)
imagedims = Column(Text)
detected_areas = Column(Text)


Expand All @@ -42,7 +46,6 @@ class Job(Base):
__tablename__ = "jobs"

job_id = Column(String(ID_LEN), primary_key=True)
page_numbers = Column(Text)
datapath = Column(String(STR_LEN), default=None)
render_files = Column(Text, default=json.dumps([]))
is_finished = Column(Boolean, default=False)
Expand Down
135 changes: 80 additions & 55 deletions excalibur/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
import datetime as dt

import camelot
from camelot.core import TableList
from camelot.parsers import Lattice, Stream

from . import configuration as conf
from .models import File, Rule, Job
from .settings import Session
from .utils.file import mkdirs
from .utils.task import (save_page, get_page_layout, get_file_dimensions,
get_image_dimensions)
from .utils.task import (get_pages, save_page, get_page_layout, get_file_dim,
get_image_dim)


def split(file_id):
Expand Down Expand Up @@ -54,59 +56,70 @@ class GhostscriptNotFound(Exception): pass

session = Session()
file = session.query(File).filter(File.file_id == file_id).first()
save_page(file.filepath, file.page_number)

filename = 'page-{}.pdf'.format(file.page_number)
filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
imagename = ''.join([filename.replace('.pdf', ''), '.png'])
imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

gs_call = [
'-q',
'-sDEVICE=png16m',
'-o',
imagepath,
'-r600',
filepath
]
gs = get_executable()
gs_call.insert(0, gs)
process = subprocess.Popen(gs_call)
out = process.communicate()[0]
ret = process.wait()

extract_pages, total_pages = get_pages(file.filepath, file.pages)

filenames, filepaths, imagenames, imagepaths, filedims, imagedims, detected_areas = ({} for i in range(7))
for page in extract_pages:
# extract into single-page PDF
save_page(file.filepath, page)

filename = 'page-{}.pdf'.format(page)
filepath = os.path.join(conf.PDFS_FOLDER, file_id, filename)
imagename = ''.join([filename.replace('.pdf', ''), '.png'])
imagepath = os.path.join(conf.PDFS_FOLDER, file_id, imagename)

# convert single-page PDF to PNG
gs_call = [
'-q',
'-sDEVICE=png16m',
'-o',
imagepath,
'-r600',
filepath
]
gs = get_executable()
gs_call.insert(0, gs)
process = subprocess.Popen(gs_call)
out = process.communicate()[0]
ret = process.wait()

filenames[page] = filename
filepaths[page] = filepath
imagenames[page] = imagename
imagepaths[page] = imagepath
filedims[page] = get_file_dim(filepath)
imagedims[page] = get_image_dim(imagepath)

lattice_areas, stream_areas = (None for i in range(2))
# lattice
parser = Lattice()
tables = parser.extract_tables(filepath)
if len(tables):
lattice_areas = []
for table in tables:
x1, y1, x2, y2 = tables[0]._bbox
lattice_areas.append((x1, y2, x2, y1))
# stream
parser = Stream()
tables = parser.extract_tables(filepath)
if len(tables):
stream_areas = []
for table in tables:
x1, y1, x2, y2 = tables[0]._bbox
stream_areas.append((x1, y2, x2, y1))

detected_areas[page] = {
'lattice': lattice_areas, 'stream': stream_areas}

file.extract_pages = json.dumps(extract_pages)
file.total_pages = total_pages
file.has_image = True
file.imagename = imagename
file.imagepath = imagepath

file_dimensions = get_file_dimensions(filepath)
image_dimensions = get_image_dimensions(imagepath)
pdf_width_scaler = image_dimensions[0] / float(file_dimensions[0])
pdf_height_scaler = image_dimensions[1] / float(file_dimensions[1])

file.file_dimensions = json.dumps(file_dimensions)
file.image_dimensions = json.dumps(image_dimensions)

lattice_areas, stream_areas = [None] * 2
# lattice
tables = camelot.read_pdf(filepath, flavor='lattice')
if len(tables):
lattice_areas = []
for table in tables:
x1, y1, x2, y2 = tables[0]._bbox
lattice_areas.append((x1, y2, x2, y1))
# stream
tables = camelot.read_pdf(filepath, flavor='stream')
if len(tables):
stream_areas = []
for table in tables:
x1, y1, x2, y2 = tables[0]._bbox
stream_areas.append((x1, y2, x2, y1))

detected_areas = {
'lattice': lattice_areas,
'stream': stream_areas
}
file.filenames = json.dumps(filenames)
file.filepaths = json.dumps(filepaths)
file.imagenames = json.dumps(imagenames)
file.imagepaths = json.dumps(imagepaths)
file.filedims = json.dumps(filedims)
file.imagedims = json.dumps(imagedims)
file.detected_areas = json.dumps(detected_areas)

session.commit()
Expand All @@ -124,7 +137,19 @@ def extract(job_id):

rule_options = json.loads(rule.rule_options)
flavor = rule_options.pop('flavor')
tables = camelot.read_pdf(file.filepath, pages=job.page_numbers, flavor=flavor.lower(), **rule_options)
pages = rule_options.pop('pages')

tables = []
filepaths = json.loads(file.filepaths)
for p in pages:
kwargs = pages[p]
kwargs.update(rule_options)
parser = Lattice(**kwargs) if flavor.lower() == 'lattice' else Stream(**kwargs)
t = parser.extract_tables(filepaths[p])
for _t in t:
_t.page = int(p)
tables.extend(t)
tables = TableList(tables)

froot, fext = os.path.splitext(file.filename)
datapath = os.path.dirname(file.filepath)
Expand Down
50 changes: 48 additions & 2 deletions excalibur/utils/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,52 @@
from camelot.utils import get_page_layout, get_text_objects, get_rotation


def get_pages(filename, pages='1', password=''):
    """Convert a pages string into a sorted list of page numbers.

    Parameters
    ----------
    filename : str
        Path to PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers and/or ranges, or 'all'.
        Example: '1,3,4' or '1,4-end'.
    password : str, optional (default: '')
        Password passed to ``decrypt`` when the PDF reports itself as
        encrypted.

    Returns
    -------
    P : list
        Sorted list of unique int page numbers.
    N : int
        Total pages.

    Raises
    ------
    ValueError
        If a token in `pages` is not an integer, 'end' (as the upper
        bound of a range) or a single 'a-b' range.

    """
    # The context manager closes the handle even if reading the PDF raises.
    with open(filename, 'rb') as inputstream:
        infile = PdfFileReader(inputstream, strict=False)
        # Decrypt up front so the page count below is read from the
        # unlocked document. The previous code referenced `self.password`,
        # which does not exist in a module-level function.
        if infile.isEncrypted:
            infile.decrypt(password)
        N = infile.getNumPages()

    pages = pages.strip()
    if pages == 'all':
        spans = [(1, N)]
    else:
        spans = []
        for token in pages.split(','):
            token = token.strip()
            if '-' in token:
                # Tolerate spaces around the dash, e.g. '4 - end'.
                start, end = (part.strip() for part in token.split('-'))
                end = N if end == 'end' else int(end)
                spans.append((int(start), end))
            else:
                spans.append((int(token), int(token)))

    # A set drops pages that appear in more than one range.
    P = set()
    for start, end in spans:
        P.update(range(start, end + 1))
    return sorted(P), N


def save_page(filepath, page_number):
infile = PdfFileReader(open(filepath, 'rb'), strict=False)
page = infile.getPage(page_number - 1)
Expand Down Expand Up @@ -37,11 +83,11 @@ def save_page(filepath, page_number):
outfile.write(f)


def get_file_dimensions(filepath):
def get_file_dim(filepath):
layout, dimensions = get_page_layout(filepath)
return list(dimensions)


def get_image_dimensions(imagepath):
def get_image_dim(imagepath):
image = cv2.imread(imagepath)
return [image.shape[1], image.shape[0]]
4 changes: 2 additions & 2 deletions excalibur/www/static/js/files.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ $(document).ready(function () {
$.each($('#file')[0].files, function (i, file) {
data.append('file-' + i, file);
});
var page_number = $('#page-number').val() ? Number($('#page-number').val()) : 1;
data.append('page_number', page_number);
var pages = $('#pages').val() ? $('#pages').val() : 1;
data.append('pages', pages);
$.ajax({
url: '/files',
type: 'POST',
Expand Down
Loading

0 comments on commit c33c41c

Please sign in to comment.