forked from nisaacson/pdf-text-extract
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
71 lines (64 loc) · 1.55 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
var path = require('path')
var spawn = require('child_process').spawn
module.exports = function pdfTextExtract(filePath, options, cb) {
if (typeof(options) === 'function') {
cb = options
options = {}
}
filePath = path.resolve(filePath)
var defaultArgs = [
'-layout',
'-enc',
'UTF-8',
filePath,
'-'
];
var args = defaultArgs;
if (options.args) {
args = options.args;
};
streamResults(args, options, splitPages)
function splitPages(err, content) {
if (err) {
return cb(err)
}
var pages = content.split(/\f/)
if (!pages) {
return cb({
message: 'pdf-text-extract failed',
error: 'no text returned from the pdftotext command',
filePath: filePath,
stack: new Error().stack
})
}
// sometimes there can be an extract blank page on the end
var lastPage = pages[pages.length - 1]
if (!lastPage) {
pages.pop()
}
cb(null, pages)
}
}
/**
 * Spawn `pdftotext` with the given arguments and collect everything it
 * writes to stdout.
 *
 * @param {string[]} args - command-line arguments passed to `pdftotext`
 * @param {Object} options - forwarded unchanged to `child_process.spawn`
 * @param {function(?Error, string=)} cb - called exactly once: with an Error
 *   when the process cannot be spawned or finishes with a non-zero exit code
 *   (the message then includes the captured stderr), otherwise with `null`
 *   and the captured stdout text
 */
function streamResults(args, options, cb) {
  var output = ''
  var stderr = ''
  var command = 'pdftotext'
  // both 'error' and 'close' can fire for one child; report only the first
  var finished = false
  var child = spawn(command, args, options)
  child.stdout.setEncoding('utf8')
  child.stderr.setEncoding('utf8')
  child.stdout.on('data', stdoutHandler)
  child.stderr.on('data', stderrHandler)
  // without an 'error' listener a failed spawn (e.g. the binary is not
  // installed) is thrown as an uncaught exception instead of reaching cb
  child.on('error', errorHandler)
  // 'close' fires only after the stdio streams have ended, so all output
  // has been received; 'exit' can fire while data is still buffered
  child.on('close', closeHandler)

  function stdoutHandler(data) {
    output += data
  }

  function stderrHandler(data) {
    stderr += data
  }

  function errorHandler(err) {
    if (finished) {
      return
    }
    finished = true
    cb(err)
  }

  function closeHandler(code) {
    if (finished) {
      return
    }
    finished = true
    if (code !== 0) {
      return cb(new Error('pdf-text-extract command failed: ' + stderr))
    }
    cb(null, output)
  }
}