Skip to content

Commit

Permalink
updated tests to work with upgraded tika and pdfbox
Browse files Browse the repository at this point in the history
  • Loading branch information
shebinleo committed Aug 23, 2019
1 parent 9ff685e commit 405eced
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 23 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
pdf2html helps convert PDF files to HTML or plain text using [Apache Tika](https://tika.apache.org/). This module can also generate a thumbnail image for a PDF file using [Apache PDFBox](https://pdfbox.apache.org/).

### Installation
via yarn:

```
yarn add pdf2html
```

via npm:

```
Expand Down
53 changes: 30 additions & 23 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,38 @@ const should = chai.should()
const pdf2html = require('../index')
const pdfFilepath = __dirname + '/../sample.pdf'
const pdfInvalidFilepath = __dirname + '/../sample2.pdf'
const pdfFileHTML = '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta name="pdf:PDFVersion" content="1.3"/>\n<meta name="X-Parsed-By" content="org.apache.tika.parser.DefaultParser"/>\n<meta name="X-Parsed-By" content="org.apache.tika.parser.pdf.PDFParser"/>\n<meta name="xmp:CreatorTool" content="Rave (http://www.nevrona.com/rave)"/>\n<meta name="access_permission:modify_annotations" content="true"/>\n<meta name="access_permission:can_print_degraded" content="true"/>\n<meta name="meta:creation-date" content="2006-03-01T07:28:26Z"/>\n<meta name="created" content="Wed Mar 01 07:28:26 UTC 2006"/>\n<meta name="access_permission:extract_for_accessibility" content="true"/>\n<meta name="access_permission:assemble_document" content="true"/>\n<meta name="xmpTPg:NPages" content="2"/>\n<meta name="Creation-Date" content="2006-03-01T07:28:26Z"/>\n<meta name="resourceName" content="sample.pdf"/>\n<meta name="dcterms:created" content="2006-03-01T07:28:26Z"/>\n<meta name="dc:format" content="application/pdf; version=1.3"/>\n<meta name="access_permission:extract_content" content="true"/>\n<meta name="access_permission:can_print" content="true"/>\n<meta name="access_permission:fill_in_form" content="true"/>\n<meta name="pdf:encrypted" content="false"/>\n<meta name="producer" content="Nevrona Designs"/>\n<meta name="Content-Length" content="3028"/>\n<meta name="access_permission:can_modify" content="true"/>\n<meta name="Content-Type" content="application/pdf"/>\n<title></title>\n</head>\n<body><div class="page"><p/>\n<p> A Simple PDF File \n This is a small demonstration .pdf file - \n</p>\n<p> just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n</p>\n<p> And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. 
\n</p>\n<p> And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...</p>\n<p/>\n</div>\n<div class="page"><p/>\n<p> Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. </p>\n<p/>\n</div>\n</body></html>'
const pdfFileHTML = '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<meta name="pdf:PDFVersion" content="1.3"/>\n<meta name="xmp:CreatorTool" content="Rave (http://www.nevrona.com/rave)"/>\n<meta name="access_permission:modify_annotations" content="true"/>\n<meta name="access_permission:can_print_degraded" content="true"/>\n<meta name="dcterms:created" content="2006-03-01T07:28:26Z"/>\n<meta name="dc:format" content="application/pdf; version=1.3"/>\n<meta name="pdf:docinfo:creator_tool" content="Rave (http://www.nevrona.com/rave)"/>\n<meta name="access_permission:fill_in_form" content="true"/>\n<meta name="pdf:encrypted" content="false"/>\n<meta name="Content-Length" content="3028"/>\n<meta name="Content-Type" content="application/pdf"/>\n<meta name="X-Parsed-By" content="org.apache.tika.parser.DefaultParser"/>\n<meta name="X-Parsed-By" content="org.apache.tika.parser.pdf.PDFParser"/>\n<meta name="meta:creation-date" content="2006-03-01T07:28:26Z"/>\n<meta name="created" content="2006-03-01T07:28:26Z"/>\n<meta name="access_permission:extract_for_accessibility" content="true"/>\n<meta name="access_permission:assemble_document" content="true"/>\n<meta name="xmpTPg:NPages" content="2"/>\n<meta name="Creation-Date" content="2006-03-01T07:28:26Z"/>\n<meta name="resourceName" content="sample.pdf"/>\n<meta name="access_permission:extract_content" content="true"/>\n<meta name="access_permission:can_print" content="true"/>\n<meta name="producer" content="Nevrona Designs"/>\n<meta name="access_permission:can_modify" content="true"/>\n<meta name="pdf:docinfo:producer" content="Nevrona Designs"/>\n<meta name="pdf:docinfo:created" content="2006-03-01T07:28:26Z"/>\n<title></title>\n</head>\n<body><div class="page"><p/>\n<p> A Simple PDF File \n This is a small demonstration .pdf file - \n</p>\n<p> just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n</p>\n<p> And more text. And more text. 
And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n</p>\n<p> And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...</p>\n<p/>\n</div>\n<div class="page"><p/>\n<p> Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. </p>\n<p/>\n</div>\n</body></html>'
const pdfFileText = '\n A Simple PDF File \n This is a small demonstration .pdf file - \n\n just for use in the Virtual Mechanics tutorials. More text. And more \n text. And more text. And more text. And more text. \n\n And more text. And more text. And more text. And more text. And more \n text. And more text. Boring, zzzzz. And more text. And more text. And \n more text. And more text. And more text. And more text. And more text. \n And more text. And more text. \n\n And more text. And more text. And more text. And more text. And more \n text. And more text. And more text. Even more. Continued on page 2 ...\n\n\n\n Simple PDF File 2 \n ...continued from page 1. Yet more text. And more text. And more text. \n And more text. And more text. And more text. And more text. And more \n text. Oh, how boring typing this stuff. But not as boring as watching \n paint dry. And more text. And more text. And more text. And more text. \n Boring. More, a little more text. The end, and just as well. \n\n\n'
const pdfFileMeta = {
"Content-Length": "3028",
"Content-Type": "application/pdf",
"Creation-Date": "2006-03-01T07:28:26Z",
"X-Parsed-By": ["org.apache.tika.parser.DefaultParser", "org.apache.tika.parser.pdf.PDFParser"],
"access_permission:assemble_document": "true",
"access_permission:can_modify": "true",
"access_permission:can_print": "true",
"access_permission:can_print_degraded": "true",
"access_permission:extract_content": "true",
"access_permission:extract_for_accessibility": "true",
"access_permission:fill_in_form": "true",
"access_permission:modify_annotations": "true",
"created": "Wed Mar 01 07:28:26 UTC 2006",
"dc:format": "application/pdf; version=1.3",
"dcterms:created": "2006-03-01T07:28:26Z",
"meta:creation-date": "2006-03-01T07:28:26Z",
"pdf:PDFVersion": "1.3",
"pdf:encrypted": "false",
"producer": "Nevrona Designs",
"resourceName": "sample.pdf",
"xmp:CreatorTool": "Rave (http://www.nevrona.com/rave)",
"xmpTPg:NPages": "2"
'Content-Length': '3028',
'Content-Type': 'application/pdf',
'Creation-Date': '2006-03-01T07:28:26Z',
'X-Parsed-By':
['org.apache.tika.parser.DefaultParser',
'org.apache.tika.parser.pdf.PDFParser'],
'access_permission:assemble_document': 'true',
'access_permission:can_modify': 'true',
'access_permission:can_print': 'true',
'access_permission:can_print_degraded': 'true',
'access_permission:extract_content': 'true',
'access_permission:extract_for_accessibility': 'true',
'access_permission:fill_in_form': 'true',
'access_permission:modify_annotations': 'true',
created: '2006-03-01T07:28:26Z',
'dc:format': 'application/pdf; version=1.3',
'dcterms:created': '2006-03-01T07:28:26Z',
'meta:creation-date': '2006-03-01T07:28:26Z',
'pdf:PDFVersion': '1.3',
'pdf:charsPerPage': ['569', '367'],
'pdf:docinfo:created': '2006-03-01T07:28:26Z',
'pdf:docinfo:creator_tool': 'Rave (http://www.nevrona.com/rave)',
'pdf:docinfo:producer': 'Nevrona Designs',
'pdf:encrypted': 'false',
'pdf:unmappedUnicodeCharsPerPage': ['0', '0'],
producer: 'Nevrona Designs',
resourceName: 'sample.pdf',
'xmp:CreatorTool': 'Rave (http://www.nevrona.com/rave)',
'xmpTPg:NPages': '2'
}

describe('PDF to HTML', function () {
Expand Down

0 comments on commit 405eced

Please sign in to comment.