-
Notifications
You must be signed in to change notification settings - Fork 614
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #411 from jhale1805/non_agpl_epub_extractor
Remove EbookLib dependency
- Loading branch information
Showing 4 changed files with 45 additions and 34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,53 @@ | ||
from ebooklib import epub, ITEM_DOCUMENT | ||
import zipfile | ||
from bs4 import BeautifulSoup | ||
|
||
from .utils import BaseParser | ||
|
||
|
||
class Parser(BaseParser): | ||
"""Extract text from epub using python epub library | ||
""" | ||
"""Extract text from epub""" | ||
|
||
def extract(self, filename, **kwargs): | ||
book = epub.read_epub(filename) | ||
book = zipfile.ZipFile(filename) | ||
result = '' | ||
for id, _ in book.spine: | ||
item = book.get_item_with_id(id) | ||
# Don't fail with some AttributeError exception when the item is of NoneType | ||
# (i.e. at the last position). | ||
if item is None: | ||
for text_name in self.__epub_sections(book): | ||
if not text_name.endswith("html"): | ||
continue | ||
soup = BeautifulSoup(item.content, 'lxml') | ||
for child in soup.find_all( | ||
['title', 'p', 'div', 'h1', 'h2', 'h3', 'h4'] | ||
): | ||
result = result + child.text + '\n' | ||
soup = BeautifulSoup(book.open(text_name), features='lxml') | ||
html_content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4'] | ||
for child in soup.find_all(html_content_tags): | ||
inner_text = child.text.strip() if child.text else "" | ||
if inner_text: | ||
result += inner_text + '\n' | ||
return result | ||
|
||
def __epub_sections(self, book): | ||
opf_paths = self.__get_opf_paths(book) | ||
item_paths = self.__get_item_paths(book, opf_paths) | ||
return item_paths | ||
|
||
def __get_opf_paths(self, book): | ||
meta_inf = book.open("META-INF/container.xml") | ||
meta_soup = BeautifulSoup(meta_inf, features='lxml') | ||
return [f["full-path"] for f in meta_soup.rootfiles.find_all("rootfile")] | ||
|
||
def __get_item_paths(self, book, opf_paths): | ||
item_paths = [] | ||
for opf_path in opf_paths: | ||
opf_soup = BeautifulSoup(book.open(opf_path), "lxml") | ||
epub_items = opf_soup.spine.find_all("itemref") | ||
for epub_item in epub_items: | ||
item = self.__get_item(opf_soup, epub_item["idref"]) | ||
item_paths.append(self.__get_full_item_path(book, item["href"])) | ||
return item_paths | ||
|
||
def __get_item(self, opf_soup, item_id): | ||
for item in opf_soup.manifest.find_all("item"): | ||
if item["id"] == item_id: | ||
return item | ||
return None | ||
|
||
def __get_full_item_path(self, book, partial_path): | ||
for filename in book.namelist(): | ||
if filename.endswith(partial_path): | ||
return filename |