Skip to content

Commit

Permalink
Add --pages-file repeatable option to copy pages files to WACZ
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l committed Mar 11, 2024
1 parent 47b3eef commit 7b2e80d
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 8 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ Overrides the pages index generation with the passed jsonl pages.
wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl
```

### --pages-file

Overrides the pages index generation by copying an existing pages.jsonl and/or extraPages.jsonl directly into the WACZ. Incompatible with the --detect-pages and -p/--pages options.

```
wacz create tests/fixtures/example-collection.warc --pages-file pages/pages.jsonl --pages-file pages/extraPages.jsonl
```

### -t --text

You can add a full text index by including the --text tag.
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# vim: set sw=4 et:
from setuptools import setup, find_packages

__version__ = "0.4.9"
__version__ = "0.4.10"

def load_requirements(filename):
with open(filename, "rt") as fh:
Expand Down
41 changes: 34 additions & 7 deletions wacz/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def main(args=None):
action="store_true",
)

# --pages-file may be repeated (as the README example does) and may also take
# several paths after a single flag. With the default "store" action each
# repeat would replace the previous values, so "extend" is used to accumulate
# every path into one flat list. The attribute stays None when the flag is
# never given, which the later "is not None" checks rely on.
create.add_argument(
    "--pages-file",
    help="Overrides the pages generation by copying files to WACZ without parsing",
    nargs="+",
    action="extend",
)

create.add_argument(
"--hash-type",
choices=["sha256", "md5"],
Expand Down Expand Up @@ -108,11 +114,14 @@ def main(args=None):
if cmd.cmd == "create" and cmd.ts is not None and cmd.url is None:
parser.error("--url must be specified when --ts is passed")

if cmd.cmd == "create" and cmd.detect_pages is not False and cmd.pages is not None:
if cmd.cmd == "create" and cmd.detect_pages is not False and (cmd.pages is not None or cmd.pages_file is not None):
parser.error(
"--pages and --detect-pages can't be set at the same time they cancel each other out."
"--pages/--pages-file and --detect-pages can't be set at the same time they cancel each other out."
)

if cmd.cmd == "create" and cmd.pages is not None and cmd.pages_file is not None:
parser.error("--pages and --pages-file can't be set at same time as they cancel each other out.")

value = cmd.func(cmd)
return value

Expand Down Expand Up @@ -163,6 +172,9 @@ def create_wacz(res):
index_file = zipfile.ZipInfo("indexes/index.idx", now())
index_file.compress_type = zipfile.ZIP_DEFLATED

pages_jsonl = zipfile.ZipInfo("pages/pages.jsonl", now())
extra_pages_jsonl = zipfile.ZipInfo("pages/extraPages.jsonl", now())

index_buff = BytesIO()

text_wrap = TextIOWrapper(index_buff, "utf-8", write_through=True)
Expand All @@ -171,7 +183,22 @@ def create_wacz(res):

passed_pages_dict = {}

# If the flag for passed pages has been passed
# Handle pages
if res.pages_file is not None:
    # Files passed via --pages-file are copied into the archive byte-for-byte,
    # without parsing or validation. Only the two recognised basenames are
    # written; a path with any other basename matches neither branch below
    # and is skipped.
    for page_file in res.pages_file:
        page_file = os.path.abspath(page_file)
        filename = os.path.basename(page_file)

        # NOTE(review): assumes `wacz` is a zipfile.ZipFile opened for writing;
        # ZipFile.open() only accepts mode "r" or "w" (members are always
        # binary), so "wb" would raise ValueError.
        if filename == "pages.jsonl":
            with wacz.open(pages_jsonl, "w") as page_jsonl_file:
                with open(page_file, "rb") as in_fh:
                    shutil.copyfileobj(in_fh, page_jsonl_file)

        if filename == "extraPages.jsonl":
            with wacz.open(extra_pages_jsonl, "w") as extra_page_file:
                with open(page_file, "rb") as in_fh:
                    shutil.copyfileobj(in_fh, extra_page_file)

if res.pages != None:
print("Validating passed pages.jsonl file")
passed_content = []
Expand Down Expand Up @@ -267,7 +294,7 @@ def create_wacz(res):
shutil.copyfileobj(in_fh, out_fh)
path = "logs/{}".format(log_file)

if len(wacz_indexer.pages) > 0 and res.pages == None:
if len(wacz_indexer.pages) > 0 and res.pages == None and res.pages_file is None:
print("Generating page index...")
# generate pages/text
wacz_indexer.write_page_list(
Expand All @@ -281,7 +308,7 @@ def create_wacz(res):
),
)

if len(wacz_indexer.pages) > 0 and res.pages != None:
if len(wacz_indexer.pages) > 0 and res.pages != None and res.pages_file is None:
print("Generating page index from passed pages...")
# Initially set the default value of the header id and title
id_value = "pages"
Expand Down Expand Up @@ -312,7 +339,7 @@ def create_wacz(res):
),
)

if len(wacz_indexer.extra_pages) > 0:
if len(wacz_indexer.extra_pages) > 0 and res.pages_file is None:
wacz_indexer.write_page_list(
wacz,
EXTRA_PAGES_INDEX,
Expand All @@ -324,7 +351,7 @@ def create_wacz(res):
),
)

if len(wacz_indexer.extra_page_lists) > 0:
if len(wacz_indexer.extra_page_lists) > 0 and res.pages_file is None:
print("Generating extra page lists...")

for name, pagelist in wacz_indexer.extra_page_lists.items():
Expand Down

0 comments on commit 7b2e80d

Please sign in to comment.