Skip to content

Commit

Permalink
Switch to using -c/--copy-pages flag to affect pages
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l committed Apr 11, 2024
1 parent 9f4a480 commit 68f17ea
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 81 deletions.
14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,20 @@ Overrides the pages index generation with the passed jsonl pages.
wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl
```

### --pages-file
### -e --extra-pages

Overrides the pages index generation by copying existing pages.jsonl and/or extraPages.jsonl directly into the WACZ. Incompatible with --detect-pages and -p/--pages options.
Overrides the extra pages index generation with the passed extra jsonl pages.

```
wacz create tests/fixtures/example-collection.warc --pages-file pages/pages.jsonl --pages-file pages/extraPages.jsonl
wacz create tests/fixtures/example-collection.warc -p passed_pages.jsonl -e extra_pages.jsonl
```

### -c --copy-pages

Overrides the behavior of --pages and --extra-pages options to copy existing pages.jsonl and/or extraPages.jsonl as-is directly into the WACZ rather than parsing their contents.

```
wacz create tests/fixtures/example-collection.warc --pages pages/pages.jsonl --extra-pages pages/extraPages.jsonl --copy-pages
```

### -t --text
Expand Down
9 changes: 1 addition & 8 deletions tests/test_create_wacz.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import unittest, os, zipfile, sys, gzip, json, tempfile
from wacz.main import main, now
from unittest.mock import patch
from wacz.util import hash_stream
from wacz.util import hash_file
from frictionless import validate, Report

TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures")


def hash_file(type_, filename):
    """Hash the file at ``filename`` and return only the hash value.

    The file is opened in binary mode and handed to ``hash_stream`` together
    with ``type_``. ``hash_stream`` reports a (size, hash) pair; the size is
    discarded here.
    """
    with open(filename, "rb") as stream:
        _size, digest = hash_stream(type_, stream)
        return digest


class TestWaczFormat(unittest.TestCase):
def find_resource(self, resource_list, filename):
for file in resource_list:
Expand Down
153 changes: 153 additions & 0 deletions tests/test_optional_flags_wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import os
import zipfile, json, gzip
from wacz.main import main, now
from wacz.util import hash_file
from unittest.mock import patch
import jsonlines

TEST_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures")
PAGES_DIR = os.path.join(TEST_DIR, "pages")


class TestWaczFormat(unittest.TestCase):
Expand Down Expand Up @@ -35,6 +37,95 @@ def test_warc_with_invalid_passed_pages(self):
0,
)

def test_invalid_passed_pages_copy_pages(self):
    """Passing an invalid pages file via --pages together with --copy-pages must make create return 1"""
    # (output WACZ name, invalid pages fixture) pairs, checked in this order.
    cases = (
        ("example-collection-invalid-copy-pages.wacz", "invalid.jsonl"),
        ("example-collection-invalid-copy-pages-txt.wacz", "invalid.txt"),
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        for wacz_name, pages_name in cases:
            args = [
                "create",
                "-f",
                os.path.join(TEST_DIR, "example-collection.warc"),
                "-o",
                os.path.join(tmpdir, wacz_name),
                "-p",
                os.path.join(PAGES_DIR, pages_name),
                "--copy-pages",
            ]
            exit_code = main(args)
            self.assertEqual(exit_code, 1)

def test_invalid_passed_extra_pages_copy_pages(self):
    """If a user passes an invalid extraPages.jsonl file using -e --copy-pages we still create WACZ without extra pages"""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Name the archive and extraction paths once so every step below
        # is guaranteed to refer to the same locations.
        wacz_path = os.path.join(
            tmpdir, "example-collection-invalid-copy-extra-pages.wacz"
        )
        unzipped_dir = os.path.join(tmpdir, "wacz_no_extra_pages")

        # Creation is expected to succeed even though the -e file is invalid.
        self.assertEqual(
            main(
                [
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    wacz_path,
                    "-p",
                    os.path.join(PAGES_DIR, "pages.jsonl"),
                    "-e",
                    os.path.join(PAGES_DIR, "invalid.txt"),
                    "--copy-pages",
                ]
            ),
            0,
        )

        # The context manager closes the archive; no explicit close() needed.
        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(unzipped_dir)

        self.assertEqual(main(["validate", "-f", wacz_path]), 0)

        # assertNotIn reports the actual directory listing on failure.
        self.assertNotIn(
            "extraPages.jsonl",
            os.listdir(os.path.join(unzipped_dir, "pages")),
        )

@patch("wacz.main.now")
def test_warc_with_pages_flag(self, mock_now):
"""When passing the pages flag with a valid pages.jsonl file a pages/pages.jsonl file should be created"""
Expand Down Expand Up @@ -95,6 +186,68 @@ def test_warc_with_pages_flag(self, mock_now):
self.assertTrue("url" in obj.keys())
self.assertTrue(obj["url"].encode() in cdx_content)

@patch("wacz.main.now")
def test_warc_with_copy_pages(self, mock_now):
    """When passing the pages and extra-pages flags with copy-pages, the files should end up in the WACZ exactly as-is"""
    mock_now.return_value = (2020, 10, 7, 22, 29, 10)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Single source of truth for the archive path: the create, unzip and
        # validate steps must all point at the same ".wacz" file.
        wacz_path = os.path.join(tmpdir, "example-collection-copy-pages.wacz")
        unzipped_pages_dir = os.path.join(tmpdir, "unzipped_copy_pages", "pages")

        source_pages = os.path.join(PAGES_DIR, "pages.jsonl")
        source_extra_pages = os.path.join(PAGES_DIR, "extraPages.jsonl")

        self.assertEqual(
            main(
                [
                    "create",
                    "-f",
                    os.path.join(TEST_DIR, "example-collection.warc"),
                    "-o",
                    wacz_path,
                    "-p",
                    source_pages,
                    "-e",
                    source_extra_pages,
                    "--copy-pages",
                ]
            ),
            0,
        )

        # The context manager closes the archive; no explicit close() needed.
        with zipfile.ZipFile(wacz_path, "r") as zip_ref:
            zip_ref.extractall(os.path.join(tmpdir, "unzipped_copy_pages"))

        self.assertEqual(main(["validate", "-f", wacz_path]), 0)

        wacz_pages = os.path.join(unzipped_pages_dir, "pages.jsonl")
        wacz_extra_pages = os.path.join(unzipped_pages_dir, "extraPages.jsonl")

        copied_files = os.listdir(unzipped_pages_dir)
        self.assertIn("pages.jsonl", copied_files)
        self.assertIn("extraPages.jsonl", copied_files)

        # "As-is" means byte-identical: compare the digest of each copied
        # file with the digest of the fixture it was copied from.
        # NOTE(review): assumes wacz.util.hash_file takes (hash type, filename)
        # and accepts "sha256" as the hash type - confirm against wacz.util.
        self.assertEqual(
            hash_file("sha256", wacz_pages),
            hash_file("sha256", source_pages),
        )
        self.assertEqual(
            hash_file("sha256", wacz_extra_pages),
            hash_file("sha256", source_extra_pages),
        )

@patch("wacz.main.now")
def test_warc_with_detect_pages_flag(self, mock_now):
"""When passing the text index flag pages/pages.jsonl should be generated."""
Expand Down
Loading

0 comments on commit 68f17ea

Please sign in to comment.