Add a 'processors/archive_webpages' processing step (work in progress).

The step is registered in hecat/main.py and hecat/processors/__init__.py and
gets a 'test_archive_webpages' Makefile target, a test configuration and a
sample item tagged 'hecat'. The wget() helper is still a placeholder: matching
items are selected, logged and counted, but nothing is downloaded yet.

diff --git a/Makefile b/Makefile
index e2a93d0..060d574 100644
--- a/Makefile
+++ b/Makefile
@@ -20,10 +20,10 @@ install:
 ##### TESTS #####
 
 .PHONY: test # run tests
-test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html scan_trivy
+test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html test_archive_webpages scan_trivy
 
 .PHONY: test_short # run tests except those that consume github API requests/long URL checks
-test_short: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html
+test_short: test_pylint clean test_import_shaarli test_archive_webpages test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html
 
 .PHONY: test_pylint # run linter (non blocking)
 test_pylint: install
@@ -99,6 +99,11 @@ test_download_audio: install
 	source .venv/bin/activate && \
 	hecat --config tests/.hecat.download_audio.yml
 
+.PHONY: test_archive_webpages # test webpage archiving
+test_archive_webpages: install
+	source .venv/bin/activate && \
+	hecat --log-level DEBUG --config tests/.hecat.archive_webpages.yml
+
 .PHONY: test_export_html_table # test exporting shaarli data to HTML table
 test_export_html_table: test_import_shaarli install
 	mkdir -p tests/html-table
diff --git a/hecat/main.py b/hecat/main.py
index 2d97e13..cd3f0c0 100644
--- a/hecat/main.py
+++ b/hecat/main.py
@@ -4,7 +4,7 @@ import logging
 
 from .utils import load_yaml_data
 from .importers import import_markdown_awesome, import_shaarli_json
-from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls,download_media
+from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls, download_media, archive_webpages
 from .exporters import render_markdown_singlepage, render_html_table
 from .exporters import render_markdown_multipage
 
@@ -42,6 +42,8 @@ def main():
             check_github_last_updated(step)
         elif step['module'] == 'processors/url_check':
             check_urls(step)
+        elif step['module'] == 'processors/archive_webpages':
+            archive_webpages(step)
         elif step['module'] == 'processors/download_media':
             download_media(step)
         elif step['module'] == 'exporters/markdown_singlepage':
diff --git a/hecat/processors/__init__.py b/hecat/processors/__init__.py
index 498cd64..2f60d48 100644
--- a/hecat/processors/__init__.py
+++ b/hecat/processors/__init__.py
@@ -2,4 +2,5 @@
 from .github_metadata import add_github_metadata
 from .awesome_lint import awesome_lint, check_github_last_updated
 from .download_media import download_media
-from .url_check import check_urls
\ No newline at end of file
+from .url_check import check_urls
+from .archive_webpages import archive_webpages
diff --git a/hecat/processors/archive_webpages.py b/hecat/processors/archive_webpages.py
new file mode 100644
index 0000000..4bec1ba
--- /dev/null
+++ b/hecat/processors/archive_webpages.py
@@ -0,0 +1,84 @@
+"""archive webpages
+TODO
+
+# $ cat tests/.hecat.archive_webpages.yml
+steps:
+  - name: archive webpages
+    module: processors/archive_webpages
+    module_options:
+      data_file: tests/shaarli.yml # path to the YAML data file
+      only_tags: ['doc'] # only archive items tagged with at least one of these tags
+      exclude_tags: ['nodl'] # (default []), don't archive items tagged with any of these tags
+      output_directory: 'tests/webpages' # path to the output directory for archived webpages
+      skip_already_archived: True # (default True) skip processing when item already has an 'archive_path': key
+
+# $ hecat --config tests/.hecat.archive_webpages.yml
+
+Data file format (output of import_shaarli module):
+# shaarli.yml
+- id: 1667 # required, unique id
+  url: https://solar.lowtechmagazine.com/2016/10/pigeon-towers-a-low-tech-alternative-to-synthetic-fertilizers
+  tags:
+    - tag1
+    - tag2
+    - diy
+    - doc
+    - readlater
+  ...
+  archive_path: TODO
+
+Source directory structure:
+└── shaarli.yml
+
+Output directory structure:
+└── TODO
+"""
+
+import sys
+import os
+import logging
+import ruamel.yaml
+from ..utils import load_yaml_data
+
+yaml = ruamel.yaml.YAML()
+yaml.indent(sequence=2, offset=0)
+yaml.width = 99999
+
+def wget(item):
+    """archive a webpage with wget
+    TODO: not implemented yet, this placeholder does nothing and returns None
+    """
+
+
+def archive_webpages(step):
+    """archive webpages linked from each item's 'url', if their tags match at least one of
+    step['module_options']['only_tags'] and none of step['module_options']['exclude_tags'].
+    TODO: write path to local archive to a new key 'archive_path' in the original data file for each downloaded item
+    """
+    options = step['module_options']
+    only_tags = set(options['only_tags'])
+    exclude_tags = set(options.get('exclude_tags') or [])
+    skip_already_archived = options.get('skip_already_archived', True)
+    archived_count = 0
+    skipped_count = 0
+    items = load_yaml_data(options['data_file'])
+    for item in items:
+        # items without a 'tags' key (or with an empty one) simply match no tag
+        item_tags = set(item.get('tags') or [])
+        # skip already archived items when skip_already_archived: True
+        if skip_already_archived and 'archive_path' in item:
+            logging.debug('skipping %s (id %s): already archived', item['url'], item['id'])
+            skipped_count += 1
+        # skip items matching exclude_tags
+        elif item_tags & exclude_tags:
+            logging.debug('skipping %s (id %s): one or more tags are present in exclude_tags', item['url'], item['id'])
+            skipped_count += 1
+        # archive items matching only_tags
+        elif item_tags & only_tags:
+            logging.info('archiving %s (id %s)', item['url'], item['id'])
+            wget(item)
+            archived_count += 1
+        else:
+            logging.debug('skipping %s (id %s): no tags matching only_tags', item['url'], item['id'])
+            skipped_count += 1
+    logging.info('processing complete. Archived: %s - Skipped: %s', archived_count, skipped_count)
diff --git a/hecat/processors/download_media.py b/hecat/processors/download_media.py
index 9f38ea4..f657233 100644
--- a/hecat/processors/download_media.py
+++ b/hecat/processors/download_media.py
@@ -112,8 +112,7 @@ def download_media(step):
     for item in items:
         # skip download when skip_when_filename_present = True, and video/audio_filename key already exists
         if (('skip_when_filename_present' not in step['module_options'].keys() or
-            step['module_options']['skip_when_filename_present']) and
-            filename_key in item.keys()):
+            step['module_options']['skip_when_filename_present']) and filename_key in item.keys()):
             logging.debug('skipping %s (id %s): %s already recorded in the data file', item['url'], item['id'], filename_key)
             skipped_count = skipped_count +1
         # skip download when retry_items_with_error = False, and video/audio_download_error key alraedy exists
diff --git a/tests/.hecat.archive_webpages.yml b/tests/.hecat.archive_webpages.yml
new file mode 100644
index 0000000..908c6b4
--- /dev/null
+++ b/tests/.hecat.archive_webpages.yml
@@ -0,0 +1,9 @@
+steps:
+  - name: archive webpages
+    module: processors/archive_webpages
+    module_options:
+      data_file: tests/shaarli.yml
+      only_tags: ['hecat']
+      exclude_tags: ['nodl']
+      output_directory: 'tests/webpages'
+
diff --git a/tests/shaarli-duplicate.json b/tests/shaarli-duplicate.json
index 08d5483..ee36042 100644
--- a/tests/shaarli-duplicate.json
+++ b/tests/shaarli-duplicate.json
@@ -1,4 +1,21 @@
 [
+    {
+        "hecat_comment": "test webpage archiving module",
+        "created": "2023-07-24T12:07:00+02:00",
+        "description": "Sample description",
+        "id": 8351,
+        "private": false,
+        "shorturl": "Y6NgtQ",
+        "tags": [
+            "doc",
+            "admin",
+            "hecat",
+            "dev"
+        ],
+        "title": "Template Designer Documentation — Jinja Documentation (3.2.x)",
+        "updated": "2023-07-24T12:07:00+02:00",
+        "url": "https://jinja.palletsprojects.com/en/latest/templates/"
+    },
     {
         "hecat_comment": "test for nodl tag, should not be downloaded",
         "created": "2022-07-27T22:56:06+02:00",