WIP add processors/archive_webpages module

- fixes #36
nodiscc · Jul 25, 2023 · 2aaa7a9 · 2aaa7a9
1 parent fca5fab
commit 2aaa7a9
Show file tree

Hide file tree

Showing 7 changed files with 113 additions and 6 deletions.
diff --git a/Makefile b/Makefile
@@ -20,10 +20,10 @@ install:
 ##### TESTS #####
 
 .PHONY: test # run tests
-test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html scan_trivy
+test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html test_archive_webpages scan_trivy
 
 .PHONY: test_short # run tests except those that consume github API requests/long URL checks
-test_short: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html
+test_short: test_pylint clean test_import_shaarli test_archive_webpages test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html
 
 .PHONY: test_pylint # run linter (non blocking)
 test_pylint: install
@@ -99,6 +99,11 @@ test_download_audio: install
 	source .venv/bin/activate && \
 	hecat --config tests/.hecat.download_audio.yml
 
+.PHONY: test_archive_webpages # test webpage archiving
+test_archive_webpages: install
+	source .venv/bin/activate && \
+	hecat --log-level DEBUG --config tests/.hecat.archive_webpages.yml
+
 .PHONY: test_export_html_table # test exporting shaarli data to HTML table
 test_export_html_table: test_import_shaarli install
 	mkdir -p tests/html-table

diff --git a/hecat/main.py b/hecat/main.py
@@ -4,7 +4,7 @@
 import logging
 from .utils import load_yaml_data
 from .importers import import_markdown_awesome, import_shaarli_json
-from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls,download_media
+from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls, download_media, archive_webpages
 from .exporters import render_markdown_singlepage, render_html_table
 from .exporters import render_markdown_multipage
 
@@ -42,6 +42,8 @@ def main():
             check_github_last_updated(step)
         elif step['module'] == 'processors/url_check':
             check_urls(step)
+        elif step['module'] == 'processors/archive_webpages':
+            archive_webpages(step)
         elif step['module'] == 'processors/download_media':
             download_media(step)
         elif step['module'] == 'exporters/markdown_singlepage':

diff --git a/hecat/processors/__init__.py b/hecat/processors/__init__.py
@@ -2,4 +2,5 @@
 from .github_metadata import add_github_metadata
 from .awesome_lint import awesome_lint, check_github_last_updated
 from .download_media import download_media
-from .url_check import check_urls
+from .url_check import check_urls
+from .archive_webpages import archive_webpages
diff --git a/hecat/processors/archive_webpages.py b/hecat/processors/archive_webpages.py
@@ -0,0 +1,74 @@
+"""archive webpages
+TODO
+
+# $ cat tests/.hecat.archive_webpages.yml
+steps:
+  - name: archive webpages
+    module: processors/archive_webpages
+    module_options:
+      data_file: tests/shaarli.yml # path to the YAML data file
+      only_tags: ['doc'] # only archive items tagged with at least one of these tags
+      exclude_tags: ['nodl'] # (default []), don't download items tagged with any of these tags
+      output_directory: 'tests/webpages' # path to the output directory for archived webpages
+      skip_already_archived: True # (default True) skip processing when the item already has an 'archive_path' key
+
+# $ hecat --config tests/.hecat.archive_webpages.yml
+
+Data file format (output of import_shaarli module):
+# shaarli.yml
+- id: 1667 # required, unique id
+  url: https://solar.lowtechmagazine.com/2016/10/pigeon-towers-a-low-tech-alternative-to-synthetic-fertilizers
+  tags:
+    - tag1
+    - tag2
+    - diy
+    - doc
+    - readlater
+  ...
+  archive_path: TODO
+
+Source directory structure:
+└── shaarli.yml
+
+Output directory structure:
+└── TODO
+"""
+
+import sys
+import os
+import logging
+import ruamel.yaml
+from ..utils import load_yaml_data
+
+yaml = ruamel.yaml.YAML()
+yaml.indent(sequence=2, offset=0)
+yaml.width = 99999
+
+def wget(item):
+    """archive a webpage with wget"""
+
+
+def archive_webpages(step):
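+    """archive the webpage linked from each item's 'url' when its tags match at least one of step['module_options']['only_tags'] and none of step['module_options']['exclude_tags'],
+    TODO (not implemented yet): write the path to the local archive to a new key 'archive_path' in the original data file for each archived item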
+    """
+    skipped_count = 0
+    items = load_yaml_data(step['module_options']['data_file'])
+    for item in items:
+        # skip already archived items when skip_already_archived: True
+        if (('skip_already_archived' not in step['module_options'].keys() or
+                step['module_options']['skip_already_archived']) and 'archive_path' in item.keys()):
+            logging.debug('skipping %s (id %s): already archived', item['url'], item['id'])
+            skipped_count = skipped_count +1
+        # skip items matching exclude_tags
+        elif ('exclude_tags' in step['module_options'] and any(tag in item['tags'] for tag in step['module_options']['exclude_tags'])):
+            logging.debug('skipping %s (id %s): one or more tags are present in exclude_tags', item['url'], item['id'])
+            skipped_count = skipped_count +1
+        # archive items matching only_tags
+        elif list(set(step['module_options']['only_tags']) & set(item['tags'])):
+            logging.info('archiving %s (id %s)', item['url'], item ['id'])
+            wget(item)
+        else:
+            logging.debug('skipping %s (id %s): no tags matching only_tags', item['url'], item['id'])
+            skipped_count = skipped_count + 1
+    # sys.exit(1)
diff --git a/hecat/processors/download_media.py b/hecat/processors/download_media.py
@@ -112,8 +112,7 @@ def download_media(step):
     for item in items:
         # skip download when skip_when_filename_present = True, and video/audio_filename key already exists
         if (('skip_when_filename_present' not in step['module_options'].keys() or
-                step['module_options']['skip_when_filename_present']) and
-                filename_key in item.keys()):
+                step['module_options']['skip_when_filename_present']) and filename_key in item.keys()):
             logging.debug('skipping %s (id %s): %s already recorded in the data file', item['url'], item['id'], filename_key)
             skipped_count = skipped_count +1
         # skip download when retry_items_with_error = False, and video/audio_download_error key alraedy exists

diff --git a/tests/.hecat.archive_webpages.yml b/tests/.hecat.archive_webpages.yml
@@ -0,0 +1,9 @@
+steps:
+  - name: archive webpages
+    module: processors/archive_webpages
+    module_options:
+      data_file: tests/shaarli.yml
+      only_tags: ['hecat']
+      exclude_tags: ['nodl']
+      output_directory: 'tests/webpages'
+
diff --git a/tests/shaarli-duplicate.json b/tests/shaarli-duplicate.json
@@ -1,4 +1,21 @@
 [
+    {
+        "hecat_comment": "test webpage archiving module",
+        "created": "2023-07-24T12:07:00+02:00",
+        "description": "Sample description",
+        "id": 8351,
+        "private": false,
+        "shorturl": "Y6NgtQ",
+        "tags": [
+            "doc",
+            "admin",
+            "hecat",
+            "dev"
+        ],
+        "title": "Template Designer Documentation — Jinja Documentation (3.2.x)",
+        "updated": "2023-07-24T12:07:00+02:00",
+        "url": "https://jinja.palletsprojects.com/en/latest/templates/"
+    },
     {
         "hecat_comment": "test for nodl tag, should not be downloaded",
         "created": "2022-07-27T22:56:06+02:00",