Skip to content

Commit

Permalink
WIP add processors/archive_webpages module
Browse files Browse the repository at this point in the history
- fixes #36
  • Loading branch information
nodiscc committed Jul 25, 2023
1 parent fca5fab commit 2aaa7a9
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 6 deletions.
9 changes: 7 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ install:
##### TESTS #####

.PHONY: test # run tests
test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html scan_trivy
test: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_import_awesome_selfhosted test_process_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html test_archive_webpages scan_trivy

.PHONY: test_short # run tests except those that consume github API requests/long URL checks
test_short: test_pylint clean test_import_shaarli test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html
test_short: test_pylint clean test_import_shaarli test_archive_webpages test_download_video test_download_audio test_export_html_table clone_awesome_selfhosted test_awesome_lint test_export_awesome_selfhosted_md test_export_awesome_selfhosted_html

.PHONY: test_pylint # run linter (non blocking)
test_pylint: install
Expand Down Expand Up @@ -99,6 +99,11 @@ test_download_audio: install
source .venv/bin/activate && \
hecat --config tests/.hecat.download_audio.yml

.PHONY: test_archive_webpages # test webpage archiving
# runs hecat from the .venv virtualenv against the archive_webpages test configuration;
# --log-level DEBUG makes the per-item "skipping ..." messages of the module visible
test_archive_webpages: install
source .venv/bin/activate && \
hecat --log-level DEBUG --config tests/.hecat.archive_webpages.yml

.PHONY: test_export_html_table # test exporting shaarli data to HTML table
test_export_html_table: test_import_shaarli install
mkdir -p tests/html-table
Expand Down
4 changes: 3 additions & 1 deletion hecat/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
from .utils import load_yaml_data
from .importers import import_markdown_awesome, import_shaarli_json
from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls,download_media
from .processors import add_github_metadata, awesome_lint, check_github_last_updated, check_urls, download_media, archive_webpages
from .exporters import render_markdown_singlepage, render_html_table
from .exporters import render_markdown_multipage

Expand Down Expand Up @@ -42,6 +42,8 @@ def main():
check_github_last_updated(step)
elif step['module'] == 'processors/url_check':
check_urls(step)
elif step['module'] == 'processors/archive_webpages':
archive_webpages(step)
elif step['module'] == 'processors/download_media':
download_media(step)
elif step['module'] == 'exporters/markdown_singlepage':
Expand Down
3 changes: 2 additions & 1 deletion hecat/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
from .github_metadata import add_github_metadata
from .awesome_lint import awesome_lint, check_github_last_updated
from .download_media import download_media
from .url_check import check_urls
from .url_check import check_urls
from .archive_webpages import archive_webpages
74 changes: 74 additions & 0 deletions hecat/processors/archive_webpages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""archive webpages
TODO
# $ cat tests/.hecat.archive_webpages.yml
steps:
- name: archive webpages
module: processors/archive_webpages
module_options:
data_file: tests/shaarli.yml # path to the YAML data file
only_tags: ['doc'] # only archive items tagged with at least one of these tags
exclude_tags: ['nodl'] # (default []), don't archive items tagged with any of these tags
output_directory: 'tests/webpages' # path to the output directory for archived webpages
skip_already_archived: True # (default True) skip processing when item already has an 'archive_path' key
# $ hecat --config tests/.hecat.archive_webpages.yml
Data file format (output of import_shaarli module):
# shaarli.yml
- id: 1667 # required, unique id
url: https://solar.lowtechmagazine.com/2016/10/pigeon-towers-a-low-tech-alternative-to-synthetic-fertilizers
tags:
- tag1
- tag2
- diy
- doc
- readlater
...
archive_path: TODO
Source directory structure:
└── shaarli.yml
Output directory structure:
└── TODO
"""

import sys
import os
import logging
import ruamel.yaml
from ..utils import load_yaml_data

# module-level ruamel.yaml instance; nothing in the code visible here uses it yet
# (presumably intended for writing the data file back once archiving is implemented - TODO confirm)
yaml = ruamel.yaml.YAML()
# indentation settings for sequences: indent of 2, dash offset of 0
yaml.indent(sequence=2, offset=0)
# very large line width, presumably so that long values such as URLs are never wrapped on output - TODO confirm
yaml.width = 99999

def wget(item):
    """Archive the webpage of a single data file item with wget.

    Placeholder: the function has no body yet, so nothing is downloaded
    and None is returned.

    Args:
        item: the data file entry to archive.
    """


def archive_webpages(step):
    """Archive the webpage linked from each selected item of the data file.

    Items are loaded from ``step['module_options']['data_file']``; each item is
    handled by the first rule that applies:

    1. skipped when it already has an ``archive_path`` key, unless the
       ``skip_already_archived`` option is set to a false value (default: True);
    2. skipped when any of its tags is listed in ``exclude_tags`` (default: []);
    3. passed to ``wget()`` when at least one of its tags is listed in ``only_tags``;
    4. skipped otherwise.

    Items without a ``tags`` key are treated as untagged. The number of skipped
    items is logged once all items have been processed.

    NOTE: nothing is written back to the data file by this function; recording
    an ``archive_path`` for archived items is not implemented here.

    Args:
        step: step configuration whose ``module_options`` mapping provides
            ``data_file`` and ``only_tags`` (required), and optionally
            ``exclude_tags`` and ``skip_already_archived``.

    Raises:
        KeyError: if ``module_options``, ``data_file`` or ``only_tags`` is missing.
    """
    options = step['module_options']
    items = load_yaml_data(options['data_file'])
    # the options do not change between items: read them once, before the loop
    skip_already_archived = options.get('skip_already_archived', True)
    exclude_tags = options.get('exclude_tags') or []
    only_tags = set(options['only_tags'])
    skipped_count = 0
    for item in items:
        # a missing or empty 'tags' key means the item has no tags
        item_tags = item.get('tags') or []
        # skip already archived items when skip_already_archived: True
        if skip_already_archived and 'archive_path' in item:
            logging.debug('skipping %s (id %s): already archived', item['url'], item['id'])
            skipped_count += 1
        # skip items matching exclude_tags
        elif any(tag in item_tags for tag in exclude_tags):
            logging.debug('skipping %s (id %s): one or more tags are present in exclude_tags', item['url'], item['id'])
            skipped_count += 1
        # archive items having at least one tag in common with only_tags
        elif not only_tags.isdisjoint(item_tags):
            logging.info('archiving %s (id %s)', item['url'], item['id'])
            wget(item)
        else:
            logging.debug('skipping %s (id %s): no tags matching only_tags', item['url'], item['id'])
            skipped_count += 1
    logging.info('webpage archiving complete. Skipped: %s', skipped_count)
3 changes: 1 addition & 2 deletions hecat/processors/download_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@ def download_media(step):
for item in items:
# skip download when skip_when_filename_present = True, and video/audio_filename key already exists
if (('skip_when_filename_present' not in step['module_options'].keys() or
step['module_options']['skip_when_filename_present']) and
filename_key in item.keys()):
step['module_options']['skip_when_filename_present']) and filename_key in item.keys()):
logging.debug('skipping %s (id %s): %s already recorded in the data file', item['url'], item['id'], filename_key)
skipped_count = skipped_count +1
# skip download when retry_items_with_error = False, and video/audio_download_error key already exists
Expand Down
9 changes: 9 additions & 0 deletions tests/.hecat.archive_webpages.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
steps:
- name: archive webpages
module: processors/archive_webpages
module_options:
data_file: tests/shaarli.yml
only_tags: ['hecat']
exclude_tags: ['nodl']
output_directory: 'tests/webpages'

17 changes: 17 additions & 0 deletions tests/shaarli-duplicate.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,21 @@
[
{
"hecat_comment": "test webpage archiving module",
"created": "2023-07-24T12:07:00+02:00",
"description": "Sample description",
"id": 8351,
"private": false,
"shorturl": "Y6NgtQ",
"tags": [
"doc",
"admin",
"hecat",
"dev"
],
"title": "Template Designer Documentation — Jinja Documentation (3.2.x)",
"updated": "2023-07-24T12:07:00+02:00",
"url": "https://jinja.palletsprojects.com/en/latest/templates/"
},
{
"hecat_comment": "test for nodl tag, should not be downloaded",
"created": "2022-07-27T22:56:06+02:00",
Expand Down

0 comments on commit 2aaa7a9

Please sign in to comment.