From 4d429d16723de5d0401fa1cf951540969ad9b9a5 Mon Sep 17 00:00:00 2001 From: 7h3Rabbit <62792609+7h3Rabbit@users.noreply.github.com> Date: Wed, 8 Jan 2025 18:26:26 +0100 Subject: [PATCH] Added support for all webperf.se categories as -i sources use for example: python default.py -i help.webprf to see all available categories. --- engines/webperf.py | 70 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/engines/webperf.py b/engines/webperf.py index 76530daf..e0a368d5 100644 --- a/engines/webperf.py +++ b/engines/webperf.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import json import re from engines.utils import use_item from tests.utils import get_http_content @@ -24,22 +25,63 @@ def read_sites(input_url, input_skip, input_take): list: The list of sites read from the specified category on https://webperf.se. """ sites = [] - - if 'offentlig-sektor' in input_url: - input_url = 'https://webperf.se/category/ovrig-offentlig-sektor/' - elif 'kommuner' in input_url: - input_url = 'https://webperf.se/category/kommuner/' - elif 'regioner' in input_url: - input_url = 'https://webperf.se/category/regioner/' - elif 'toplist' in input_url: - input_url = 'https://webperf.se/toplist/' - elif 'digitalt' in input_url: - input_url = 'https://webperf.se/category/digitalt-sverige/' - elif 'webbyraer' in input_url: - input_url = 'https://webperf.se/category/webbyraer/' + all_categories_url = 'https://webperf.se/sites/' + categories_fallback = { + 'offentlig-sektor': '/category/ovrig-offentlig-sektor/', + 'kommuner': '/category/kommuner/', + 'regioner': '/category/regioner/', + 'toplist': '/toplist/', + 'digitalt': '/category/digitalt-sverige/', + 'webbyraer': '/category/webbyraer/' + } + + all_categories_content = get_http_content(all_categories_url) + if all_categories_content != '': + categories = {} + categories_regex = r"Kategori<\/th>.*?(?P.*?)<\/tbody>" + categories_matches = re.finditer( + categories_regex, all_categories_content, re.MULTILINE | re.S) + for _, match in enumerate(categories_matches, start=1): + all_categories_subcontent = match.group('categories') + # \/category\/(?P[^\"]+)/)\"> + category_regex = r"\/category\/(?P[^\"]+)/)\">" + category_matches = re.finditer( + category_regex, all_categories_subcontent, re.MULTILINE | re.S) + for _, match in enumerate(category_matches, start=1): + category_url = match.group('url') + category_name = match.group('name') + categories[category_name] = category_url + else: + categories = categories_fallback + + found = False + for category_name, category_url in categories.items(): + if category_name in input_url: + input_url = category_url + found = True + + if not found: + for category_name, category_url in categories_fallback.items(): + if category_name in input_url: + input_url = category_url + found = True + + if found: + input_url = f'https://webperf.se{input_url}' else: - raise NotImplementedError('input is incorrect') + print('Error: No valid webperf option') + print('') + print('Available webperf.se input values:') + for category_name, category_url in categories.items(): + print(f'-i {category_name}.webprf') + return sites + + sites.extend(get_category_sites(input_url, input_skip, input_take)) + return sites +def get_category_sites(input_url, input_skip, input_take): + print(f'Retrieving sites from {input_url}') + sites = [] category_content = get_http_content(input_url) category_regex = r"\/site\/[^\"]+)\""