Commit
sitemap compatibility improvements + refactoring + rating
7h3Rabbit committed Mar 22, 2024
1 parent 1de6439 commit c6717b5
Showing 4 changed files with 255 additions and 108 deletions.
137 changes: 89 additions & 48 deletions engines/sitemap.py
@@ -1,29 +1,28 @@
# -*- coding: utf-8 -*-
from models import Sites
from engines.utils import use_item
import config
from tests.utils import *
import re
import os
from urllib.parse import urlparse
import gzip
import io
from bs4 import BeautifulSoup
from engines.utils import use_item
from tests.utils import get_content_type, httpRequestGetContent, cache_time_delta
from utils import merge_dicts

def read_sites(input_sitemap_url, input_skip, input_take):
ignore_none_html = True
return read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)
sitemaps = read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)

sites = []
for index, address in enumerate(sitemaps['all']):
sites.append((index, address))

return sites

def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
# TODO, handle this?: <loc><![CDATA[https://melanomforeningen.se/post-sitemap.xml]]></loc>

# TODO: CDATA everything: https://melanomforeningen.se/post-sitemap.xml
# <url>
# <loc><![CDATA[https://melanomforeningen.se/nyheter/]]></loc>
# <lastmod><![CDATA[2024-01-26T11:22:43+00:00]]></lastmod>
# <changefreq><![CDATA[weekly]]></changefreq>
# <priority><![CDATA[0.7]]></priority>
# <image:image>
# <image:loc><![CDATA[https://melanomforeningen.se/wp-content/uploads/newspapers-444447_1280.jpg]]></image:loc>
# </image:image>
# </url>
result = {
'all': [],
input_sitemap_url: []
}

if input_sitemap_url.endswith('.xml.gz'):
# unpack gzip:ed sitemap
@@ -32,36 +31,78 @@ def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
gzip_content = gzip_file.read()
sitemap_content = gzip_content.decode('utf-8', 'ignore')
return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
result = merge_dicts(read_sitemap_xml(
input_sitemap_url,
sitemap_content,
input_skip,
input_take,
ignore_none_html), result, True, False)
else:
sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
# TODO: read sitemap as XML, to avoid parsing problems.
return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
result = merge_dicts(read_sitemap_xml(input_sitemap_url,
sitemap_content,
input_skip,
input_take,
ignore_none_html), result, True, False)

return result

def read_sitemap_xml(key, sitemap_content, input_skip, input_take, ignore_none_html):
result = {
'all': [],
key: []
}

soup = BeautifulSoup(sitemap_content, 'xml')

root_element = None
is_sitemap_index = False
for element in soup.contents:
if element.name is None:
continue
low_name = element.name.lower()
if 'sitemapindex' == low_name:
root_element = element
is_sitemap_index = True
break
elif 'urlset' == low_name:
root_element = element
break

def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
sites = list()
if root_element is None:
return result

# do we have sitemaps in our sitemap?...
is_recursive = '<sitemap>' in sitemap_content
# Get the direct children of the root element
children = [child for child in root_element.children \
if child.name == 'url' or child.name == 'sitemap']

regex = r"<loc>(?P<itemurl>[^<]+)<"
matches = re.finditer(regex, sitemap_content, re.MULTILINE)

nof_children = len(children)
print('\tnof_children =', nof_children)

# https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
current_index = 0
for matchNum, match in enumerate(matches, start=1):
for child in children:
loc_children = [child for child in child.children \
if child.name == 'loc']
if len(loc_children) == 0:
continue

item_url = loc_children[0].text

if not use_item(current_index, input_skip, input_take):
current_index += 1
continue

item_url = match.group('itemurl')
# TODO: validate url encoding ( Example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html )
item_url = item_url.replace(' ', '%20')
if is_sitemap_index:
print('\tsitemap =', item_url)

if is_recursive:
tmp_sites = read_sitemap(item_url, input_skip, input_take, ignore_none_html)
current_index += len(tmp_sites)
sites.extend(tmp_sites)
result = merge_dicts(read_sitemap(
item_url,
input_skip,
input_take,
ignore_none_html), result, True, False)
current_index += len(result['all'])
else:
if ignore_none_html:
item_type = 'html'
@@ -72,35 +113,35 @@ def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
item_type = tmp

if 'html' != item_type and 'htm' != item_type:
print('- skipping because it is of type: {0}'.format(item_type))
# current_index += 1
print(f'- skipping because it is of type: {item_type}')
continue

item_content_type = get_content_type(item_url, cache_time_delta)
print('content-type', item_content_type)
if item_content_type == 401:
print('- skipping because it is of status-code: {0}'.format(item_content_type))
print(f'- skipping because it is of status-code: {item_content_type}')
continue
elif item_content_type != None and 'html' not in item_content_type:
print('- skipping because it is of content-type: {0}'.format(item_content_type))
# current_index += 1
elif item_content_type is not None and 'html' not in item_content_type:
print(f'- skipping because it is of content-type: {item_content_type}')
continue

sites.append([current_index, item_url])
result['all'].append(item_url)
result[key].append(item_url)
current_index += 1
return sites
return result


def add_site(input_filename, url, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml, NO changes will be made")
def add_site(input_filename, _, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml,"
,"NO changes will be made")

sites = read_sites(input_filename, input_skip, input_take)

return sites


def delete_site(input_filename, url, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml, NO changes will be made")
def delete_site(input_filename, _, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml,"
,"NO changes will be made")

sites = read_sites(input_filename, input_skip, input_take)

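Note on the engines/sitemap.py change: parsing moves from a <loc> regex over the raw text to BeautifulSoup with the 'xml' parser, which also copes with the CDATA-wrapped <loc> values mentioned in the TODO, and read_sitemap/read_sitemap_xml now return a dict keyed by sitemap URL (plus an 'all' list) combined via merge_dicts. The snippet below is a minimal, self-contained sketch of that parsing approach, not the repository's exact code; the sample XML and the helper name classify_and_extract are illustrative only.

from bs4 import BeautifulSoup  # the 'xml' parser requires lxml

SITEMAP_INDEX_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc><![CDATA[https://example.com/post-sitemap.xml]]></loc></sitemap>
</sitemapindex>"""

def classify_and_extract(sitemap_content):
    soup = BeautifulSoup(sitemap_content, 'xml')
    # The root element tells us whether this is a sitemap index (more sitemaps
    # to recurse into) or a urlset (actual page URLs).
    root = next((el for el in soup.contents if el.name is not None), None)
    if root is None:
        return False, []
    is_sitemap_index = root.name.lower() == 'sitemapindex'
    # Direct <sitemap>/<url> children, each carrying the address in a <loc> child.
    # The 'xml' parser unwraps CDATA, so <loc><![CDATA[...]]></loc> needs no special casing.
    urls = [entry.find('loc').text.strip()
            for entry in root.find_all(['sitemap', 'url'], recursive=False)
            if entry.find('loc') is not None]
    return is_sitemap_index, urls

print(classify_and_extract(SITEMAP_INDEX_SAMPLE))
# -> (True, ['https://example.com/post-sitemap.xml'])

In the committed code the same classification decides whether read_sitemap recurses into child sitemaps (sitemap index) or filters and collects page URLs (urlset), honoring input_skip/input_take via use_item.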
41 changes: 7 additions & 34 deletions tests/http_validator.py
@@ -17,6 +17,7 @@
from models import Rating
from tests.utils import dns_lookup
from tests.utils import *
from utils import merge_dicts
from tests.sitespeed_base import get_result
import dns.name
import dns.query
@@ -92,7 +93,7 @@ def run_test(_, langCode, url):
hostname = o.hostname

if csp_only:
result_dict = merge_dicts(check_csp(url), csp_only_global_result_dict)
result_dict = merge_dicts(check_csp(url), csp_only_global_result_dict, True, True)
if 'nof_pages' not in result_dict:
result_dict['nof_pages'] = 1
else:
@@ -885,34 +886,6 @@ def cleanup(result_dict):
result_dict[domain][subkey] = sorted(list(set(result_dict[domain][subkey])))
return result_dict

def merge_dicts(dict1, dict2):
if dict1 == None:
return dict2
if dict2 == None:
return dict1

for domain, value in dict2.items():
if domain in dict1:
type_of_value = type(value)
if type_of_value == dict:
for subkey, subvalue in value.items():
if subkey in dict1[domain]:
if type(subvalue) == dict:
merge_dicts(dict1[domain][subkey], dict2[domain][subkey])
elif type(subvalue) == list:
dict1[domain][subkey].extend(subvalue)
dict1[domain][subkey] = sorted(list(set(dict1[domain][subkey])))
else:
dict1[domain][subkey] = dict2[domain][subkey]
elif type_of_value == list:
dict1[domain].extend(value)
dict1[domain] = sorted(list(set(dict1[domain])))
elif type_of_value == int:
dict1[domain] = dict1[domain] + value
else:
dict1[domain] = value
return dict1

def host_source_2_url(host_source):
result = host_source
if '*' in result:
Expand Down Expand Up @@ -1836,7 +1809,7 @@ def check_http_to_https(url):
result_dict[o_domain]['schemes'].append('HTTP-REDIRECT*')
https_url = url.replace('http://', 'https://')
print('HTTPS', o_domain)
result_dict = merge_dicts(get_website_support_from_sitespeed(https_url, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(https_url, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)
else:
result_dict[o_domain]['schemes'].append('HTTPS-REDIRECT*')

@@ -1854,7 +1827,7 @@ def check_http_to_https(url):
result_dict[www_domain_key]['schemes'].append('HTTPS-REDIRECT*')
www_http_url = http_url.replace(o_domain, www_domain_key)
print('HTTP', www_domain_key)
result_dict = merge_dicts(get_website_support_from_sitespeed(www_http_url, www_domain_key, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(www_http_url, www_domain_key, configuration, browser, sitespeed_timeout), result_dict, True, True)
else:
result_dict[www_domain_key]['schemes'].append('HTTP-REDIRECT*')

@@ -2272,21 +2245,21 @@ def check_http_version(url, result_dict):
configuration = ' --firefox.preference network.http.http2.enabled:false --firefox.preference network.http.http3.enable:false'
url2 = change_url_to_test_url(url, 'HTTPv1')
print('HTTP/1.1')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

if not contains_value_for_all(result_dict, 'protocols', 'HTTP/2'):
browser = 'firefox'
configuration = ' --firefox.preference network.http.http2.enabled:true --firefox.preference network.http.http3.enable:false --firefox.preference network.http.version:3.0'
url2 = change_url_to_test_url(url, 'HTTPv2')
print('HTTP/2')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

if not contains_value_for_all(result_dict, 'protocols', 'HTTP/3'):
browser = 'firefox'
configuration = ' --firefox.preference network.http.http2.enabled:false --firefox.preference network.http.http3.enable:true --firefox.preference network.http.version:3.0'
url2 = change_url_to_test_url(url, 'HTTPv3')
print('HTTP/3')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

return result_dict
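
Note on the tests/http_validator.py change: the local merge_dicts that used to live in this module (removed above) is replaced by a shared helper imported from utils, and every call site now passes two extra boolean arguments (True, True here; True, False in engines/sitemap.py). The shared implementation is not part of the visible diff, so the sketch below is only a guess at its shape based on the removed local version; the parameter names and the meaning attached to the two flags are assumptions for illustration, not the project's actual signature.

def merge_dicts(dict1, dict2, sort_unique_lists=True, replace_scalars=True):
    # Hedged reconstruction based on the removed local helper; the two boolean
    # flags and their behaviour are assumed, not taken from the actual utils module.
    if dict1 is None:
        return dict2
    if dict2 is None:
        return dict1

    for key, value in dict2.items():
        if key not in dict1:
            dict1[key] = value
        elif isinstance(value, dict):
            # Merge nested dicts key by key instead of overwriting them.
            merge_dicts(dict1[key], value, sort_unique_lists, replace_scalars)
        elif isinstance(value, list):
            dict1[key].extend(value)
            if sort_unique_lists:
                dict1[key] = sorted(set(dict1[key]))
        elif isinstance(value, int):
            dict1[key] += value
        elif replace_scalars:
            dict1[key] = value
    return dict1

Whatever the real flags mean, the visible effect of this commit is that all merges go through one shared helper instead of a per-module copy.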
