Commit
sitemap compatibility improvements + refactoring + rating
7h3Rabbit committed Mar 22, 2024
1 parent 1de6439 commit c6717b5
Showing 4 changed files with 255 additions and 108 deletions.
137 changes: 89 additions & 48 deletions engines/sitemap.py
@@ -1,29 +1,28 @@
# -*- coding: utf-8 -*-
from models import Sites
from engines.utils import use_item
import config
from tests.utils import *
import re
import os
from urllib.parse import urlparse
import gzip
import io
from bs4 import BeautifulSoup
from engines.utils import use_item
from tests.utils import get_content_type, httpRequestGetContent, cache_time_delta
from utils import merge_dicts

def read_sites(input_sitemap_url, input_skip, input_take):
ignore_none_html = True
return read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)
sitemaps = read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html)

sites = []
for index, address in enumerate(sitemaps['all']):
sites.append((index, address))

return sites

def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
# TODO, handle this?: <loc><![CDATA[https://melanomforeningen.se/post-sitemap.xml]]></loc>

# TODO: CDATA everything: https://melanomforeningen.se/post-sitemap.xml
# <url>
# <loc><![CDATA[https://melanomforeningen.se/nyheter/]]></loc>
# <lastmod><![CDATA[2024-01-26T11:22:43+00:00]]></lastmod>
# <changefreq><![CDATA[weekly]]></changefreq>
# <priority><![CDATA[0.7]]></priority>
# <image:image>
# <image:loc><![CDATA[https://melanomforeningen.se/wp-content/uploads/newspapers-444447_1280.jpg]]></image:loc>
# </image:image>
# </url>
result = {
'all': [],
input_sitemap_url: []
}

if input_sitemap_url.endswith('.xml.gz'):
# unpack gzip:ed sitemap
@@ -32,36 +31,78 @@ def read_sitemap(input_sitemap_url, input_skip, input_take, ignore_none_html):
with gzip.GzipFile(fileobj=gzip_io, mode='rb') as gzip_file:
gzip_content = gzip_file.read()
sitemap_content = gzip_content.decode('utf-8', 'ignore')
return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
result = merge_dicts(read_sitemap_xml(
input_sitemap_url,
sitemap_content,
input_skip,
input_take,
ignore_none_html), result, True, False)
else:
sitemap_content = httpRequestGetContent(input_sitemap_url, True, True)
# TODO: read sitemap as XML, to avoid parsing problems.
return read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html)
result = merge_dicts(read_sitemap_xml(input_sitemap_url,
sitemap_content,
input_skip,
input_take,
ignore_none_html), result, True, False)

return result

def read_sitemap_xml(key, sitemap_content, input_skip, input_take, ignore_none_html):
result = {
'all': [],
key: []
}

soup = BeautifulSoup(sitemap_content, 'xml')

root_element = None
is_sitemap_index = False
for element in soup.contents:
if element.name is None:
continue
low_name = element.name.lower()
if 'sitemapindex' == low_name:
root_element = element
is_sitemap_index = True
break
elif 'urlset' == low_name:
root_element = element
break

def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
sites = list()
if root_element is None:
return result

# do we have sitemaps in our sitemap?...
is_recursive = '<sitemap>' in sitemap_content
# Get the direct children of the root element
children = [child for child in root_element.children \
if child.name == 'url' or child.name == 'sitemap']

regex = r"<loc>(?P<itemurl>[^<]+)<"
matches = re.finditer(regex, sitemap_content, re.MULTILINE)

nof_children = len(children)
print('\tnof_children =', nof_children)

# https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
current_index = 0
for matchNum, match in enumerate(matches, start=1):
for child in children:
loc_children = [child for child in child.children \
if child.name == 'loc']
if len(loc_children) == 0:
continue

item_url = loc_children[0].text

if not use_item(current_index, input_skip, input_take):
current_index += 1
continue

item_url = match.group('itemurl')
# TODO: validate url encoding ( Example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html )
item_url = item_url.replace(' ', '%20')
if is_sitemap_index:
print('\tsitemap =', item_url)

if is_recursive:
tmp_sites = read_sitemap(item_url, input_skip, input_take, ignore_none_html)
current_index += len(tmp_sites)
sites.extend(tmp_sites)
result = merge_dicts(read_sitemap(
item_url,
input_skip,
input_take,
ignore_none_html), result, True, False)
current_index += len(result['all'])
else:
if ignore_none_html:
item_type = 'html'
@@ -72,35 +113,35 @@ def read_sitemap_xml(sitemap_content, input_skip, input_take, ignore_none_html):
item_type = tmp

if 'html' != item_type and 'htm' != item_type:
print('- skipping because it is of type: {0}'.format(item_type))
# current_index += 1
print(f'- skipping because it is of type: {item_type}')
continue

item_content_type = get_content_type(item_url, cache_time_delta)
print('content-type', item_content_type)
if item_content_type == 401:
print('- skipping because it is of status-code: {0}'.format(item_content_type))
print(f'- skipping because it is of status-code: {item_content_type}')
continue
elif item_content_type != None and 'html' not in item_content_type:
print('- skipping because it is of content-type: {0}'.format(item_content_type))
# current_index += 1
elif item_content_type is not None and 'html' not in item_content_type:
print(f'- skipping because it is of content-type: {item_content_type}')
continue

sites.append([current_index, item_url])
result['all'].append(item_url)
result[key].append(item_url)
current_index += 1
return sites
return result


def add_site(input_filename, url, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml, NO changes will be made")
def add_site(input_filename, _, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml,"
,"NO changes will be made")

sites = read_sites(input_filename, input_skip, input_take)

return sites


def delete_site(input_filename, url, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml, NO changes will be made")
def delete_site(input_filename, _, input_skip, input_take):
print("WARNING: sitemap engine is a read only method for testing all pages in a sitemap.xml,"
,"NO changes will be made")

sites = read_sites(input_filename, input_skip, input_take)

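Note on the engines/sitemap.py change: parsing moves from a <loc> regex over the raw text to BeautifulSoup with the 'xml' parser, which also copes with the CDATA-wrapped <loc> values mentioned in the TODO, and read_sitemap/read_sitemap_xml now return a dict keyed by sitemap URL (plus an 'all' list) combined via merge_dicts. The snippet below is a minimal, self-contained sketch of that parsing approach, not the repository's exact code; the sample XML and the helper name classify_and_extract are illustrative only.

from bs4 import BeautifulSoup  # the 'xml' parser requires lxml

SITEMAP_INDEX_SAMPLE = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc><![CDATA[https://example.com/post-sitemap.xml]]></loc></sitemap>
</sitemapindex>"""

def classify_and_extract(sitemap_content):
    soup = BeautifulSoup(sitemap_content, 'xml')
    # The root element tells us whether this is a sitemap index (more sitemaps
    # to recurse into) or a urlset (actual page URLs).
    root = next((el for el in soup.contents if el.name is not None), None)
    if root is None:
        return False, []
    is_sitemap_index = root.name.lower() == 'sitemapindex'
    # Direct <sitemap>/<url> children, each carrying the address in a <loc> child.
    # The 'xml' parser unwraps CDATA, so <loc><![CDATA[...]]></loc> needs no special casing.
    urls = [entry.find('loc').text.strip()
            for entry in root.find_all(['sitemap', 'url'], recursive=False)
            if entry.find('loc') is not None]
    return is_sitemap_index, urls

print(classify_and_extract(SITEMAP_INDEX_SAMPLE))
# -> (True, ['https://example.com/post-sitemap.xml'])

In the committed code the same classification decides whether read_sitemap recurses into child sitemaps (sitemap index) or filters and collects page URLs (urlset), honoring input_skip/input_take via use_item.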
41 changes: 7 additions & 34 deletions tests/http_validator.py
@@ -17,6 +17,7 @@
from models import Rating
from tests.utils import dns_lookup
from tests.utils import *
from utils import merge_dicts
from tests.sitespeed_base import get_result
import dns.name
import dns.query
@@ -92,7 +93,7 @@ def run_test(_, langCode, url):
hostname = o.hostname

if csp_only:
result_dict = merge_dicts(check_csp(url), csp_only_global_result_dict)
result_dict = merge_dicts(check_csp(url), csp_only_global_result_dict, True, True)
if 'nof_pages' not in result_dict:
result_dict['nof_pages'] = 1
else:
@@ -885,34 +886,6 @@ def cleanup(result_dict):
result_dict[domain][subkey] = sorted(list(set(result_dict[domain][subkey])))
return result_dict

def merge_dicts(dict1, dict2):
if dict1 == None:
return dict2
if dict2 == None:
return dict1

for domain, value in dict2.items():
if domain in dict1:
type_of_value = type(value)
if type_of_value == dict:
for subkey, subvalue in value.items():
if subkey in dict1[domain]:
if type(subvalue) == dict:
merge_dicts(dict1[domain][subkey], dict2[domain][subkey])
elif type(subvalue) == list:
dict1[domain][subkey].extend(subvalue)
dict1[domain][subkey] = sorted(list(set(dict1[domain][subkey])))
else:
dict1[domain][subkey] = dict2[domain][subkey]
elif type_of_value == list:
dict1[domain].extend(value)
dict1[domain] = sorted(list(set(dict1[domain])))
elif type_of_value == int:
dict1[domain] = dict1[domain] + value
else:
dict1[domain] = value
return dict1

def host_source_2_url(host_source):
result = host_source
if '*' in result:
Expand Down Expand Up @@ -1836,7 +1809,7 @@ def check_http_to_https(url):
result_dict[o_domain]['schemes'].append('HTTP-REDIRECT*')
https_url = url.replace('http://', 'https://')
print('HTTPS', o_domain)
result_dict = merge_dicts(get_website_support_from_sitespeed(https_url, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(https_url, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)
else:
result_dict[o_domain]['schemes'].append('HTTPS-REDIRECT*')

@@ -1854,7 +1827,7 @@ def check_http_to_https(url):
result_dict[www_domain_key]['schemes'].append('HTTPS-REDIRECT*')
www_http_url = http_url.replace(o_domain, www_domain_key)
print('HTTP', www_domain_key)
result_dict = merge_dicts(get_website_support_from_sitespeed(www_http_url, www_domain_key, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(www_http_url, www_domain_key, configuration, browser, sitespeed_timeout), result_dict, True, True)
else:
result_dict[www_domain_key]['schemes'].append('HTTP-REDIRECT*')

@@ -2272,21 +2245,21 @@ def check_http_version(url, result_dict):
configuration = ' --firefox.preference network.http.http2.enabled:false --firefox.preference network.http.http3.enable:false'
url2 = change_url_to_test_url(url, 'HTTPv1')
print('HTTP/1.1')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

if not contains_value_for_all(result_dict, 'protocols', 'HTTP/2'):
browser = 'firefox'
configuration = ' --firefox.preference network.http.http2.enabled:true --firefox.preference network.http.http3.enable:false --firefox.preference network.http.version:3.0'
url2 = change_url_to_test_url(url, 'HTTPv2')
print('HTTP/2')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

if not contains_value_for_all(result_dict, 'protocols', 'HTTP/3'):
browser = 'firefox'
configuration = ' --firefox.preference network.http.http2.enabled:false --firefox.preference network.http.http3.enable:true --firefox.preference network.http.version:3.0'
url2 = change_url_to_test_url(url, 'HTTPv3')
print('HTTP/3')
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict)
result_dict = merge_dicts(get_website_support_from_sitespeed(url2, o_domain, configuration, browser, sitespeed_timeout), result_dict, True, True)

return result_dict
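
Note on the tests/http_validator.py change: the local merge_dicts that used to live in this module (removed above) is replaced by a shared helper imported from utils, and every call site now passes two extra boolean arguments (True, True here; True, False in engines/sitemap.py). The shared implementation is not part of the visible diff, so the sketch below is only a guess at its shape based on the removed local version; the parameter names and the meaning attached to the two flags are assumptions for illustration, not the project's actual signature.

def merge_dicts(dict1, dict2, sort_unique_lists=True, replace_scalars=True):
    # Hedged reconstruction based on the removed local helper; the two boolean
    # flags and their behaviour are assumed, not taken from the actual utils module.
    if dict1 is None:
        return dict2
    if dict2 is None:
        return dict1

    for key, value in dict2.items():
        if key not in dict1:
            dict1[key] = value
        elif isinstance(value, dict):
            # Merge nested dicts key by key instead of overwriting them.
            merge_dicts(dict1[key], value, sort_unique_lists, replace_scalars)
        elif isinstance(value, list):
            dict1[key].extend(value)
            if sort_unique_lists:
                dict1[key] = sorted(set(dict1[key]))
        elif isinstance(value, int):
            dict1[key] += value
        elif replace_scalars:
            dict1[key] = value
    return dict1

Whatever the real flags mean, the visible effect of this commit is that all merges go through one shared helper instead of a per-module copy.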
