Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
7h3Rabbit committed Mar 22, 2024
1 parent c6717b5 commit b763365
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 28 deletions.
7 changes: 3 additions & 4 deletions engines/sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,8 @@ def read_sitemap_xml(key, sitemap_content, input_skip, input_take, ignore_none_h
children = [child for child in root_element.children \
if child.name == 'url' or child.name == 'sitemap']


nof_children = len(children)
print('\tnof_children =', nof_children)
# nof_children = len(children)
# print('\tnof_children =', nof_children)

# https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
current_index = 0
Expand All @@ -95,7 +94,7 @@ def read_sitemap_xml(key, sitemap_content, input_skip, input_take, ignore_none_h
continue

if is_sitemap_index:
print('\tsitemap =', item_url)
# print('\tsitemap =', item_url)

result = merge_dicts(read_sitemap(
item_url,
Expand Down
71 changes: 47 additions & 24 deletions tests/standard_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
useragent = config.useragent
review_show_improvements_only = config.review_show_improvements_only

# Optional setting: config.py is not required to define use_detailed_report.
# getattr() with a default handles only the "attribute is missing" case, so
# unrelated errors (and KeyboardInterrupt/SystemExit) are no longer silently
# swallowed the way a bare `except:` would swallow them.
# If use_detailed_report is not set in config.py, False is the default.
use_detailed_report = getattr(config, 'use_detailed_report', False)


def run_test(_, langCode, url):
"""
Expand Down Expand Up @@ -122,19 +128,30 @@ def validate_sitemaps(_, _local, robots_url, robots_content, has_robots_txt):
if len(found_smaps) > 0:
return_dict["sitemaps"] = found_smaps

print('found sitemaps = ', found_smaps)
# print('found sitemaps = ', found_smaps)

sitemaps_rating = Rating(_, review_show_improvements_only)
for sitemap_url in found_smaps:
sitemaps_rating += validate_sitemap(sitemap_url, robots_url, return_dict, _, _local)
rating += sitemaps_rating

final_rating = Rating(_, review_show_improvements_only)
if sitemaps_rating.is_set:
if use_detailed_report:
final_rating.set_overall(sitemaps_rating.get_overall())
final_rating.overall_review = sitemaps_rating.overall_review
final_rating.set_standards(sitemaps_rating.get_standards())
final_rating.standards_review = sitemaps_rating.standards_review
# final_rating.set_integrity_and_security(rating.get_integrity_and_security())
# final_rating.integrity_and_security_review = rating.integrity_and_security_review
else:
final_rating.set_overall(sitemaps_rating.get_overall())
final_rating.set_standards(sitemaps_rating.get_standards(), _local("TEXT_SITEMAP_OK"))
# final_rating.set_integrity_and_security(rating.get_integrity_and_security(), _local('TEXT_REVIEW_CSP').format(domain))
rating += final_rating
else:
rating.set_overall(2.0)
rating.set_standards(2.0, _local("TEXT_SITEMAP_FOUND"))




return (rating, return_dict)

def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
Expand All @@ -147,7 +164,7 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
parsed_robots_url = urllib.parse.urlparse(robots_url)
robots_domain = parsed_robots_url.hostname

print(sitemap_url)
# print(sitemap_url)
sitemaps = read_sitemap(sitemap_url, -1, -1, False)
sitemap_items = sitemaps['all']

Expand All @@ -165,19 +182,17 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
if robots_domain != parsed_item_url.hostname:
always_uses_same_domain = False

if '&' in item_url:
print('\t-', item_url)

# TODO: validate url encoding ( Example: https://www.gotene.se/webdav/files/Centrumhuset/Kultur, turism & fritid/Biblioteket/hemsidefilm/loss_teckensprak.html )
tmp = os.path.splitext(parsed_item_url.path)[1].strip('.').lower()
ext_len = len(tmp)
# print('ext', tmp)
if ext_len <= 4 and ext_len >= 2:
item_type = tmp
# TODO: should we do some checking for html and htm pages? (gotene.se seems to use it for flash (only?)...)
if tmp not in ('html','htm'):
# TODO: ensure known file extension
item_type = tmp
elif parsed_item_url.path.startswith('/download/'):
item_type = 'unknown-in-download'


if item_type not in item_types:
item_types[item_type] = []
item_types[item_type].append(item_url)
Expand All @@ -191,8 +206,8 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
sitemap_items = list(set(sitemap_items))
total_nof_items_no_duplicates = len(sitemap_items)

print('total_nof_items =', total_nof_items)
print('total_nof_items_no_duplicates =', total_nof_items_no_duplicates)
# print('total_nof_items =', total_nof_items)
# print('total_nof_items_no_duplicates =', total_nof_items_no_duplicates)

if not always_starts_with_https_scheme:
sub_rating = Rating(_, review_show_improvements_only)
Expand Down Expand Up @@ -226,11 +241,13 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):


if total_nof_items != total_nof_items_no_duplicates:
ratio = total_nof_items_no_duplicates / total_nof_items
duplicates_points = 3.0 * ratio
sub_rating = Rating(_, review_show_improvements_only)
sub_rating.set_overall(
4.0)
duplicates_points)
sub_rating.set_standards(
4.0, _local("TEXT_SITEMAP_INCLUDE_DUPLICATES"))
duplicates_points, _local("TEXT_SITEMAP_INCLUDE_DUPLICATES"))
rating += sub_rating
else:
sub_rating = Rating(_, review_show_improvements_only)
Expand All @@ -241,11 +258,17 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
rating += sub_rating

if len(item_type_keys) > 1:
webpages_points = 1.0
if 'webpage' in item_type_keys:
nof_webpages = len(item_types['webpage'])
ratio = nof_webpages / total_nof_items
webpages_points = 5.0 * ratio

sub_rating = Rating(_, review_show_improvements_only)
sub_rating.set_overall(
1.0)
webpages_points)
sub_rating.set_standards(
1.0, _local("TEXT_SITEMAP_NOT_ONLY_WEBPAGES"))
webpages_points, _local("TEXT_SITEMAP_NOT_ONLY_WEBPAGES"))
rating += sub_rating
else:
sub_rating = Rating(_, review_show_improvements_only)
Expand All @@ -265,9 +288,9 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
if nof_items > 50_000:
sub_rating = Rating(_, review_show_improvements_only)
sub_rating.set_overall(
3.0)
1.0)
sub_rating.set_standards(
3.0, _local("TEXT_SITEMAP_TOO_LARGE"))
1.0, _local("TEXT_SITEMAP_TOO_LARGE"))
rating += sub_rating
else:
sub_rating = Rating(_, review_show_improvements_only)
Expand All @@ -283,15 +306,15 @@ def validate_sitemap(sitemap_url, robots_url, return_dict, _, _local):
item_types[key] = list(set(item_types[key]))
type_spread[key] = len(item_types[key])

nice_items = json.dumps(type_spread, indent=14)
print('\tsitemap[distribution of types]', nice_items)
# nice_items = json.dumps(type_spread, indent=14)
# print('\tsitemap[distribution of types]', nice_items)

if total_nof_items == 0:
sub_rating = Rating(_, review_show_improvements_only)
sub_rating.set_overall(
3.0)
1.0)
sub_rating.set_standards(
3.0, _local("TEXT_SITEMAP_BROKEN"))
1.0, _local("TEXT_SITEMAP_BROKEN"))
rating += sub_rating

return_dict['sitemap_check'] = f"'{sitemap_url}' seem to be broken"
Expand Down

0 comments on commit b763365

Please sign in to comment.