Commit
Updated files to accept multiple sitemap requests
RK206 committed Nov 5, 2024
1 parent b076f3c commit 442a5a4
Showing 5 changed files with 115 additions and 19 deletions.
10 changes: 10 additions & 0 deletions doajtest/mocks/models_Cache.py
@@ -26,6 +26,16 @@ def cache_sitemap(cls, filename):
"filename" : filename
}

@classmethod
def cache_nth_sitemap(cls, n, url):
cls.__memory__["sitemap" + str(n)] = {
"filename": url
}

@classmethod
def get_sitemap(cls, n):
return cls.__memory__["sitemap" + str(n)]

@classmethod
def get_latest_sitemap(cls):
return cls.__memory__["sitemap"]
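
The mock mirrors the real Cache model's storage pattern: each sitemap file's location is stored under the key "sitemap<n>" in an in-memory dict. A minimal standalone sketch of that pattern, with an illustrative URL:

    # in-memory sketch of the keying scheme used by the mock above
    memory = {}

    def cache_nth_sitemap(n, url):
        memory["sitemap" + str(n)] = {"filename": url}

    def get_sitemap(n):
        return memory["sitemap" + str(n)]

    cache_nth_sitemap(0, "/sitemaps/sitemap_doaj_20241105_0000_0_utf8.xml")
    assert get_sitemap(0)["filename"].endswith("_0_utf8.xml")
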
66 changes: 53 additions & 13 deletions doajtest/unit/test_bll_site_sitemap.py
@@ -1,11 +1,12 @@
import os.path
from io import StringIO

from combinatrix.testintegration import load_parameter_sets
from lxml import etree
from parameterized import parameterized

from doajtest import helpers
- from doajtest.fixtures import JournalFixtureFactory
+ from doajtest.fixtures import JournalFixtureFactory, ArticleFixtureFactory
from doajtest.helpers import DoajTestCase, patch_config
from doajtest.mocks.models_Cache import ModelCacheMockFactory
from doajtest.mocks.store import StoreMockFactory
@@ -67,6 +68,10 @@ def setUp(self):
}
]

self.base_url = app.config.get("BASE_URL")
if not self.base_url.endswith("/"):
self.base_url += "/"

def tearDown(self):
self.localStore.delete_container(self.container_id)
self.tmpStore.delete_container(self.container_id)
@@ -110,12 +115,21 @@ def test_sitemap(self, name, kwargs):

expectations = [(j.bibjson().get_preferred_issn(), j.last_updated) for j in journals]

articles = []
for s in ArticleFixtureFactory.make_many_article_sources(count=10, in_doaj=True):
a = models.Article(**s)
a.save()
articles.append(a)
models.Article.blockall([(a.id, a.last_updated) for a in articles])

articles_expectations = [(a.id, a.last_updated) for a in articles]

if prune:
self.localStore.store(self.container_id, "sitemap__doaj_20180101_0000_utf8.xml",
self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000_utf8.xml",
source_stream=StringIO("test1"))
self.localStore.store(self.container_id, "sitemap__doaj_20180601_0000_utf8.xml",
self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000_utf8.xml",
source_stream=StringIO("test2"))
self.localStore.store(self.container_id, "sitemap__doaj_20190101_0000_utf8.xml",
self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000_utf8.xml",
source_stream=StringIO("test3"))

###########################################################
@@ -139,41 +153,63 @@ def test_sitemap(self, name, kwargs):
filenames = self.localStore.list(self.container_id)
if prune:
assert len(filenames) == 2, "expected 2, received {}".format(len(filenames))
- assert "sitemap__doaj_20180101_0000_utf8.xml" not in filenames
- assert "sitemap__doaj_20180601_0000_utf8.xml" not in filenames
- assert "sitemap__doaj_20190101_0000_utf8.xml" in filenames
+ assert "sitemap_doaj_20180101_0000_utf8.xml" not in filenames
+ assert "sitemap_doaj_20180601_0000_utf8.xml" not in filenames
+ assert "sitemap_doaj_20190101_0000_utf8.xml" in filenames
else:
assert len(filenames) == 1, "expected 1, received {}".format(len(filenames))

latest = None
for fn in filenames:
if fn != "sitemap__doaj_20190101_0000_utf8.xml":
if fn != "sitemap_doaj_20190101_0000_utf8.xml":
latest = fn
break

handle = self.localStore.get(self.container_id, latest, encoding="utf-8")
NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

file_date = '_'.join(latest.split('_')[2:])
index_file = os.path.join(latest, 'sitemap_index_doaj_'+file_date+'_utf8.xml')

# check the contents
handle = self.localStore.get(self.container_id, index_file, encoding="utf-8")

# check sitemap index file
tree = etree.parse(handle)
urlElements = tree.getroot().getchildren()
for urlElement in urlElements:
loc = urlElement.find(NS + "loc").text
assert loc.startswith(self.base_url + "sitemap")

tocs = []
statics = []
NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
article_ids = []

# check sitemap file
sitemap_file = os.path.join(latest, 'sitemap_doaj_' + file_date + '_0_utf8.xml')
handle = self.localStore.get(self.container_id, sitemap_file, encoding="utf-8")

tree = etree.parse(handle)
urlElements = tree.getroot().getchildren()

for urlElement in urlElements:
loc = urlElement.find(NS + "loc").text
lm = urlElement.find(NS + "lastmod")
if lm is not None:
lm = lm.text
cf = urlElement.find(NS + "changefreq").text

assert cf == "daily"
# check journals
if "/toc" in loc:
for exp in expectations:
if loc.endswith(exp[0]):
tocs.append(exp[0])
assert lm == exp[1]
assert cf == "daily"
# check articles
elif "/article/" in loc:
for exp in articles_expectations:
if loc.endswith(exp[0]):
article_ids.append(exp[0])
assert lm == exp[1]
# check static pages
else:
statics.append(loc)
assert lm is None
@@ -183,6 +219,10 @@ def test_sitemap(self, name, kwargs):
tocs = list(set(tocs))
assert len(tocs) == len(expectations)

# deduplicate the list of articles, to check that we saw all articles
article_ids = list(set(article_ids))
assert len(article_ids) == len(articles_expectations)

# deduplicate the statics, to check we saw all of them too
_urls = (get_full_url_safe(r)
for r in nav.yield_all_route(self.static_entries))
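
The new index check in the test boils down to parsing the sitemap index and asserting that every <loc> points back at one of the application's /sitemap<n>.xml routes. A self-contained sketch of that check, using an in-memory document instead of the store (the URL and lastmod values are illustrative):

    from io import BytesIO
    from lxml import etree

    NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
    index_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
    <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <sitemap>
        <loc>https://doaj.org/sitemap0.xml</loc>
        <lastmod>2024-11-05T00:00:00Z</lastmod>
      </sitemap>
    </sitemapindex>"""

    tree = etree.parse(BytesIO(index_xml))
    for sitemap_element in tree.getroot():
        loc = sitemap_element.find(NS + "loc").text
        assert loc.startswith("https://doaj.org/sitemap")
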
26 changes: 22 additions & 4 deletions portality/bll/services/site.py
@@ -9,9 +9,10 @@
from portality.core import app
from portality.lib import nav, dates
from portality.lib.argvalidate import argvalidate
- from portality.lib.dates import FMT_DATETIME_SHORT
+ from portality.lib.dates import FMT_DATETIME_SHORT, FMT_DATETIME_STD
from portality.store import StoreFactory, prune_container
from portality.util import get_full_url_safe
from portality.view.doaj import sitemap

NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
IN_DOAJ = {
@@ -103,6 +104,7 @@ def sitemap(self, prune: bool = True):
base_url += "/"

run_start_time = dates.now_str(FMT_DATETIME_SHORT)
lastmod_date = dates.now_str(FMT_DATETIME_STD)

filename_prefix = 'sitemap_doaj_' + run_start_time
cache_container_id = app.config.get("STORE_CACHE_CONTAINER")
@@ -150,6 +152,7 @@ def sitemap(self, prune: bool = True):
sitemap_generator.add_url(article_loc, lastmod=a.last_updated)
total_articles_count += 1

# finalize the last sitemap file if it still holds any URLs
if sitemap_generator.get_url_count() > 0:
sitemap_generator.finalize_sitemap_file()

@@ -159,13 +162,28 @@ def sitemap(self, prune: bool = True):
with open(sitemap_index_path, "w") as f:
f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
f.write('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n')
sitemap_count = 0
for sitemap_url in sitemap_generator.get_sitemap_files():
f.write(f" <sitemap>\n")
f.write(f" <loc>{sitemap_url}</loc>\n")
f.write(f" <lastmod>{run_start_time}</lastmod>\n")
f.write(f" <loc>{base_url}sitemap{sitemap_count}.xml</loc>\n")
f.write(f" <lastmod>{lastmod_date}</lastmod>\n")
f.write(f" </sitemap>\n")
# Cache the location of this sitemap file under its index
models.Cache.cache_nth_sitemap(sitemap_count, sitemap_url)
sitemap_count += 1
f.write('</sitemapindex>\n')

# Delete any leftover sitemap cache entries from a previous run: an earlier
# run may have produced more sitemap files than this one, so remove any
# cache records at or beyond the current count
while True:
cache = models.Cache.pull("sitemap"+str(sitemap_count))
if cache:
cache.delete()
else:
break
sitemap_count += 1


mainStore.store(container_id, sitemap_index_filename, source_path=sitemap_index_path)
index_url = mainStore.url(container_id, sitemap_index_filename)

@@ -174,7 +192,7 @@ def sitemap(self, prune: bool = True):
# Prune old sitemaps if required
if prune:
def sort(filelist):
rx = "sitemap_doaj_(\d{8})_(\d{4})"
rx = r"^sitemap_doaj_(\d{8})_(\d{4})"

matched_dates = [
(filename, datetime.strptime(match.groups()[0]+"_"+match.groups()[1], FMT_DATETIME_SHORT))
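
The anchored regex is what the prune step uses to recognise dated sitemap artifacts before sorting them newest-first. A standalone sketch of that sort, assuming FMT_DATETIME_SHORT is "%Y%m%d_%H%M" (inferred from fixture names like sitemap_doaj_20180101_0000_utf8.xml, not confirmed from the library):

    import re
    from datetime import datetime

    FMT_DATETIME_SHORT = "%Y%m%d_%H%M"  # assumed value
    rx = r"^sitemap_doaj_(\d{8})_(\d{4})"

    filelist = [
        "sitemap_doaj_20180101_0000_utf8.xml",
        "sitemap_doaj_20190101_0000_utf8.xml",
    ]
    matched_dates = [
        (fn, datetime.strptime("_".join(m.groups()), FMT_DATETIME_SHORT))
        for fn, m in ((fn, re.match(rx, fn)) for fn in filelist)
        if m is not None
    ]
    # newest first: everything past the configured keep-count is a prune candidate
    matched_dates.sort(key=lambda pair: pair[1], reverse=True)
    print([fn for fn, _ in matched_dates])
    # ['sitemap_doaj_20190101_0000_utf8.xml', 'sitemap_doaj_20180101_0000_utf8.xml']
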
15 changes: 15 additions & 0 deletions portality/models/cache.py
@@ -62,13 +62,28 @@ def cache_sitemap(cls, url):
cobj.set_id("sitemap")
cobj.save()

@classmethod
def cache_nth_sitemap(cls, n, url):
cobj = cls(**{
"filename": url
})
cobj.set_id("sitemap"+str(n))
cobj.save()

@classmethod
def get_latest_sitemap(cls):
rec = cls.pull("sitemap")
if rec is None:
return None
return rec.get("filename")

@classmethod
def get_sitemap(cls, n):
rec = cls.pull("sitemap"+str(n))
if rec is None:
return None
return rec.get("filename")

@classmethod
def cache_public_data_dump(cls, article_container, article_filename, article_url, article_size,
journal_container, journal_filename, journal_url, journal_size):
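
Together with the cleanup loop in site.py, these methods give a simple round-trip, sketched below. This assumes a live index behind models.Cache and a refresh between the write and the read; the path is made up:

    from portality import models

    # store the location of the first sitemap file under the id "sitemap0"
    models.Cache.cache_nth_sitemap(0, "/doaj-data-cache/sitemap_doaj_20241105_0300_0_utf8.xml")

    # later, the /sitemap0.xml view resolves the index back to the stored filename
    assert models.Cache.get_sitemap(0) == "/doaj-data-cache/sitemap_doaj_20241105_0300_0_utf8.xml"
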
17 changes: 15 additions & 2 deletions portality/view/doaj.py
@@ -1,4 +1,5 @@
import json
import os.path
import re
import urllib.error
import urllib.parse
@@ -168,7 +169,7 @@ def csv_data():
store_url = "/store" + store_url
return redirect(store_url, code=307)


@blueprint.route("/sitemap_index.xml")
@blueprint.route("/sitemap.xml")
def sitemap():
sitemap_url = models.Cache.get_latest_sitemap()
@@ -178,6 +179,13 @@ def sitemap():
sitemap_url = "/store" + sitemap_url
return redirect(sitemap_url, code=307)

@blueprint.route("/sitemap<n>.xml")
def nth_sitemap(n):
sitemap_url = models.Cache.get_sitemap(n)
if sitemap_url.startswith("/"):
sitemap_url = "/store" + sitemap_url
return redirect(sitemap_url, code=307)


@blueprint.route("/public-data-dump/<record_type>")
@api_key_required
@@ -206,6 +214,10 @@ def public_data_dump_redirect(record_type):

return redirect(store_url, code=307)

@blueprint.route("/store/<container>/<dir>/<filename>")
def get_from_local_store_dir(container, dir, filename):
file = os.path.join(dir, filename)
return get_from_local_store(container, file)

@blueprint.route("/store/<container>/<filename>")
def get_from_local_store(container, filename):
@@ -215,7 +227,8 @@ def get_from_local_store(container, filename):
from portality import store
localStore = store.StoreFactory.get(None)
file_handle = localStore.get(container, filename)
- return send_file(file_handle, mimetype="application/octet-stream", as_attachment=True, attachment_filename=filename)
+ return send_file(file_handle, mimetype="application/octet-stream", as_attachment=True,
+ attachment_filename=os.path.basename(filename))


@blueprint.route('/autocomplete/<doc_type>/<field_name>', methods=["GET", "POST"])
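
Tracing a request end to end under these routes (all paths illustrative): GET /sitemap0.xml looks up the cached location, 307-redirects to its /store/... form, and, because sitemap files now live one directory deep, the new three-segment store route rejoins the directory and filename before delegating to get_from_local_store. A sketch of that join:

    import os.path

    # hypothetical redirect target: /store/<container>/<dir>/<filename>
    container = "doaj-data-cache"
    dir_segment = "sitemap_doaj_20241105_0300"
    filename = "sitemap_doaj_20241105_0300_0_utf8.xml"

    # what get_from_local_store_dir hands on to get_from_local_store
    file = os.path.join(dir_segment, filename)
    print(file)  # sitemap_doaj_20241105_0300/sitemap_doaj_20241105_0300_0_utf8.xml
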
