Skip to content

Commit

Permalink
Basic e-commerce sitemap support
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Jan 7, 2025
1 parent 1b72aa8 commit dfb4df0
Show file tree
Hide file tree
Showing 6 changed files with 525 additions and 25 deletions.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"form2request>=0.2.0",
"formasaurus>=0.10.0",
"jmespath>=0.9.5",
"protego>=0.3.0",
"pydantic>=2.1",
"requests>=2.31.0",
"scrapinghub >= 2.4.0",
Expand Down
1 change: 1 addition & 0 deletions tests/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/fs.example/
114 changes: 107 additions & 7 deletions tests/mockserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import socket
import sys
import time
from base64 import b64encode
from importlib import import_module
from pathlib import Path
from subprocess import PIPE, Popen
from typing import Any, Dict
from urllib.parse import urlparse

from scrapy_zyte_api.responses import _API_RESPONSE
from twisted.internet import reactor
Expand Down Expand Up @@ -36,20 +39,30 @@ class DefaultResource(Resource):
https://example.com/category/1
https://example.com/category/1/page/2
https://example.com/non-navigation
https://example.com/sitemap-category
https://example.com/sitemap-product/1
https://example.com/sitemap-product/2
```
When browserHtml is requested (for any URL, listed above or not), it is
a minimal HTML with an anchor tag pointing to
https://example.com/non-navigation.
When productNavigation is requested, nextPage and subCategories are filled
accordingly. productNavigation.items always has 2 product URLs, which are
the result of appending ``/product/<n>`` to the request URL.
https://example.com/non-navigation is not reachable through
productNavigation.
When productNavigation is requested, nextPage and subCategories are
filled accordingly. productNavigation.items always has 2 product URLs,
which are the result of appending ``/product/<n>`` to the request URL.
When product or productList is requested, an item with the current URL is
always returned.
The following pages are not reachable through productNavigation:
- https://example.com/non-navigation is linked from an ``a`` element in
the HTML of every page.
- https://example.com/sitemap-* URLs are linked from
https://example.com/sitemap*.xml, which are linked from
https://example.com/robots.txt.
When product or productList is requested, an item with the current URL
is always returned.
All output also includes unsupported links (mailto:…).
Expand All @@ -61,6 +74,16 @@ class DefaultResource(Resource):
- https://jobs.offsite.example/jobs/1 (jobPosting)
- https://jobs.offsite.example/jobs/2 (jobPosting)
- For fs.example subdomains, a matching file is looked for in the file
system. If found, its content is base64-encoded and returned as
httpResponseBody. Otherwise, a minimal response is built from the
requested outputs.
For example, for the URL https://abcdefg.fs.example/foo, if a file
exists at tests/fs.example/abcdefg/foo, its content is returned as
httpResponseBody. Otherwise, the response is as empty as possible based
on input parameters, except when requesting productNavigation, which
includes an item at <url>/p unless the URL has an empty path (in which
case items is empty).
"""

def getChild(self, path, request):
Expand Down Expand Up @@ -99,6 +122,82 @@ def render_POST(self, request):
}
return json.dumps(response_data).encode()

if request_data["url"] == "https://example.com/robots.txt":
assert request_data["httpResponseBody"] is True
body = b"""
Sitemap: https://example.com/sitemap.xml # Link to category
SiTeMaP: https://example.com/sitemap-index.xml # Links to products
"""
response_data["httpResponseBody"] = b64encode(body).decode()
return json.dumps(response_data).encode()

if request_data["url"] == "https://example.com/sitemap.xml":
assert request_data["httpResponseBody"] is True
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/sitemap-category</loc>
</url>
</urlset>
"""
response_data["httpResponseBody"] = b64encode(body).decode()
return json.dumps(response_data).encode()

if request_data["url"] == "https://example.com/sitemap-index.xml":
assert request_data["httpResponseBody"] is True
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://example.com/sitemap-products.xml</loc>
</sitemap>
</sitemapindex>
"""
response_data["httpResponseBody"] = b64encode(body).decode()
return json.dumps(response_data).encode()

if request_data["url"] == "https://example.com/sitemap-products.xml":
assert request_data["httpResponseBody"] is True
body = b"""
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://example.com/sitemap-product/1</loc>
</url>
<url>
<loc>https://example.com/sitemap-product/2</loc>
</url>
</urlset>
"""
response_data["httpResponseBody"] = b64encode(body).decode()
return json.dumps(response_data).encode()

if "fs.example" in request_data["url"]:
parsed_url = urlparse(request_data["url"])
subdir_name = parsed_url.netloc[: -len(".fs.example")]
root_dir = Path(__file__).parent / "fs.example"
subdir = root_dir / subdir_name
filepath = subdir / parsed_url.path.lstrip("/")
if filepath != subdir and filepath.exists():
response_data["httpResponseBody"] = b64encode(
filepath.read_bytes()
).decode()
else:
if request_data.get("browserHtml", False) is True:
response_data["browserHtml"] = "<!doctype html><title>a</title>"
if request_data.get("product", False) is True:
response_data["product"] = {"url": response_data["url"]}
if request_data.get("productNavigation", False) is True:
items = []
if filepath != subdir:
items = [{"url": f"{request_data['url'].rstrip('/')}/p"}]
response_data["productNavigation"] = {
"url": response_data["url"],
"items": items,
}
return json.dumps(response_data).encode()

non_navigation_url = "https://example.com/non-navigation"
html = f"""<html><body><a href="{non_navigation_url}"></a><a href="mailto:[email protected]"></a></body></html>"""
if request_data.get("browserHtml", False) is True:
Expand All @@ -119,6 +218,7 @@ def render_POST(self, request):
if (
"/page/" not in request_data["url"]
and "/non-navigation" not in request_data["url"]
and "/sitemap" not in request_data["url"]
):
kwargs["nextPage"] = {
"url": f"{request_data['url'].rstrip('/')}/page/2"
Expand Down
Loading

0 comments on commit dfb4df0

Please sign in to comment.