Skip to content

Commit

Permalink
Add option to choose which JSON-LD block to use when more than one is found (#57)
Browse files — browse the repository at this point in the history
  • Loading branch information
iannesbitt committed Mar 7, 2024
1 parent db5eaa0 commit 2698389
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions soscan/spiders/jsonldspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(self, *args, **kwargs):
self.start_point = None
self.url_match = None
self.reversed = None
self.which_jsonld = 0
if len(self.sitemap_urls) < 1:
raise ValueError("At least one sitemap URL is required.")
if self.lastmod_filter is not None:
Expand Down Expand Up @@ -105,6 +106,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.url_match = _cs.get(s, None)
if s in "reversed":
spider.reversed = _cs.get(s, None)
if s in "which_jsonld":
spider.which_jsonld = _cs.get(s, None)
return spider

def sitemap_filter(self, entries):
Expand Down Expand Up @@ -223,6 +226,8 @@ def parse(self, response, **kwargs):
# format_id
if len(jsonld) == 1:
jsonld=jsonld[0]
else:
jsonld=jsonld[self.which_jsonld]

item = soscan.items.SoscanItem()
item["url"] = response.url
Expand Down

0 comments on commit 2698389

Please sign in to comment.