Skip to content

Commit

Permalink
support more edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
honzajavorek committed Feb 15, 2024
1 parent 8b1a601 commit c569f7b
Show file tree
Hide file tree
Showing 6 changed files with 581 additions and 10 deletions.
30 changes: 21 additions & 9 deletions juniorguru_plucker/jobs_jobscz/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,18 @@

WIDGET_DATA_RE = re.compile(r"window\.__LMC_CAREER_WIDGET__\.push\((.+)\);")

WIDGET_DATA_SCRIPT_RE = re.compile(
r"exports=JSON.parse\('(((?!function).)+)'\)},function"
)

WIDGET_DATA_SCRIPT_RE = re.compile(
r"""
exports=JSON.parse\('
( # group we're matching
{"id":
( # one or more characters that are not the start of the word "function"
(?!function)
.
)+
)
'\)},function
'\)}
(,function|]\);) # either next function or the end of the JSON
""",
re.VERBOSE,
)
Expand Down Expand Up @@ -122,7 +120,9 @@ def parse_job_widget_data(
widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0])
except IndexError:
self.logger.debug("Looking for widget data in attached JavaScript")
script_url = response.css('script[src*="script.min.js"]::attr(src)').get()
script_url = response.css(
'script[src*="assets/js/script.min.js"]::attr(src)'
).get()
yield response.follow(
script_url,
callback=self.parse_job_widget_script,
Expand All @@ -141,13 +141,18 @@ def parse_job_widget_script(
self, script_response: TextResponse, html_response: HtmlResponse, item: Job
) -> Generator[Request, None, None]:
if match := re.search(WIDGET_DATA_SCRIPT_RE, script_response.text):
data = json.loads(match.group(1))
data_text = re.sub(r"\'", r"\\'", match.group(1))
data = json.loads(data_text)

widget_name = select_widget(list(data["widgets"].keys()))
widget_data = data["widgets"][widget_name]

yield from self.parse_job_widget(
html_response,
item,
widget_host=data["host"],
widget_api_key=data["widgets"]["main"]["apiKey"],
widget_id=data["widgets"]["main"]["id"],
widget_api_key=widget_data["apiKey"],
widget_id=widget_data["id"],
)
else:
raise NotImplementedError("Widget data not found")
Expand Down Expand Up @@ -228,6 +233,13 @@ def load_gql(path: str | Path) -> str:
return Path(path).read_text()


def select_widget(names: list[str]) -> str:
    """Pick which widget to use from the available widget names.

    The first name starting with ``"main"`` wins. If no name has that
    prefix, the first name in the list is used as a fallback.

    Args:
        names: Widget names in the order they should be considered.

    Returns:
        The selected widget name.

    Raises:
        IndexError: If ``names`` is empty, so there is nothing to select.
    """
    if not names:
        # Same exception type as indexing an empty list, but with a message
        # that explains what was actually missing.
        raise IndexError("No widget names to select from")
    # The generator keeps the scan lazy: it stops at the first "main*" name.
    preferred = next((name for name in names if name.startswith("main")), None)
    return names[0] if preferred is None else preferred


def clean_url(url: str) -> str:
return strip_params(
url, ["positionOfAdInAgentEmail", "searchId", "rps", "impressionId"]
Expand Down
8 changes: 8 additions & 0 deletions tests/jobs_jobscz/job_widget_script2.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/jobs_jobscz/job_widget_script3.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/jobs_jobscz/job_widget_script4.js

Large diffs are not rendered by default.

Loading

0 comments on commit c569f7b

Please sign in to comment.