Commit
1 parent 2710df6 · commit 8df7baa
Showing 11 changed files with 964 additions and 3 deletions.
@@ -0,0 +1,11 @@
{
  "actorSpecification": 1,
  "name": "jobs-weworkremotely",
  "title": "jobs-weworkremotely",
  "version": "0.0",
  "dockerfile": "../../../Dockerfile",
  "dockerContextDir": "../../../",
  "storages": {
    "dataset": "../../schemas/jobSchema.json"
  }
}
Empty file.
@@ -0,0 +1,95 @@
import html
import json
import time
from datetime import date, datetime

import extruct
import feedparser
from itemloaders.processors import Identity, MapCompose, TakeFirst
from lxml import etree
from scrapy import Spider as BaseSpider
from scrapy.loader import ItemLoader

from juniorguru_plucker.items import Job
from juniorguru_plucker.processors import absolute_url


class Spider(BaseSpider):
    # crawls the We Work Remotely RSS feeds and follows each entry
    # to its job detail page
    name = "jobs-weworkremotely"
    start_urls = [
        "https://weworkremotely.com/categories/remote-devops-sysadmin-jobs.rss",
        "https://weworkremotely.com/categories/remote-programming-jobs.rss",
    ]

    def parse(self, response):
        for entry in feedparser.parse(response.text).entries:
            # data available directly in the RSS feed entry
            feed_data = dict(
                title=entry.title,
                first_seen_on=parse_struct_time(entry.published_parsed),
                company_logo_urls=[
                    c["url"] for c in getattr(entry, "media_content", [])
                ],
                description_html=entry.summary,
                remote=True,
                source_urls=response.url,
            )
            yield response.follow(
                entry.link, callback=self.parse_job, cb_kwargs=dict(feed_data=feed_data)
            )

    def parse_job(self, response, feed_data):
        loader = Loader(item=Job(), response=response)
        loader.add_value("url", response.url)

        for key, value in feed_data.items():
            loader.add_value(key, value)

        # enrich the item with structured JobPosting data, if the page provides any
        try:
            data = extract_job_posting(response.text, response.url)
        except (ValueError, json.JSONDecodeError, etree.ParserError):
            pass
        else:
            loader.add_value("source", self.name)
            loader.add_value("source_urls", response.url)
            loader.add_value("title", data["title"])
            loader.add_value("first_seen_on", data["datePosted"])
            loader.add_value("description_html", html.unescape(data["description"]))
            loader.add_value("company_logo_urls", data.get("image"))
            loader.add_value("employment_types", [data["employmentType"]])
            loader.add_value("company_name", data["hiringOrganization"]["name"])
            loader.add_value("company_url", data["hiringOrganization"]["sameAs"])
            loader.add_value("locations_raw", data["hiringOrganization"]["address"])
        yield loader.load_item()


def parse_struct_time(struct_time):
    if struct_time:
        return datetime.fromtimestamp(time.mktime(struct_time)).date()


def parse_date(value: str | None) -> date | None:
    if value:
        return date.fromisoformat(value[:10])


def extract_job_posting(html_string, base_url):
    # returns the first schema.org JobPosting found in the page's JSON-LD
    data = extruct.extract(html_string, base_url, syntaxes=["json-ld"])
    try:
        return [
            data_item
            for data_item in data["json-ld"]
            if data_item["@type"] == "JobPosting"
        ][0]
    except IndexError:
        raise ValueError("json-ld provided no job postings")


class Loader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    company_url_in = MapCompose(absolute_url)
    first_seen_on_in = MapCompose(parse_date)
    company_logo_urls_out = Identity()
    remote_in = MapCompose(bool)
    locations_raw_out = Identity()
    source_urls_out = Identity()
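
For reference, the JSON-LD lookup that extract_job_posting builds on can be tried in isolation: extruct pulls every JSON-LD block out of a page, and the spider keeps the first one typed as a schema.org JobPosting. The sketch below is not part of the commit; the sample page and its values are invented for illustration, and only the extruct package is needed to run it.

import extruct

SAMPLE_HTML = """
<html><head>
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "JobPosting",
  "title": "Backend Developer",
  "datePosted": "2024-01-15",
  "employmentType": "FULL_TIME",
  "hiringOrganization": {"@type": "Organization", "name": "Example Co", "sameAs": "https://example.com"}
}
</script>
</head><body></body></html>
"""

# syntaxes=["json-ld"] limits extraction to JSON-LD, as in the spider above
data = extruct.extract(SAMPLE_HTML, base_url="https://example.com/jobs/1", syntaxes=["json-ld"])
postings = [item for item in data["json-ld"] if item["@type"] == "JobPosting"]
print(postings[0]["title"])       # Backend Developer
print(postings[0]["datePosted"])  # 2024-01-15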