Commit 8df7baa
add weworkremotely
honzajavorek committed Jan 17, 2024
1 parent 2710df6 commit 8df7baa
Showing 11 changed files with 964 additions and 3 deletions.
1 change: 0 additions & 1 deletion juniorguru_plucker/jobs_startupjobs/spider.py
@@ -1,5 +1,4 @@
-import html
 from datetime import datetime
 from typing import Generator
 
 from itemloaders.processors import Compose, Identity, MapCompose, TakeFirst
11 changes: 11 additions & 0 deletions juniorguru_plucker/jobs_weworkremotely/.actor/actor.json
@@ -0,0 +1,11 @@
{
  "actorSpecification": 1,
  "name": "jobs-weworkremotely",
  "title": "jobs-weworkremotely",
  "version": "0.0",
  "dockerfile": "../../../Dockerfile",
  "dockerContextDir": "../../../",
  "storages": {
    "dataset": "../../schemas/jobSchema.json"
  }
}
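
The `storages.dataset` entry wires the actor's dataset to the shared job schema two directories up. A minimal sketch of resolving that reference, assuming the repository layout this commit adds (the paths are illustrative, not part of the commit):

import json
from pathlib import Path

# location of the actor config added in this commit
actor_dir = Path("juniorguru_plucker/jobs_weworkremotely/.actor")
actor_config = json.loads((actor_dir / "actor.json").read_text())

# "../../schemas/jobSchema.json" is relative to the .actor directory,
# so it resolves to juniorguru_plucker/schemas/jobSchema.json
schema_path = (actor_dir / actor_config["storages"]["dataset"]).resolve()
job_schema = json.loads(schema_path.read_text())
print(schema_path, sorted(job_schema))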
Empty file.
95 changes: 95 additions & 0 deletions juniorguru_plucker/jobs_weworkremotely/spider.py
@@ -0,0 +1,95 @@
import html
import json
import time
from datetime import date, datetime

import extruct
import feedparser
from itemloaders.processors import Identity, MapCompose, TakeFirst
from lxml import etree
from scrapy import Spider as BaseSpider
from scrapy.loader import ItemLoader

from juniorguru_plucker.items import Job
from juniorguru_plucker.processors import absolute_url


class Spider(BaseSpider):
    name = "jobs-weworkremotely"
    start_urls = [
        "https://weworkremotely.com/categories/remote-devops-sysadmin-jobs.rss",
        "https://weworkremotely.com/categories/remote-programming-jobs.rss",
    ]

    def parse(self, response):
        # read the RSS feed and follow each entry to its job page,
        # passing the feed-level fields along to the detail callback
        for entry in feedparser.parse(response.text).entries:
            feed_data = dict(
                title=entry.title,
                first_seen_on=parse_struct_time(entry.published_parsed),
                company_logo_urls=[
                    c["url"] for c in getattr(entry, "media_content", [])
                ],
                description_html=entry.summary,
                remote=True,
                source_urls=response.url,
            )
            yield response.follow(
                entry.link, callback=self.parse_job, cb_kwargs=dict(feed_data=feed_data)
            )

    def parse_job(self, response, feed_data):
        loader = Loader(item=Job(), response=response)
        loader.add_value("url", response.url)

        # seed the item with the values scraped from the feed
        for key, value in feed_data.items():
            loader.add_value(key, value)

        # enrich the item from the JobPosting JSON-LD embedded in the
        # job page; if it cannot be parsed, fall back to feed data only
        try:
            data = extract_job_posting(response.text, response.url)
        except (ValueError, json.JSONDecodeError, etree.ParserError):
            pass
        else:
            loader.add_value("source", self.name)
            loader.add_value("source_urls", response.url)
            loader.add_value("title", data["title"])
            loader.add_value("first_seen_on", data["datePosted"])
            loader.add_value("description_html", html.unescape(data["description"]))
            loader.add_value("company_logo_urls", data.get("image"))
            loader.add_value("employment_types", [data["employmentType"]])
            loader.add_value("company_name", data["hiringOrganization"]["name"])
            loader.add_value("company_url", data["hiringOrganization"]["sameAs"])
            loader.add_value("locations_raw", data["hiringOrganization"]["address"])
        yield loader.load_item()


def parse_struct_time(struct_time):
    # feedparser exposes published dates as time.struct_time
    if struct_time:
        return datetime.fromtimestamp(time.mktime(struct_time)).date()


def parse_date(value: str | date | None) -> date | None:
    # feed entries arrive as date objects already, JSON-LD as ISO 8601
    # strings (possibly with a time part, hence the [:10] trim)
    if isinstance(value, date):
        return value
    if value:
        return date.fromisoformat(value[:10])


def extract_job_posting(html_string, base_url):
    # pull JSON-LD out of the page and return the first JobPosting object
    data = extruct.extract(html_string, base_url, syntaxes=["json-ld"])
    try:
        return [
            data_item
            for data_item in data["json-ld"]
            # .get() so objects without a @type key are skipped, not fatal
            if data_item.get("@type") == "JobPosting"
        ][0]
    except IndexError:
        raise ValueError("json-ld provided no job postings")


class Loader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()
    company_url_in = MapCompose(absolute_url)
    first_seen_on_in = MapCompose(parse_date)
    company_logo_urls_out = Identity()
    remote_in = MapCompose(bool)
    locations_raw_out = Identity()
    source_urls_out = Identity()
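
For a quick local run outside the Apify actor packaging, the spider can be driven with Scrapy's stock crawler process. A minimal sketch; the FEEDS setting and the output file name are illustrative, not part of this commit:

from scrapy.crawler import CrawlerProcess

from juniorguru_plucker.jobs_weworkremotely.spider import Spider

# write scraped Job items to a local JSON file for inspection
process = CrawlerProcess(settings={"FEEDS": {"jobs.json": {"format": "json"}}})
process.crawl(Spider)
process.start()  # blocks until the crawl finishes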