We will build and deploy a Python script that scrapes sites. To set up and run the finished project:
run:

    python3.12 -m venv .venv && . .venv/bin/activate

then:

    pip install -r requirements.txt

then:

    playwright install

and finally:

    python spiders/google_job_hunt.py
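The requirements.txt itself is not shown in the listing; a minimal sketch covering the two libraries the script imports from (whether to pin versions is left to you):

    scrapy
    scrapy-playwright

playwright itself is pulled in as a dependency of scrapy-playwright, and playwright install then downloads the browser binaries it drives. Here is the script, spiders/google_job_hunt.py: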
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
class GoogleSpider(scrapy.Spider):
    name = 'google_spider'
    allowed_domains = ['www.google.com']

    # Route both HTTP and HTTPS downloads through scrapy-playwright so the
    # JavaScript-rendered results page can be scrolled and paginated.
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }
    def __init__(self, domain, stop, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.domain = domain
        self.stop = int(stop)  # stop paginating once this many URLs are collected
        # The q= parameter is a URL-encoded Google search query; a sketch of
        # how it decodes (and how to rebuild it) follows the script.
        self.start_urls = [
            f'https://www.google.com/search?q=intitle%3A%28%22Data+Scientist%22+OR+%22Data+Engineer%22+OR+%22Machine+Learning%22+OR+%22Data+Analyst%22+OR+%22Software+Engineer%22%29+Remote+-%22Director%22+-%22Principal%22+-%22Staff%22+-%22Frontend%22+-%22Front+End%22+-%22Full+Stack%22+site%3A{self.domain}%2F%2A+after%3A2023-03-27']
        self.urls_collected = []
    def start_requests(self):
        # playwright_include_page=True hands the live Playwright page to the
        # callback so we can keep scrolling and clicking it.
        yield scrapy.Request(
            self.start_urls[0],
            meta={"playwright": True, "playwright_include_page": True},
        )
    async def get_page_info(self, page):
        # Scroll down one viewport at a time, pausing after each scroll so
        # lazily loaded results have a chance to render.
        for _ in range(10):
            height = page.viewport_size["height"]
            await page.mouse.wheel(0, height)
            await page.wait_for_timeout(1000)
        html = await page.content()
        selector = Selector(text=html)
        urls = []
        for row in selector.xpath("//div[contains(@class, 'kCrYT')]"):
            title = row.xpath(".//h3//text()").get()
            url = row.xpath(".//a/@href").get()
            if url:
                urls.append({title: url})
        print(urls)
        self.urls_collected += urls
        return urls
    async def parse(self, response):
        page = response.meta['playwright_page']
        urls = await self.get_page_info(page)
        found = True
        # Keep clicking "Next" until the link disappears or we have enough URLs.
        while found and len(self.urls_collected) < self.stop:
            try:
                element = page.get_by_text("Next")
                print(element, "parsing next page")
                await element.click()
                more_urls = await self.get_page_info(page)
                urls += more_urls
            except Exception:
                # No "Next" link left (the click times out): this was the last page.
                found = False
        # Pages requested with playwright_include_page=True must be closed by hand.
        await page.close()
        return urls
def main(domain, stop, user_agent):
    # The user agent has to be set before the crawl starts: Scrapy reads the
    # spider's settings before instantiating it, so assigning to
    # custom_settings inside __init__ would arrive too late to take effect.
    process = CrawlerProcess(settings={"USER_AGENT": user_agent})
    process.crawl(GoogleSpider, domain=domain, stop=stop)
    process.start()
if __name__ == '__main__':
    domain = 'jobs.lever.co'
    stop = 25
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    user_agent2 = "Opera/9.80 (Windows NT 5.1; U; MRA 5.5 (build 02842); ru) Presto/2.7.62 Version/11.00"
    user_agent3 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)"
    main(domain=domain, stop=stop, user_agent=user_agent3)
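The q= parameter in start_urls is nothing more than a URL-encoded Google search query. A minimal sketch of how to rebuild or tweak it with the standard library; the query text, site, and after: date below simply mirror the values hardcoded above:

from urllib.parse import quote_plus

# The title must contain one of the roles, "Remote" must appear, senior and
# frontend titles are excluded, and results are restricted to the job board.
query = (
    'intitle:("Data Scientist" OR "Data Engineer" OR "Machine Learning" '
    'OR "Data Analyst" OR "Software Engineer") '
    'Remote -"Director" -"Principal" -"Staff" '
    '-"Frontend" -"Front End" -"Full Stack" '
    'site:jobs.lever.co/* after:2023-03-27'
)
print(f'https://www.google.com/search?q={quote_plus(query)}')

Swap out the site: and after: terms to point the spider at a different job board or a fresher date window.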