diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5534be6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+build/*
+data/*
+*.env
+*.egg-info
+*.pyc
+*.rdb
+.vscode/*
+dist/*
+.DS_Store
+*.sqlite3
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..2dde64a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM alephdata/memorious:latest
+
+COPY setup.py /crawlers/
+COPY src /crawlers/src
+RUN pip3 install -q -e /crawlers
+COPY config /crawlers/config
+
+ENV MEMORIOUS_BASE_PATH=/data \
+    MEMORIOUS_CONFIG_PATH=/crawlers/config \
+    MEMORIOUS_DEBUG=false \
+    ARCHIVE_PATH=/data/archive \
+    REDIS_URL=redis://redis:6379/0 \
+    MEMORIOUS_DATASTORE_URI=postgresql://datastore:datastore@datastore/datastore
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0ec5f09
--- /dev/null
+++ b/README.md
@@ -0,0 +1,5 @@
+# Memorious example project
+
+This folder can be used as an example template of a Memorious deployment.
+Copy it into its own git repository as a starting point, then add your
+own crawlers and scrapers as needed.
diff --git a/config/covid19.yml b/config/covid19.yml
new file mode 100644
index 0000000..23ca9ca
--- /dev/null
+++ b/config/covid19.yml
@@ -0,0 +1,54 @@
+# Scraper for Dutch COVID-19 Wob (freedom of information) publications.
+# It starts from a search endpoint and stores the web pages and other
+# documents linked from the results.
+name: covid19
+
+# A title for display in the UI:
+description: "Covid19 NL"
+
+pipeline:
+  init:
+    # This first stage will get the ball rolling with a seed URL.
+    method: seed
+    params:
+      urls:
+        - https://do-ams3-17.hw.webhare.net/services/wobcovid19-prod-1/search/?first=0&count=10000&orderby=publicationdate
+    handle:
+      pass: fetch
+
+  fetch:
+    # Download the seed page
+    method: fetch
+    params:
+      # These rules specify which pages should be scraped or included:
+      rules:
+        and:
+          - domain: webhare.net
+          # - not:
+          #     or:
+          #       - mime_group: assets
+          #       - mime_group: images
+    handle:
+      pass: parse
+
+  parse:
+    # Parse the scraped pages to find if they contain additional links.
+    method: parse
+    params:
+      # Additional rules to determine if a scraped page should be stored or not.
+      # In this example, we keep web pages as well as PDFs, word files, etc.
+      store:
+        or:
+          - mime_group: web
+          - mime_group: archives
+          - mime_group: documents
+    handle:
+      store: store
+      # this makes it a recursive web crawler:
+      fetch: fetch
+
+  store:
+    # Store the crawled documents to a directory
+    method: directory
+    params:
+      path: /data/results
diff --git a/config/extended_web_scraper.yml b/config/extended_web_scraper.yml
new file mode 100644
index 0000000..198a4be
--- /dev/null
+++ b/config/extended_web_scraper.yml
@@ -0,0 +1,43 @@
+# Example scraper to demonstrate extending Memorious
+name: quote_scraper
+description: Scraper for quotes.toscrape.com
+# delay: 2
+pipeline:
+  init:
+    # The first stage logs in and creates an HTTP session which is used for subsequent requests.
+    method: example.quotes:login
+    params:
+      url: http://quotes.toscrape.com
+      username: fred
+      password: asdfasdf
+    handle:
+      pass: fetch
+  fetch:
+    # Download the page passed from the login stage.
+    method: fetch
+    params:
+      http_rate_limit: 60
+    handle:
+      pass: crawl
+  crawl:
+    # Crawl the HTML of the page passed in to extract specific things.
+    method: example.quotes:crawl
+    handle:
+      # If the 'fetch' rule is invoked, re-trigger the fetch stage
+      fetch: fetch
+      # If the 'cleanup' rule is invoked, delete the downloaded page from archive
+      cleanup: cleanup
+      # Otherwise, pass data on to the store stage
+      pass: store
+  store:
+    # Store structured data in a database (SQLite by default, or the one set via the MEMORIOUS_DATASTORE_URI environment variable).
+    method: example.quotes:store
+    params:
+      table: example_quotes
+  cleanup:
+    method: cleanup_archive
+aggregator:
+  method: example.quotes:export
+  params:
+    table: example_quotes
+    filename: all_quotes.json
diff --git a/config/simple_article_scraper.yml b/config/simple_article_scraper.yml
new file mode 100644
index 0000000..20f63ba
--- /dev/null
+++ b/config/simple_article_scraper.yml
@@ -0,0 +1,53 @@
+# Example scraper to demonstrate Memorious XPath narrowing
+name: occrp_entity_scraper
+description: A simple scrape of all the existing OCCRP investigations
+# Uncomment to run this scraper automatically:
+# schedule: weekly
+pipeline:
+  init:
+    # Start URL
+    method: seed
+    params:
+      urls:
+        - https://www.occrp.org/en/investigations
+    handle:
+      pass: fetch
+  fetch:
+    # Download the page passed from the seed stage.
+    method: fetch
+    params:
+      rules:
+        and:
+          - pattern: '.*investigations.*'
+          - domain: occrp.org
+          - not:
+              or:
+                - domain: vis.occrp.org
+                - domain: tech.occrp.org
+                - domain: data.occrp.org
+                - mime_group: assets
+                - mime_group: images
+                - pattern: "https://www.occrp.org/en/component/.*"
+                - pattern: "https://www.occrp.org/en/donate.*"
+                - pattern: "https://www.occrp.org/.*start=.*"
+                - pattern: "https://www.occrp.org/ru/.*"
+    handle:
+      pass: parse
+  parse:
+    method: example.article:parse
+    params:
+      schema: Article
+      store:
+        and:
+          - mime_group: web
+      properties:
+        title: .//meta[@property="og:title"]/@content
+        author: .//meta[@name="author"]/@content
+        publishedAt: .//*[@class="date"]/text()
+        description: .//meta[@property="og:description"]/@content
+    handle:
+      store: store
+      fetch: fetch
+  store:
+    # Store the crawled document as an ftm entity
+    method: aleph_emit_entity
\ No newline at end of file
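Note: the properties block above maps XPath expressions onto FollowTheMoney "Article" properties, which aleph_emit_entity then pushes to Aleph. As a rough illustration only of what such an entity looks like (it assumes the followthemoney package is available; the URL and values are placeholders, not project data):

    # Sketch: build the kind of FollowTheMoney entity the store stage emits.
    # Assumes `followthemoney` is installed; URL and values are placeholders.
    from followthemoney import model

    entity = model.make_entity("Article")
    entity.make_id("https://www.occrp.org/en/investigations/example")  # hypothetical URL
    entity.add("title", "Example investigation")
    entity.add("author", "Example Author")
    entity.add("publishedAt", "2020-01-01")
    print(entity.to_dict())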
diff --git a/config/simple_web_scraper.yml b/config/simple_web_scraper.yml
new file mode 100644
index 0000000..a1c02f5
--- /dev/null
+++ b/config/simple_web_scraper.yml
@@ -0,0 +1,60 @@
+# Scraper for the OCCRP web site.
+# The goal is not to download all HTML, but only PDFs & other documents
+# linked from the page as proof.
+name: occrp_web_site
+
+# A title for display in the UI:
+description: "Organized Crime and Corruption Reporting Project"
+
+pipeline:
+  init:
+    # This first stage will get the ball rolling with a seed URL.
+    method: seed
+    params:
+      urls:
+        - https://occrp.org
+    handle:
+      pass: fetch
+
+  fetch:
+    # Download the seed page
+    method: fetch
+    params:
+      # These rules specify which pages should be scraped or included:
+      rules:
+        and:
+          - domain: occrp.org
+          - not:
+              or:
+                - domain: vis.occrp.org
+                - domain: tech.occrp.org
+                - domain: data.occrp.org
+                - mime_group: assets
+                - mime_group: images
+                - pattern: "https://www.occrp.org/en/component/.*"
+                - pattern: "https://www.occrp.org/en/donate.*"
+                - pattern: "https://www.occrp.org/.*start=.*"
+                - pattern: "https://www.occrp.org/ru/.*"
+    handle:
+      pass: parse
+
+  parse:
+    # Parse the scraped pages to find if they contain additional links.
+    method: parse
+    params:
+      # Additional rules to determine if a scraped page should be stored or not.
+      # In this example, we're only keeping PDFs, word files, etc.
+      store:
+        or:
+          - mime_group: archives
+          - mime_group: documents
+    handle:
+      store: store
+      # this makes it a recursive web crawler:
+      fetch: fetch
+
+  store:
+    # Store the crawled documents to a directory
+    method: directory
+    params:
+      path: /data/results
diff --git a/config/simple_web_scraper_2.yml b/config/simple_web_scraper_2.yml
new file mode 100644
index 0000000..efc8fa0
--- /dev/null
+++ b/config/simple_web_scraper_2.yml
@@ -0,0 +1,42 @@
+# Example scraper to demonstrate Memorious XPath narrowing
+name: book_scraper
+description: Scraper for books.toscrape.com
+pipeline:
+  init:
+    # Start URL
+    method: seed
+    params:
+      urls:
+        - http://books.toscrape.com
+    handle:
+      pass: fetch
+  fetch:
+    # Download the page passed from the seed stage.
+    method: fetch
+    handle:
+      pass: parse
+  parse:
+    # Crawl the HTML of the page passed in to extract specific things.
+    method: parse
+    params:
+      # This only checks the <section> element for links to follow (effectively keeping only links to book pages and pagination, and skipping the sidebar which lists book categories).
+      include_paths:
+        - ".//section"
+      # This tells the parser to also extract additional metadata from the DOM, which is added to `data` and passed to the 'store' stage.
+      meta:
+        title: './/article[@class="product_page"]//h1'
+        price: './/article[@class="product_page"]//p[@class="price_color"]'
+      # It uses a regex rule to skip URLs with '/category/' in them, so it only stores the book pages and not the listings.
+      store:
+        not:
+          pattern: ".*/category/.*"
+    handle:
+      # If the 'fetch' rule is invoked, re-trigger the fetch stage
+      fetch: fetch
+      # Otherwise, pass data on to the store stage
+      store: store
+  store:
+    # Store the crawled documents to a directory
+    method: directory
+    params:
+      path: /data/results
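Note: the meta XPath expressions above are easy to sanity-check outside the crawler. A minimal sketch with lxml, assuming you have saved one book detail page locally (the file name is a placeholder):

    # Sketch: test the XPath expressions from simple_web_scraper_2.yml against
    # a saved copy of one book detail page (file name is hypothetical).
    from lxml import html

    page = html.parse("book_page.html")
    title = page.xpath('.//article[@class="product_page"]//h1')
    price = page.xpath('.//article[@class="product_page"]//p[@class="price_color"]')
    print(title[0].text_content() if title else None)
    print(price[0].text_content() if price else None)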
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..8232ea6
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,30 @@
+version: "2"
+
+services:
+  datastore:
+    image: postgres:11.4
+    volumes:
+      - "./build/datastore:/var/lib/postgresql/data"
+    environment:
+      POSTGRES_USER: datastore
+      POSTGRES_PASSWORD: datastore
+
+  redis:
+    image: redis:alpine
+    command: ["redis-server", "--appendonly", "yes"]
+    volumes:
+      - ./build/redis-data:/data
+
+  shell:
+    build: .
+    command: /bin/bash
+    links:
+      - redis
+      - datastore
+    volumes:
+      - "./build/data:/data"
+      - "./config:/crawlers/config"
+      - "./src:/crawlers/src"
+      - "./entities:/crawlers/entities"
+    tmpfs:
+      - "/tmp"
diff --git a/scripts/worker.sh b/scripts/worker.sh
new file mode 100644
index 0000000..6b2a824
--- /dev/null
+++ b/scripts/worker.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+pip3 install -q -e /crawlers
+
+# For debugging inside a container, attach a terminal and try:
+# python3 /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 memorious/cli.py --debug run book_scraper
+pip3 install debugpy -t /tmp
+/bin/bash
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d60a352
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,16 @@
+from datetime import datetime
+from setuptools import setup, find_packages
+
+setup(
+    name="jodal-sources",
+    version=datetime.utcnow().date().isoformat(),
+    classifiers=[],
+    keywords="",
+    packages=find_packages("src"),
+    package_dir={"": "src"},
+    namespace_packages=[],
+    include_package_data=True,
+    zip_safe=False,
+    install_requires=["memorious", "datafreeze", "newspaper3k"],
+    entry_points={"memorious.plugins": ["example = example:init"]},
+)
diff --git a/src/example/__init__.py b/src/example/__init__.py
new file mode 100644
index 0000000..9835dc9
--- /dev/null
+++ b/src/example/__init__.py
@@ -0,0 +1,8 @@
+import os
+from memorious.core import manager
+
+
+def init():
+    file_path = os.path.dirname(__file__)
+    config_path = os.path.join(file_path, "..", "..", "config")
+    manager.load_path(config_path)
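Note: the "memorious.plugins" entry point in setup.py is what lets Memorious find example:init, which in turn loads the YAML files from config/. A rough way to confirm the plugin is registered after "pip3 install -q -e /crawlers" (sketch; assumes setuptools' pkg_resources is available in the image):

    # Sketch: list registered memorious plugins; expects "example -> example".
    import pkg_resources

    for ep in pkg_resources.iter_entry_points("memorious.plugins"):
        print(ep.name, "->", ep.module_name)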
diff --git a/src/example/article.py b/src/example/article.py
new file mode 100644
index 0000000..ad3f1e0
--- /dev/null
+++ b/src/example/article.py
@@ -0,0 +1,49 @@
+import logging
+import memorious.operations.parse
+import hashlib
+
+from newspaper import Article
+from memorious.helpers.rule import Rule
+
+log = logging.getLogger(__name__)
+
+
+def parse_article(context: object, data: dict, article: Article) -> None:
+    with context.http.rehash(data) as result:
+        if result.html is not None:
+            properties = context.params.get("properties")
+            data["schema"] = "Article"
+            data["entity_id"] = hashlib.md5(data["url"].encode("utf-8")).hexdigest()
+            data["properties"] = {
+                "title": result.html.xpath(properties["title"])
+                if properties.get("title")
+                else getattr(article, "title", None),
+                "description": result.html.xpath(properties.get("description"))
+                if properties.get("description")
+                else getattr(article, "description", None),
+                "author": result.html.xpath(properties.get("author"))
+                if properties.get("author")
+                else getattr(article, "authors", None),
+                "publishedAt": result.html.xpath(properties.get("publishedAt"))
+                if properties.get("publishedAt")
+                else getattr(article, "publish_date", None),
+                "bodyText": result.html.xpath(properties.get("bodyText"))
+                if properties.get("bodyText")
+                else getattr(article, "text", None),
+            }
+
+
+def parse(context, data):
+    with context.http.rehash(data) as result:
+        news_article = Article(url=data["url"])
+        news_article.download()
+        news_article.parse()
+        parse_article(context, data, news_article)
+
+        if result.html is not None:
+            memorious.operations.parse.parse_for_metadata(context, data, result.html)
+            memorious.operations.parse.parse_html(context, data, result)
+
+        rules = context.params.get("match") or {"match_all": {}}
+        if Rule.get_rule(rules).apply(result):
+            context.emit(rule="store", data=data)
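Note: when a property has no XPath configured in the YAML, parse_article falls back to what newspaper3k extracted. A standalone sketch of that fallback path (the URL is a placeholder, not part of the project):

    # Sketch of the newspaper3k attributes used as fallbacks above.
    # The URL is a placeholder; any article URL will do.
    from newspaper import Article

    article = Article("https://example.com/some-article")
    article.download()
    article.parse()
    print(article.title, article.authors, article.publish_date)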
diff --git a/src/example/quotes.py b/src/example/quotes.py
new file mode 100644
index 0000000..e8b896d
--- /dev/null
+++ b/src/example/quotes.py
@@ -0,0 +1,76 @@
+from urllib.parse import urljoin
+import datafreeze
+
+
+def login(context, data):
+    # Get parameters from the stage which calls this method in the yaml file
+    base_url = context.params.get("url")
+    url = urljoin(base_url, "login")
+    username = context.params.get("username")
+    password = context.params.get("password")
+
+    # Context wraps requests, and reuses the same session.
+    # When we login here, this is persisted across future uses of
+    # context.http
+    res = context.http.get(url)
+    # Get the login form and post the credentials.
+    # Uses lxml under the hood.
+    page = res.html
+    form = page.find(".//form")
+    login_url = urljoin(base_url, form.get("action"))
+    login_data = {"username": username, "password": password}
+    # We also need to pass the hidden inputs from the form.
+    hidden_inputs = {
+        h_in.get("name"): h_in.get("value")
+        for h_in in form.xpath('./input[@type="hidden"]')
+    }
+    login_data.update(hidden_inputs)
+    context.http.post(login_url, data=login_data)
+
+    # Set data for input to the next stage, and proceed.
+    # (The next stage is 'fetch' which takes a 'url' input.)
+    data = {"url": base_url}
+    context.emit(data=data)
+
+
+def crawl(context, data):
+    # This stage comes after 'fetch' so the 'data' input contains an
+    # HTTPResponse object.
+    response = context.http.rehash(data)
+    url = response.url
+    page = response.html
+
+    # If we find a next link, recursively fetch that page by handing it back
+    # to the 'fetch' stage.
+    next_link = page.find('.//nav//li[@class="next"]/a')
+    if next_link is not None:
+        next_url = urljoin(url, next_link.get("href"))
+        context.emit(rule="fetch", data={"url": next_url})
+
+    # Parse the rest of the page to extract structured data.
+    for quote in page.findall('.//div[@class="quote"]'):
+        quote_data = {
+            "text": quote.find('.//span[@class="text"]').text_content(),
+            "author": quote.find('.//small[@class="author"]').text_content(),
+            "tags": ", ".join(
+                [tag.text_content() for tag in quote.findall('.//a[@class="tag"]')]
+            ),  # noqa
+        }
+
+        # If 'rule' is not set, it defaults to 'pass', which triggers the
+        # final 'store' stage.
+        context.emit(data=quote_data)
+    context.emit(rule="cleanup", data={"content_hash": response.content_hash})
+
+
+def store(context, data):
+    # This example uses a database to store structured data, which you can
+    # access through context.datastore.
+    table = context.datastore[context.params.get("table")]
+    # The data is passed in from context.emit of the previous 'crawl' stage.
+    table.upsert(data, ["text", "author"])
+
+
+def export(context, params):
+    table = context.datastore[params["table"]]
+    datafreeze.freeze(table, format="json", filename=params["filename"])
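Note: the example_quotes table written by store() can also be inspected outside the crawler. The upsert/datafreeze calls above suggest context.datastore is backed by the dataset library, so a rough sketch (assuming the services from docker-compose.yml are running and MEMORIOUS_DATASTORE_URI is exported as in the Dockerfile) would be:

    # Sketch: read back a few rows written by the store stage.
    # Assumes the `dataset` library and a reachable datastore.
    import os
    import dataset

    db = dataset.connect(os.environ["MEMORIOUS_DATASTORE_URI"])
    for row in db["example_quotes"].find(_limit=5):
        print(row["author"], "-", row["text"])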