Commit b100b0b: Initial commit.

breyten committed Mar 14, 2022 (1 parent: 6e1523c)

Showing 14 changed files with 466 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
build/*
data/*
*.env
*.egg-info
*.pyc
*.rdb
.vscode/*
dist/*
.DS_Store
*.sqlite3
13 changes: 13 additions & 0 deletions Dockerfile
@@ -0,0 +1,13 @@
FROM alephdata/memorious:latest

COPY setup.py /crawlers/
COPY src /crawlers/src
RUN pip3 install -q -e /crawlers
COPY config /crawlers/config

ENV MEMORIOUS_BASE_PATH=/data \
    MEMORIOUS_CONFIG_PATH=/crawlers/config \
    MEMORIOUS_DEBUG=false \
    ARCHIVE_PATH=/data/archive \
    REDIS_URL=redis://redis:6379/0 \
    MEMORIOUS_DATASTORE_URI=postgresql://datastore:datastore@datastore/datastore
5 changes: 5 additions & 0 deletions README.md
@@ -0,0 +1,5 @@
# Memorious example project

This folder can be used as an example template for a memorious deployment.
Copy it into its own git repository as a starting point, then add your
own crawlers and scrapers as needed.
54 changes: 54 additions & 0 deletions config/covid19.yml
@@ -0,0 +1,54 @@
# Scraper for the Dutch COVID-19 (WOB) publications service.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: covid19

# A title for display in the UI:
description: "Covid19 NL"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://do-ams3-17.hw.webhare.net/services/wobcovid19-prod-1/search/?first=0&count=10000&orderby=publicationdate
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: webhare.net
          # - not:
          #     or:
          #       - mime_group: assets
          #       - mime_group: images
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're keeping web pages, PDFs, word files, etc.
      store:
        or:
          - mime_group: web
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # this makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
43 changes: 43 additions & 0 deletions config/extended_web_scraper.yml
@@ -0,0 +1,43 @@
# Example scraper to demonstrate extending Memorious
name: quote_scraper
description: Quotes to Scrape
# delay: 2
pipeline:
  init:
    # The first stage logs in and creates an HTTP session which is used for subsequent requests.
    method: example.quotes:login
    params:
      url: http://quotes.toscrape.com
      username: fred
      password: asdfasdf
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the login stage.
    method: fetch
    params:
      http_rate_limit: 60
    handle:
      pass: crawl
  crawl:
    # Crawl the HTML of the page passed in to extract specific things.
    method: example.quotes:crawl
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # If the 'cleanup' rule is invoked, delete the downloaded page from the archive
      cleanup: cleanup
      # Otherwise, pass data on to the store stage
      pass: store
  store:
    # Use a database to store structured data (SQLite by default, or whatever the
    # MEMORIOUS_DATASTORE_URI environment variable points at).
    method: example.quotes:store
    params:
      table: example_quotes
  cleanup:
    method: cleanup_archive
  aggregator:
    method: example.quotes:export
    params:
      table: example_quotes
      filename: all_quotes.json
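
The example.quotes module referenced above (the login, crawl, store and export
methods) is one of the files not shown in this diff. As a rough sketch of what
the login and crawl stages could look like, assuming the usual Memorious
custom-method signature (context, data) plus context.http, context.emit and
lxml element lookups, and with purely illustrative selectors for
quotes.toscrape.com:

# src/example/quotes.py (hypothetical sketch, not part of this commit)
from urllib.parse import urljoin


def login(context, data):
    # Stage parameters come straight from extended_web_scraper.yml.
    url = context.params.get("url")
    username = context.params.get("username")
    password = context.params.get("password")

    # quotes.toscrape.com accepts any credentials; the POST just sets a session
    # cookie on context.http, which the later fetch stages re-use.
    context.http.post(urljoin(url, "/login"), data={
        "username": username,
        "password": password,
    })

    # Hand the start URL to the next stage ('pass: fetch' in the config).
    context.emit(data={"url": url})


def crawl(context, data):
    # Re-hydrate the HTTP response serialized by the built-in fetch stage.
    response = context.http.rehash(data)

    for quote in response.html.findall('.//div[@class="quote"]'):
        # 'pass: store' in the config routes these on to example.quotes:store.
        context.emit(rule="pass", data={
            "quote": quote.findtext('.//span[@class="text"]'),
            "author": quote.findtext('.//small[@class="author"]'),
            "url": response.url,
        })

    # Follow pagination by re-triggering the fetch stage ('fetch: fetch').
    next_link = response.html.find('.//li[@class="next"]/a')
    if next_link is not None:
        context.emit(rule="fetch", data={
            "url": urljoin(response.url, next_link.get("href")),
        })

The store and export stages would similarly read context.params.get("table")
and write to or dump from the datastore; the cleanup emit for already-archived
pages is likewise omitted from this sketch.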
53 changes: 53 additions & 0 deletions config/simple_article_scraper.yml
@@ -0,0 +1,53 @@
# Example scraper to demonstrate extracting structured (FtM) entities with a custom parse method
name: occrp_entity_scraper
description: A simple scrape of all the existing OCCRP investigations
# Uncomment to run this scraper automatically:
# schedule: weekly
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - https://www.occrp.org/en/investigations
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    params:
      rules:
        and:
          - pattern: '.*investigations.*'
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse
  parse:
    method: example.article:parse
    params:
      schema: Article
      store:
        and:
          - mime_group: web
      properties:
        title: .//meta[@property="og:title"]/@content
        author: .//meta[@name="author"]/@content
        publishedAt: .//*[@class="date"]/text()
        description: .//meta[@property="og:description"]/@content
    handle:
      store: store
      fetch: fetch
  store:
    # Store the crawled document as an ftm entity
    method: aleph_emit_entity
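
The example.article:parse method is also not included in this diff. A minimal
sketch of what it might do, assuming it evaluates the configured XPath
properties with lxml and that aleph_emit_entity picks up 'schema' and
'properties' keys from the emitted data (both assumptions about the missing
module, not a definitive implementation):

# src/example/article.py (hypothetical sketch, not part of this commit)
from urllib.parse import urljoin


def parse(context, data):
    # Re-hydrate the HTTP response serialized by the fetch stage.
    response = context.http.rehash(data)
    if response.html is None:
        return

    # Evaluate each configured XPath, keeping the first match, so the store
    # stage receives entity-shaped data.
    properties = {}
    for name, xpath in context.params.get("properties", {}).items():
        values = response.html.xpath(xpath)
        if values:
            properties[name] = str(values[0]).strip()

    if properties.get("title"):
        context.emit(rule="store", data={
            **data,
            "schema": context.params.get("schema", "Article"),
            "properties": properties,
            "url": response.url,
        })

    # Keep crawling: hand outgoing links back to the fetch stage, whose rules
    # filter what actually gets followed.
    for href in response.html.xpath(".//a/@href"):
        context.emit(rule="fetch", data={"url": urljoin(response.url, href)})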
60 changes: 60 additions & 0 deletions config/simple_web_scraper.yml
@@ -0,0 +1,60 @@
# Scraper for the OCCRP web site.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: occrp_web_site

# A title for display in the UI:
description: "Organized Crime and Corruption Reporting Project"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://occrp.org
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're only keeping PDFs, word files, etc.
      store:
        or:
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # this makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
42 changes: 42 additions & 0 deletions config/simple_web_scraper_2.yml
@@ -0,0 +1,42 @@
# Example scraper to demonstrate Memorious XPath narrowing
name: book_scraper
description: Books to Scrape
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - http://books.toscrape.com
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    handle:
      pass: parse
  parse:
    # Crawl the HTML of the page passed in to extract specific things.
    method: parse
    params:
      # This only checks the <section> element for links to follow (effectively
      # keeping only links to book pages and pagination, and skipping the sidebar
      # which lists book categories).
      include_paths:
        - ".//section"
      # This tells the parser to also extract additional metadata from the DOM,
      # which is added to `data` and passed to the 'store' stage.
      meta:
        title: './/article[@class="product_page"]//h1'
        price: './/article[@class="product_page"]//p[@class="price_color"]'
      # A regex rule skips URLs with '/category/' in them, so only the book pages
      # are stored and not the listings.
      store:
        not:
          pattern: ".*/category/.*"
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # Otherwise, pass data on to the store stage
      store: store
  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
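
To sanity-check the meta XPaths above outside of Memorious, a standalone lxml
snippet can be pointed at a single book page; the catalogue URL is just an
example page, and text_content() only approximates how the built-in parser
serializes the matched elements:

# Quick manual check of the 'meta' XPaths, independent of Memorious.
import requests
from lxml import html

url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
doc = html.fromstring(requests.get(url).content)

for name, xpath in {
    "title": './/article[@class="product_page"]//h1',
    "price": './/article[@class="product_page"]//p[@class="price_color"]',
}.items():
    elements = doc.xpath(xpath)
    print(name, elements[0].text_content().strip() if elements else None)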
30 changes: 30 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,30 @@
version: "2"

services:
datastore:
image: postgres:11.4
volumes:
- "./build/datastore:/var/lib/postgresql/data"
environment:
POSTGRES_USER: datastore
POSTGRES_PASSWORD: datastore

redis:
image: redis:alpine
command: ["redis-server", "--appendonly", "yes"]
volumes:
- ./build/redis-data:/data

shell:
build: .
command: /bin/bash
links:
- redis
- datastore
volumes:
- "./build/data:/data"
- "./config:/crawlers/config"
- "./src:/crawlers/src"
- "./entities:/crawlers/entities"
tmpfs:
- "/tmp"
7 changes: 7 additions & 0 deletions scripts/worker.sh
@@ -0,0 +1,7 @@
#!/bin/bash
pip3 install -q -e /crawlers

# For debugging inside a container, attach a terminal and try:
# python3 /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 memorious/cli.py --debug run book_scraper
pip3 install debugpy -t /tmp
/bin/bash
16 changes: 16 additions & 0 deletions setup.py
@@ -0,0 +1,16 @@
from datetime import datetime
from setuptools import setup, find_packages

setup(
    name="jodal-sources",
    version=datetime.utcnow().date().isoformat(),
    classifiers=[],
    keywords="",
    packages=find_packages("src"),
    package_dir={"": "src"},
    namespace_packages=[],
    include_package_data=True,
    zip_safe=False,
    install_requires=["memorious", "datafreeze", "newspaper3k"],
    entry_points={"memorious.plugins": ["example = example:init"]},
)
8 changes: 8 additions & 0 deletions src/example/__init__.py
@@ -0,0 +1,8 @@
import os
from memorious.core import manager


def init():
    # Called by memorious via the 'memorious.plugins' entry point declared in
    # setup.py; it registers this project's YAML crawler configs.
    file_path = os.path.dirname(__file__)
    config_path = os.path.join(file_path, "..", "..", "config")
    manager.load_path(config_path)
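
For context, the entry_points declaration in setup.py
("memorious.plugins": "example = example:init") is what causes init() to run:
memorious looks up that entry-point group at startup and calls every registered
hook. A rough equivalent using the standard importlib.metadata API
(Python 3.10+), as an approximation rather than memorious's actual loading code:

# Sketch of how a 'memorious.plugins' hook such as example:init is discovered.
from importlib.metadata import entry_points

for ep in entry_points(group="memorious.plugins"):
    init_func = ep.load()  # imports src/example/__init__.py, returns init
    init_func()            # which then calls manager.load_path(.../config)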