Commit b100b0b: Initial commit.

breyten committed Mar 14, 2022 (1 parent: 6e1523c)

Showing 14 changed files with 466 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
build/*
data/*
*.env
*.egg-info
*.pyc
*.rdb
.vscode/*
dist/*
.DS_Store
*.sqlite3
13 changes: 13 additions & 0 deletions Dockerfile
@@ -0,0 +1,13 @@
FROM alephdata/memorious:latest

COPY setup.py /crawlers/
COPY src /crawlers/src
RUN pip3 install -q -e /crawlers
COPY config /crawlers/config

ENV MEMORIOUS_BASE_PATH=/data \
    MEMORIOUS_CONFIG_PATH=/crawlers/config \
    MEMORIOUS_DEBUG=false \
    ARCHIVE_PATH=/data/archive \
    REDIS_URL=redis://redis:6379/0 \
    MEMORIOUS_DATASTORE_URI=postgresql://datastore:datastore@datastore/datastore
5 changes: 5 additions & 0 deletions README.md
@@ -0,0 +1,5 @@
# Memorious example project

This folder can be used as an example template for a memorious deployment.
Copy it into its own git repository as a starting point, then add your
own crawlers and scrapers as needed.
54 changes: 54 additions & 0 deletions config/covid19.yml
@@ -0,0 +1,54 @@
# Scraper for the Dutch COVID-19 (WOB) publications service.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: covid19

# A title for display in the UI:
description: "Covid19 NL"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://do-ams3-17.hw.webhare.net/services/wobcovid19-prod-1/search/?first=0&count=10000&orderby=publicationdate
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: webhare.net
          # - not:
          #     or:
          #       - mime_group: assets
          #       - mime_group: images
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're keeping web pages, PDFs, word files, etc.
      store:
        or:
          - mime_group: web
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # this makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
43 changes: 43 additions & 0 deletions config/extended_web_scraper.yml
@@ -0,0 +1,43 @@
# Example scraper to demonstrate extending Memorious
name: quote_scraper
description: Quotes to Scrape
# delay: 2
pipeline:
  init:
    # The first stage logs in and creates an HTTP session which is used for subsequent requests.
    method: example.quotes:login
    params:
      url: http://quotes.toscrape.com
      username: fred
      password: asdfasdf
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the login stage.
    method: fetch
    params:
      http_rate_limit: 60
    handle:
      pass: crawl
  crawl:
    # Crawl the HTML of the page passed in to extract specific things.
    method: example.quotes:crawl
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # If the 'cleanup' rule is invoked, delete the downloaded page from the archive
      cleanup: cleanup
      # Otherwise, pass data on to the store stage
      pass: store
  store:
    # Use a database to store structured data (SQLite by default, or whatever the
    # MEMORIOUS_DATASTORE_URI environment variable points at).
    method: example.quotes:store
    params:
      table: example_quotes
  cleanup:
    method: cleanup_archive
  aggregator:
    method: example.quotes:export
    params:
      table: example_quotes
      filename: all_quotes.json
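
The example.quotes module referenced above (the login, crawl, store and export
methods) is one of the files not shown in this diff. As a rough sketch of what
the login and crawl stages could look like, assuming the usual Memorious
custom-method signature (context, data) plus context.http, context.emit and
lxml element lookups, and with purely illustrative selectors for
quotes.toscrape.com:

# src/example/quotes.py (hypothetical sketch, not part of this commit)
from urllib.parse import urljoin


def login(context, data):
    # Stage parameters come straight from extended_web_scraper.yml.
    url = context.params.get("url")
    username = context.params.get("username")
    password = context.params.get("password")

    # quotes.toscrape.com accepts any credentials; the POST just sets a session
    # cookie on context.http, which the later fetch stages re-use.
    context.http.post(urljoin(url, "/login"), data={
        "username": username,
        "password": password,
    })

    # Hand the start URL to the next stage ('pass: fetch' in the config).
    context.emit(data={"url": url})


def crawl(context, data):
    # Re-hydrate the HTTP response serialized by the built-in fetch stage.
    response = context.http.rehash(data)

    for quote in response.html.findall('.//div[@class="quote"]'):
        # 'pass: store' in the config routes these on to example.quotes:store.
        context.emit(rule="pass", data={
            "quote": quote.findtext('.//span[@class="text"]'),
            "author": quote.findtext('.//small[@class="author"]'),
            "url": response.url,
        })

    # Follow pagination by re-triggering the fetch stage ('fetch: fetch').
    next_link = response.html.find('.//li[@class="next"]/a')
    if next_link is not None:
        context.emit(rule="fetch", data={
            "url": urljoin(response.url, next_link.get("href")),
        })

The store and export stages would similarly read context.params.get("table")
and write to or dump from the datastore; the cleanup emit for already-archived
pages is likewise omitted from this sketch.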
53 changes: 53 additions & 0 deletions config/simple_article_scraper.yml
@@ -0,0 +1,53 @@
# Example scraper to demonstrate extracting structured (FtM) entities with a custom parse method
name: occrp_entity_scraper
description: A simple scrape of all the existing OCCRP investigations
# Uncomment to run this scraper automatically:
# schedule: weekly
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - https://www.occrp.org/en/investigations
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    params:
      rules:
        and:
          - pattern: '.*investigations.*'
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse
  parse:
    method: example.article:parse
    params:
      schema: Article
      store:
        and:
          - mime_group: web
      properties:
        title: .//meta[@property="og:title"]/@content
        author: .//meta[@name="author"]/@content
        publishedAt: .//*[@class="date"]/text()
        description: .//meta[@property="og:description"]/@content
    handle:
      store: store
      fetch: fetch
  store:
    # Store the crawled document as an ftm entity
    method: aleph_emit_entity
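
The example.article:parse method is also not included in this diff. A minimal
sketch of what it might do, assuming it evaluates the configured XPath
properties with lxml and that aleph_emit_entity picks up 'schema' and
'properties' keys from the emitted data (both assumptions about the missing
module, not a definitive implementation):

# src/example/article.py (hypothetical sketch, not part of this commit)
from urllib.parse import urljoin


def parse(context, data):
    # Re-hydrate the HTTP response serialized by the fetch stage.
    response = context.http.rehash(data)
    if response.html is None:
        return

    # Evaluate each configured XPath, keeping the first match, so the store
    # stage receives entity-shaped data.
    properties = {}
    for name, xpath in context.params.get("properties", {}).items():
        values = response.html.xpath(xpath)
        if values:
            properties[name] = str(values[0]).strip()

    if properties.get("title"):
        context.emit(rule="store", data={
            **data,
            "schema": context.params.get("schema", "Article"),
            "properties": properties,
            "url": response.url,
        })

    # Keep crawling: hand outgoing links back to the fetch stage, whose rules
    # filter what actually gets followed.
    for href in response.html.xpath(".//a/@href"):
        context.emit(rule="fetch", data={"url": urljoin(response.url, href)})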
60 changes: 60 additions & 0 deletions config/simple_web_scraper.yml
@@ -0,0 +1,60 @@
# Scraper for the OCCRP web site.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: occrp_web_site

# A title for display in the UI:
description: "Organized Crime and Corruption Reporting Project"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://occrp.org
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're only keeping PDFs, word files, etc.
      store:
        or:
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # this makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
42 changes: 42 additions & 0 deletions config/simple_web_scraper_2.yml
@@ -0,0 +1,42 @@
# Example scraper to demonstrate Memorious XPath narrowing
name: book_scraper
description: Books to Scrape
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - http://books.toscrape.com
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    handle:
      pass: parse
  parse:
    # Crawl the HTML of the page passed in to extract specific things.
    method: parse
    params:
      # This only checks the <section> element for links to follow (effectively
      # keeping only links to book pages and pagination, and skipping the sidebar
      # which lists book categories).
      include_paths:
        - ".//section"
      # This tells the parser to also extract additional metadata from the DOM,
      # which is added to `data` and passed to the 'store' stage.
      meta:
        title: './/article[@class="product_page"]//h1'
        price: './/article[@class="product_page"]//p[@class="price_color"]'
      # A regex rule skips URLs with '/category/' in them, so only the book pages
      # are stored and not the listings.
      store:
        not:
          pattern: ".*/category/.*"
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # Otherwise, pass data on to the store stage
      store: store
  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
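
To sanity-check the meta XPaths above outside of Memorious, a standalone lxml
snippet can be pointed at a single book page; the catalogue URL is just an
example page, and text_content() only approximates how the built-in parser
serializes the matched elements:

# Quick manual check of the 'meta' XPaths, independent of Memorious.
import requests
from lxml import html

url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
doc = html.fromstring(requests.get(url).content)

for name, xpath in {
    "title": './/article[@class="product_page"]//h1',
    "price": './/article[@class="product_page"]//p[@class="price_color"]',
}.items():
    elements = doc.xpath(xpath)
    print(name, elements[0].text_content().strip() if elements else None)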
30 changes: 30 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,30 @@
version: "2"

services:
datastore:
image: postgres:11.4
volumes:
- "./build/datastore:/var/lib/postgresql/data"
environment:
POSTGRES_USER: datastore
POSTGRES_PASSWORD: datastore

redis:
image: redis:alpine
command: ["redis-server", "--appendonly", "yes"]
volumes:
- ./build/redis-data:/data

shell:
build: .
command: /bin/bash
links:
- redis
- datastore
volumes:
- "./build/data:/data"
- "./config:/crawlers/config"
- "./src:/crawlers/src"
- "./entities:/crawlers/entities"
tmpfs:
- "/tmp"
7 changes: 7 additions & 0 deletions scripts/worker.sh
@@ -0,0 +1,7 @@
#!/bin/bash
pip3 install -q -e /crawlers

# For debugging inside a container, attach a terminal and try:
# python3 /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 memorious/cli.py --debug run book_scraper
pip3 install debugpy -t /tmp
/bin/bash
16 changes: 16 additions & 0 deletions setup.py
@@ -0,0 +1,16 @@
from datetime import datetime
from setuptools import setup, find_packages

setup(
    name="jodal-sources",
    version=datetime.utcnow().date().isoformat(),
    classifiers=[],
    keywords="",
    packages=find_packages("src"),
    package_dir={"": "src"},
    namespace_packages=[],
    include_package_data=True,
    zip_safe=False,
    install_requires=["memorious", "datafreeze", "newspaper3k"],
    entry_points={"memorious.plugins": ["example = example:init"]},
)
8 changes: 8 additions & 0 deletions src/example/__init__.py
@@ -0,0 +1,8 @@
import os
from memorious.core import manager


def init():
    # Called by memorious via the 'memorious.plugins' entry point declared in
    # setup.py; it registers this project's YAML crawler configs.
    file_path = os.path.dirname(__file__)
    config_path = os.path.join(file_path, "..", "..", "config")
    manager.load_path(config_path)
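
For context, the entry_points declaration in setup.py
("memorious.plugins": "example = example:init") is what causes init() to run:
memorious looks up that entry-point group at startup and calls every registered
hook. A rough equivalent using the standard importlib.metadata API
(Python 3.10+), as an approximation rather than memorious's actual loading code:

# Sketch of how a 'memorious.plugins' hook such as example:init is discovered.
from importlib.metadata import entry_points

for ep in entry_points(group="memorious.plugins"):
    init_func = ep.load()  # imports src/example/__init__.py, returns init
    init_func()            # which then calls manager.load_path(.../config)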