Showing 14 changed files with 466 additions and 0 deletions.
@@ -0,0 +1,10 @@
build/*
data/*
*.env
*.egg-info
*.pyc
*.rdb
.vscode/*
dist/*
.DS_Store
*.sqlite3
@@ -0,0 +1,13 @@
FROM alephdata/memorious:latest

COPY setup.py /crawlers/
COPY src /crawlers/src
RUN pip3 install -q -e /crawlers
COPY config /crawlers/config

ENV MEMORIOUS_BASE_PATH=/data \
    MEMORIOUS_CONFIG_PATH=/crawlers/config \
    MEMORIOUS_DEBUG=false \
    ARCHIVE_PATH=/data/archive \
    REDIS_URL=redis://redis:6379/0 \
    MEMORIOUS_DATASTORE_URI=postgresql://datastore:datastore@datastore/datastore
@@ -0,0 +1,5 @@
# Memorious example project

This folder can be used as an example template for a Memorious deployment.
Copy it into its own git repository as a starting point, then add your
own crawlers and scrapers as needed.
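For orientation, the layout this template assumes (inferred from the files in this commit) is roughly:

    config/             crawler definitions in YAML, one file per crawler
    src/example/        custom Python stages (example.quotes, example.article)
    setup.py            installs src/ and registers the memorious plugin entry point
    Dockerfile          bakes everything into the alephdata/memorious base image
    docker-compose.yml  redis, postgres and a development shell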
@@ -0,0 +1,54 @@
# Scraper for the Dutch COVID-19 Wob disclosure index.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: covid19

# A title for display in the UI:
description: "Covid19 NL"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://do-ams3-17.hw.webhare.net/services/wobcovid19-prod-1/search/?first=0&count=10000&orderby=publicationdate
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: webhare.net
          # - not:
          #     or:
          #       - mime_group: assets
          #       - mime_group: images
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find out if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're only keeping PDFs, Word files, etc.
      store:
        or:
          - mime_group: web
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # This makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
@@ -0,0 +1,43 @@
# Example scraper to demonstrate extending Memorious
name: quote_scraper
description: Quotes to scrape
# delay: 2
pipeline:
  init:
    # The first stage logs in and creates an HTTP session which is used for subsequent requests.
    method: example.quotes:login
    params:
      url: http://quotes.toscrape.com
      username: fred
      password: asdfasdf
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the login stage.
    method: fetch
    params:
      http_rate_limit: 60
    handle:
      pass: crawl
  crawl:
    # Crawl the HTML of the page passed in to extract specific things.
    method: example.quotes:crawl
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # If the 'cleanup' rule is invoked, delete the downloaded page from the archive
      cleanup: cleanup
      # Otherwise, pass data on to the store stage
      pass: store
  store:
    # Store structured data in a database (SQLite by default, otherwise
    # whatever the MEMORIOUS_DATASTORE_URI environment variable points at).
    method: example.quotes:store
    params:
      table: example_quotes
  cleanup:
    method: cleanup_archive
  aggregator:
    method: example.quotes:export
    params:
      table: example_quotes
      filename: all_quotes.json
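The example.quotes:* stages refer to Python functions in the crawler package; the actual src/example/quotes.py is among the files not shown on this page. As a rough sketch of the convention (a Memorious stage method takes a context and a data dict; the selector and form fields here are assumptions about quotes.toscrape.com, not code from this commit), the login stage could look like:

# Hypothetical sketch of a custom login stage, not the committed file.
def login(context, data):
    # Stage parameters come from the 'params' block in the YAML above.
    url = context.params.get("url")

    # Load the login form to pick up its CSRF token.
    result = context.http.get("%s/login" % url)
    token = result.html.xpath('.//input[@name="csrf_token"]/@value')[0]

    # Submit the credentials. Memorious serializes the HTTP session between
    # stages, so cookies set here are reused by the 'fetch' stage.
    context.http.post("%s/login" % url, data={
        "csrf_token": token,
        "username": context.params.get("username"),
        "password": context.params.get("password"),
    })

    # Hand the start URL to the next stage ('pass: fetch' in the YAML).
    context.emit(data={"url": url})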
@@ -0,0 +1,53 @@
# Example scraper to demonstrate Memorious XPath narrowing
name: occrp_entity_scraper
description: A simple scrape of all the existing OCCRP investigations
# Uncomment to run this scraper automatically:
# schedule: weekly
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - https://www.occrp.org/en/investigations
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    params:
      rules:
        and:
          - pattern: '.*investigations.*'
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse
  parse:
    method: example.article:parse
    params:
      schema: Article
      store:
        and:
          - mime_group: web
      properties:
        title: .//meta[@property="og:title"]/@content
        author: .//meta[@name="author"]/@content
        publishedAt: .//*[@class="date"]/text()
        description: .//meta[@property="og:description"]/@content
    handle:
      store: store
      fetch: fetch
  store:
    # Store the crawled document as an FtM entity
    method: aleph_emit_entity
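The properties block maps entity properties to XPath expressions evaluated against each fetched page. As a self-contained illustration (using lxml directly and made-up article markup, not anything from this commit) of how those expressions resolve:

from lxml import html

# Hypothetical article markup, just to show how the XPaths above evaluate.
doc = html.fromstring("""
<html><head>
  <meta property="og:title" content="Example Investigation"/>
  <meta name="author" content="OCCRP"/>
  <meta property="og:description" content="A short summary."/>
</head><body><span class="date">04 September 2017</span></body></html>
""")

properties = {
    "title": './/meta[@property="og:title"]/@content',
    "author": './/meta[@name="author"]/@content',
    "publishedAt": './/*[@class="date"]/text()',
    "description": './/meta[@property="og:description"]/@content',
}

for name, xpath in properties.items():
    values = doc.xpath(xpath)  # always a list; empty if nothing matches
    print(name, values)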
@@ -0,0 +1,60 @@
# Scraper for the OCCRP web site.
# The goal is not to download all HTML, but only PDFs & other documents
# linked from the page as proof.
name: occrp_web_site

# A title for display in the UI:
description: "Organized Crime and Corruption Reporting Project"

pipeline:
  init:
    # This first stage will get the ball rolling with a seed URL.
    method: seed
    params:
      urls:
        - https://occrp.org
    handle:
      pass: fetch

  fetch:
    # Download the seed page
    method: fetch
    params:
      # These rules specify which pages should be scraped or included:
      rules:
        and:
          - domain: occrp.org
          - not:
              or:
                - domain: vis.occrp.org
                - domain: tech.occrp.org
                - domain: data.occrp.org
                - mime_group: assets
                - mime_group: images
                - pattern: "https://www.occrp.org/en/component/.*"
                - pattern: "https://www.occrp.org/en/donate.*"
                - pattern: "https://www.occrp.org/.*start=.*"
                - pattern: "https://www.occrp.org/ru/.*"
    handle:
      pass: parse

  parse:
    # Parse the scraped pages to find out if they contain additional links.
    method: parse
    params:
      # Additional rules to determine if a scraped page should be stored or not.
      # In this example, we're only keeping PDFs, Word files, etc.
      store:
        or:
          - mime_group: archives
          - mime_group: documents
    handle:
      store: store
      # This makes it a recursive web crawler:
      fetch: fetch

  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
@@ -0,0 +1,42 @@
# Example scraper to demonstrate Memorious XPath narrowing
name: book_scraper
description: Books to scrape
pipeline:
  init:
    # Start URL
    method: seed
    params:
      urls:
        - http://books.toscrape.com
    handle:
      pass: fetch
  fetch:
    # Download the page passed from the seed stage.
    method: fetch
    handle:
      pass: parse
  parse:
    # Crawl the HTML of the page passed in to extract specific things.
    method: parse
    params:
      # Only check the <section> element for links to follow. This keeps
      # links to book pages and pagination, and skips the sidebar which
      # lists book categories.
      include_paths:
        - ".//section"
      # Tell the parser to also extract additional metadata from the DOM,
      # which is added to `data` and passed to the 'store' stage.
      meta:
        title: './/article[@class="product_page"]//h1'
        price: './/article[@class="product_page"]//p[@class="price_color"]'
      # A regex rule skips URLs with '/category/' in them, so only the
      # book pages are stored, not the listings.
      store:
        not:
          pattern: ".*/category/.*"
    handle:
      # If the 'fetch' rule is invoked, re-trigger the fetch stage
      fetch: fetch
      # Otherwise, pass data on to the store stage
      store: store
  store:
    # Store the crawled documents to a directory
    method: directory
    params:
      path: /data/results
@@ -0,0 +1,30 @@
version: "2"

services:
  datastore:
    image: postgres:11.4
    volumes:
      - "./build/datastore:/var/lib/postgresql/data"
    environment:
      POSTGRES_USER: datastore
      POSTGRES_PASSWORD: datastore

  redis:
    image: redis:alpine
    command: ["redis-server", "--appendonly", "yes"]
    volumes:
      - ./build/redis-data:/data

  shell:
    build: .
    command: /bin/bash
    links:
      - redis
      - datastore
    volumes:
      - "./build/data:/data"
      - "./config:/crawlers/config"
      - "./src:/crawlers/src"
      - "./entities:/crawlers/entities"
    tmpfs:
      - "/tmp"
@@ -0,0 +1,7 @@
#!/bin/bash
pip3 install -q -e /crawlers

# For debugging inside a container, attach a terminal and try:
# python3 /tmp/debugpy --wait-for-client --listen 0.0.0.0:5678 memorious/cli.py --debug run book_scraper
pip3 install debugpy -t /tmp
/bin/bash
@@ -0,0 +1,16 @@
from datetime import datetime
from setuptools import setup, find_packages

setup(
    name="jodal-sources",
    version=datetime.utcnow().date().isoformat(),
    classifiers=[],
    keywords="",
    packages=find_packages("src"),
    package_dir={"": "src"},
    namespace_packages=[],
    include_package_data=True,
    zip_safe=False,
    install_requires=["memorious", "datafreeze", "newspaper3k"],
    entry_points={"memorious.plugins": ["example = example:init"]},
)
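The memorious.plugins entry point is what lets Memorious discover this package: on startup it invokes each registered hook, and the init() function in the next file loads every crawler YAML from the config directory. (The jodal-sources name and date-stamped version are this deployment's choices, not requirements.)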
@@ -0,0 +1,8 @@
import os
from memorious.core import manager


def init():
    # Resolve the config/ directory relative to this file (src/example/)
    # and register every crawler YAML found there with Memorious.
    file_path = os.path.dirname(__file__)
    config_path = os.path.join(file_path, "..", "..", "config")
    manager.load_path(config_path)