Skip to content

datasets

datasets #114

Workflow file for this run

# Workflow that builds the investigraph datasets and refreshes their catalog.
name: datasets

# Triggers: every push, a nightly schedule, and manual runs.
on:
  push: {}
  schedule:
    # Minute 0, hour 0, every day (GitHub evaluates schedule crons in UTC).
    - cron: '0 0 * * *'
  workflow_dispatch: {}
jobs:
  # Runs `investigraph run` once per dataset listed in the matrix, inside the
  # investigraph-eu container, with postgres and redis service containers.
  data:
    runs-on: ubuntu-latest
    container: ghcr.io/investigativedata/investigraph-eu:main
    services:
      # Database targeted by PREFECT_API_DATABASE_CONNECTION_URL below; the
      # credentials here must match the ones embedded in that URL.
      postgres:
        image: postgres:alpine
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - "5432:5432"
        env:
          POSTGRES_USER: investigraph
          POSTGRES_DB: investigraph
          POSTGRES_PASSWORD: investigraph
      # Redis instance targeted by REDIS_URL below.
      redis:
        image: redis:alpine
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
        ports:
          - "6379:6379"
    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
      # Environment values reach the process as strings; quoted to make that
      # explicit.
      DEBUG: "false"
      # The host names `redis` and `postgres` are the service labels above.
      REDIS_URL: redis://redis:6379/0
      PREFECT_API_DATABASE_CONNECTION_URL: postgresql+asyncpg://investigraph:investigraph@postgres/investigraph
      PREFECT_TASK_RUNNER: dask
    strategy:
      matrix:
        dataset:
          # - eu_authorities
          - ec_meetings
          - eu_transparency_register
    steps:
      - name: parse and write the dataset
        run: "investigraph run -c /datasets/${{ matrix.dataset }}/config.yml"

  # Rebuilds the catalog and pings the notification hook; starts only after
  # every `data` matrix job has succeeded (`needs: data`).
  catalog:
    needs: data
    runs-on: ubuntu-latest
    container: ghcr.io/investigativedata/investigraph-eu:main
    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
    steps:
      - name: Update the catalog
        run: "investigraph build-catalog /datasets/catalog.yml -o s3://data.ftm.store/investigraph.eu.json"
      - name: Notify success
        # Pass the hook URL through the environment and quote it, so shell
        # metacharacters in the URL (`&`, `?`, `;`, spaces) are never
        # interpreted by the shell.
        env:
          NOTIFY_HOOK: ${{ secrets.NOTIFY_HOOK }}
        run: curl "$NOTIFY_HOOK"