diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..659d6e5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+
+FROM python:3.11.5
+
+WORKDIR /app
+
+COPY requirements.txt requirements.txt
+RUN pip install -r requirements.txt
+
+COPY app app
+
+CMD [ "python", "-m", "streamlit", "run", "app/main.py" ]
diff --git a/README.md b/README.md
index faa0cff..986a2b9 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,27 @@
-# Hack4Good - OECD
+# Hack4Good - NLP for policy trend analysis (OECD)
[](pyproject.toml)
[](.pre-commit-config.yaml)
-* https://hackmd.io/g5AYgepnQrqrk4V26DrQMg
+This project was created for the [Hack4Good 2023](https://www.analytics-club.org/hack4good) hackathon in collaboration with [OECD](https://www.oecd.org/switzerland/).
-* https://docs.google.com/spreadsheets/d/1pKb_1Je4hD2X8IfYrFXYqBhWfPg5lgPY/edit?usp=sharing&ouid=110500414719598262605&rtpof=true&sd=true
+## GUI Quickstart
-## Getting started
+Environment variables need to be set in order to run the code.
+Create a `.env` file in the root of the repo (you can use `cp .env.default .env`) and set the following variables:
-### Conda Environment
-To run the code in this repo create a [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) environment with the required dependencies:
+| Environment Variable | Description |
+| --- | --- |
+| `ADOBE_CLIENT_ID` | Create an Adobe Developer account and select "Get credentials" [here](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/) |
+| `ADOBE_CLIENT_SECRET` | Copy it from the same "Get credentials" page [here](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/) as `ADOBE_CLIENT_ID` |
+| `OPENAI_API_KEY` | Your [OpenAI API key](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key) |
+
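+For example, a minimal `.env` could look like this (the values below are placeholders, not real credentials):
+
+```
+ADOBE_CLIENT_ID=<your-adobe-client-id>
+ADOBE_CLIENT_SECRET=<your-adobe-client-secret>
+OPENAI_API_KEY=<your-openai-api-key>
+```
+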
+After setting the environment variables, you can run the code in one of two ways:
+
+### Conda Environment
+
+1. Create a [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) environment with the required dependencies:
To **create** a conda environment after cloning the repo:
```
@@ -22,7 +33,36 @@ conda activate hack4good
conda deactivate
```
+(Optional) To **update** the conda environment after pulling latest changes:
+```
+conda activate hack4good
+conda env update -f environment.yml --prune
+```
+
+(Optional) To **remove** the conda environment:
+```
+conda deactivate
+conda env remove -n hack4good
+```
+
+2. Run the Streamlit app:
+```
+python -m streamlit run app/main.py
+```
+
+### Docker
+
+1. Pull (or build) the Docker image
+
+To **pull** the latest Docker image:
+```
+docker pull ghc
+```
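+
+Alternatively, to **build** the image locally from the included `Dockerfile` and run it (the `hack4good` image name and the port mapping, Streamlit's default 8501, are only suggestions):
+```
+docker build -t hack4good .
+docker run --env-file .env -p 8501:8501 hack4good
+```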
+
+
## Contributing
1. Install [pre-commit](https://pre-commit.com/#installation).
-2. `pre-commit install`
-3. Add changes, commit and pull request to `main` branch.
+2. Run `pre-commit install` to set up the repo's pre-commit hooks in your local clone.
+3. Add your changes, commit, and create a pull request targeting the `main` branch.
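+4. If you change the Python dependencies, edit `requirements.in` and regenerate the pinned `requirements.txt`. The file header shows it was produced with [pip-compile](https://github.com/jazzband/pip-tools), so a workflow along these lines should work:
+```
+pip install pip-tools
+pip-compile --resolver=backtracking requirements.in
+```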
diff --git a/environment.yml b/environment.yml
index 28efbaf..aafc347 100644
--- a/environment.yml
+++ b/environment.yml
@@ -5,13 +5,4 @@ dependencies:
- pip=23.2.1
- python=3.11.5
- pip:
- - pdfplumber
- - pdfminer.six
- - tqdm
- - torch
- - nougat-ocr
- - streamlit
- - langchain
- - python-dotenv
- - openai
- - python-multipart
+ - -r requirements.txt
diff --git a/requirements.in b/requirements.in
new file mode 100644
index 0000000..3c941d2
--- /dev/null
+++ b/requirements.in
@@ -0,0 +1,12 @@
+pdfplumber
+pdfminer.six
+tqdm
+torch
+nougat-ocr
+streamlit
+langchain
+python-dotenv
+openai
+python-multipart
+pdfservices-sdk
+requests
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..75c0fd8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,469 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+# pip-compile --resolver=backtracking requirements.in
+#
+aiohttp==3.9.1
+ # via
+ # datasets
+ # fsspec
+ # langchain
+aiosignal==1.3.1
+ # via aiohttp
+albumentations==1.3.1
+ # via nougat-ocr
+altair==5.2.0
+ # via streamlit
+annotated-types==0.6.0
+ # via pydantic
+anyio==3.7.1
+ # via
+ # httpx
+ # langchain
+ # openai
+attrs==23.2.0
+ # via
+ # aiohttp
+ # jsonschema
+ # referencing
+blinker==1.7.0
+ # via streamlit
+build==0.9.0
+ # via pdfservices-sdk
+cachetools==5.3.2
+ # via streamlit
+certifi==2022.12.7
+ # via
+ # httpcore
+ # httpx
+ # pdfservices-sdk
+ # requests
+cffi==1.15.1
+ # via
+ # cryptography
+ # pdfservices-sdk
+chardet==5.2.0
+ # via pdfminer-six
+charset-normalizer==2.0.12
+ # via
+ # pdfminer-six
+ # requests
+click==8.1.7
+ # via
+ # nltk
+ # streamlit
+cryptography==3.4.6
+ # via
+ # pdfminer-six
+ # pdfservices-sdk
+dataclasses-json==0.6.3
+ # via langchain
+datasets[vision]==2.16.1
+ # via nougat-ocr
+dill==0.3.7
+ # via
+ # datasets
+ # multiprocess
+distro==1.9.0
+ # via openai
+filelock==3.13.1
+ # via
+ # datasets
+ # huggingface-hub
+ # torch
+ # transformers
+frozenlist==1.4.1
+ # via
+ # aiohttp
+ # aiosignal
+fsspec[http]==2023.10.0
+ # via
+ # datasets
+ # huggingface-hub
+ # lightning
+ # pytorch-lightning
+ # torch
+gitdb==4.0.11
+ # via gitpython
+gitpython==3.1.40
+ # via streamlit
+h11==0.14.0
+ # via httpcore
+httpcore==1.0.2
+ # via httpx
+httpx==0.26.0
+ # via openai
+huggingface-hub==0.20.2
+ # via
+ # datasets
+ # tokenizers
+ # transformers
+idna==3.6
+ # via
+ # anyio
+ # httpx
+ # requests
+ # yarl
+imageio==2.33.1
+ # via scikit-image
+importlib-metadata==6.11.0
+ # via streamlit
+jinja2==3.1.2
+ # via
+ # altair
+ # pydeck
+ # torch
+joblib==1.3.2
+ # via
+ # nltk
+ # scikit-learn
+jsonpatch==1.33
+ # via
+ # langchain
+ # langchain-core
+jsonpointer==2.4
+ # via jsonpatch
+jsonschema==4.20.0
+ # via altair
+jsonschema-specifications==2023.12.1
+ # via jsonschema
+langchain==0.0.347
+ # via -r requirements.in
+langchain-core==0.0.11
+ # via langchain
+langsmith==0.0.79
+ # via
+ # langchain
+ # langchain-core
+lazy-loader==0.3
+ # via scikit-image
+levenshtein==0.23.0
+ # via python-levenshtein
+lightning==2.1.3
+ # via nougat-ocr
+lightning-utilities==0.10.0
+ # via
+ # lightning
+ # pytorch-lightning
+ # torchmetrics
+markdown-it-py==3.0.0
+ # via rich
+markupsafe==2.1.3
+ # via jinja2
+marshmallow==3.20.2
+ # via dataclasses-json
+mdurl==0.1.2
+ # via markdown-it-py
+mpmath==1.3.0
+ # via sympy
+multidict==6.0.4
+ # via
+ # aiohttp
+ # yarl
+multipart==0.2.4
+ # via pdfservices-sdk
+multiprocess==0.70.15
+ # via datasets
+munch==4.0.0
+ # via sconf
+mypy-extensions==1.0.0
+ # via typing-inspect
+networkx==3.2.1
+ # via
+ # scikit-image
+ # torch
+nltk==3.8.1
+ # via nougat-ocr
+nougat-ocr==0.1.17
+ # via -r requirements.in
+numpy==1.26.3
+ # via
+ # albumentations
+ # altair
+ # datasets
+ # imageio
+ # langchain
+ # lightning
+ # opencv-python-headless
+ # pandas
+ # pyarrow
+ # pydeck
+ # pytorch-lightning
+ # qudida
+ # scikit-image
+ # scikit-learn
+ # scipy
+ # streamlit
+ # tifffile
+ # torchmetrics
+ # torchvision
+ # transformers
+openai==1.7.0
+ # via -r requirements.in
+opencv-python-headless==4.9.0.80
+ # via
+ # albumentations
+ # nougat-ocr
+ # qudida
+orjson==3.9.10
+ # via nougat-ocr
+packaging==21.3
+ # via
+ # altair
+ # build
+ # datasets
+ # huggingface-hub
+ # lightning
+ # lightning-utilities
+ # marshmallow
+ # pdfservices-sdk
+ # pytorch-lightning
+ # scikit-image
+ # streamlit
+ # torchmetrics
+ # transformers
+pandas==2.1.4
+ # via
+ # altair
+ # datasets
+ # streamlit
+pdfminer-six==20220319
+ # via
+ # -r requirements.in
+ # pdfplumber
+pdfplumber==0.6.2
+ # via -r requirements.in
+pdfservices-sdk==2.3.0
+ # via -r requirements.in
+pep517==0.13.0
+ # via
+ # build
+ # pdfservices-sdk
+pillow==10.2.0
+ # via
+ # datasets
+ # imageio
+ # pdfplumber
+ # scikit-image
+ # streamlit
+ # torchvision
+polling==0.3.2
+ # via pdfservices-sdk
+polling2==0.5.0
+ # via pdfservices-sdk
+protobuf==4.25.1
+ # via streamlit
+pyarrow==14.0.2
+ # via
+ # datasets
+ # streamlit
+pyarrow-hotfix==0.6
+ # via datasets
+pycparser==2.21
+ # via
+ # cffi
+ # pdfservices-sdk
+pydantic==2.5.3
+ # via
+ # langchain
+ # langchain-core
+ # langsmith
+ # openai
+pydantic-core==2.14.6
+ # via pydantic
+pydeck==0.8.1b0
+ # via streamlit
+pygments==2.14.0
+ # via
+ # pdfservices-sdk
+ # rich
+pyjwt==2.4.0
+ # via pdfservices-sdk
+pyparsing==3.0.9
+ # via
+ # packaging
+ # pdfservices-sdk
+pypdf==3.17.4
+ # via nougat-ocr
+pypdfium2==4.25.0
+ # via
+ # nougat-ocr
+ # pdfplumber
+python-dateutil==2.8.2
+ # via
+ # pandas
+ # streamlit
+python-dotenv==1.0.0
+ # via -r requirements.in
+python-levenshtein==0.23.0
+ # via nougat-ocr
+python-multipart==0.0.6
+ # via -r requirements.in
+pytorch-lightning==2.1.3
+ # via lightning
+pytz==2023.3.post1
+ # via pandas
+pyyaml==6.0
+ # via
+ # albumentations
+ # datasets
+ # huggingface-hub
+ # langchain
+ # lightning
+ # pdfservices-sdk
+ # pytorch-lightning
+ # transformers
+qudida==0.0.4
+ # via albumentations
+rapidfuzz==3.6.1
+ # via levenshtein
+referencing==0.32.1
+ # via
+ # jsonschema
+ # jsonschema-specifications
+regex==2023.12.25
+ # via
+ # nltk
+ # transformers
+requests==2.27.1
+ # via
+ # -r requirements.in
+ # datasets
+ # fsspec
+ # huggingface-hub
+ # langchain
+ # langsmith
+ # pdfservices-sdk
+ # requests-toolbelt
+ # streamlit
+ # torchvision
+ # transformers
+requests-toolbelt==0.10.1
+ # via pdfservices-sdk
+rich==13.7.0
+ # via streamlit
+rpds-py==0.16.2
+ # via
+ # jsonschema
+ # referencing
+ruamel-yaml==0.18.5
+ # via sconf
+ruamel-yaml-clib==0.2.8
+ # via ruamel-yaml
+safetensors==0.4.1
+ # via transformers
+scikit-image==0.22.0
+ # via albumentations
+scikit-learn==1.3.2
+ # via qudida
+scipy==1.11.4
+ # via
+ # albumentations
+ # scikit-image
+ # scikit-learn
+sconf==0.2.5
+ # via nougat-ocr
+sentencepiece==0.1.99
+ # via nougat-ocr
+six==1.16.0
+ # via
+ # pdfservices-sdk
+ # python-dateutil
+smmap==5.0.1
+ # via gitdb
+sniffio==1.3.0
+ # via
+ # anyio
+ # httpx
+ # openai
+sqlalchemy==2.0.25
+ # via langchain
+streamlit==1.29.0
+ # via -r requirements.in
+sympy==1.12
+ # via torch
+tenacity==8.2.3
+ # via
+ # langchain
+ # langchain-core
+ # streamlit
+threadpoolctl==3.2.0
+ # via scikit-learn
+tifffile==2023.12.9
+ # via scikit-image
+timm==0.5.4
+ # via nougat-ocr
+tokenizers==0.15.0
+ # via transformers
+toml==0.10.2
+ # via
+ # pdfservices-sdk
+ # streamlit
+toolz==0.12.0
+ # via altair
+torch==2.1.2
+ # via
+ # -r requirements.in
+ # lightning
+ # pytorch-lightning
+ # timm
+ # torchmetrics
+ # torchvision
+torchmetrics==1.2.1
+ # via
+ # lightning
+ # pytorch-lightning
+torchvision==0.16.2
+ # via timm
+tornado==6.4
+ # via streamlit
+tqdm==4.66.1
+ # via
+ # -r requirements.in
+ # datasets
+ # huggingface-hub
+ # lightning
+ # nltk
+ # openai
+ # pytorch-lightning
+ # transformers
+transformers==4.36.2
+ # via nougat-ocr
+typing-extensions==4.9.0
+ # via
+ # huggingface-hub
+ # lightning
+ # lightning-utilities
+ # openai
+ # pydantic
+ # pydantic-core
+ # pytorch-lightning
+ # qudida
+ # sqlalchemy
+ # streamlit
+ # torch
+ # typing-inspect
+typing-inspect==0.9.0
+ # via dataclasses-json
+tzdata==2023.4
+ # via pandas
+tzlocal==5.2
+ # via streamlit
+urllib3==1.26.13
+ # via
+ # pdfservices-sdk
+ # requests
+validators==0.22.0
+ # via streamlit
+wand==0.6.13
+ # via pdfplumber
+xxhash==3.4.1
+ # via datasets
+yarl==1.9.4
+ # via aiohttp
+zipp==3.17.0
+ # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools