diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..659d6e5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+# syntax=docker/dockerfile:1
+
+FROM python:3.11.5
+
+WORKDIR /app
+
+# Install dependencies before copying the application source so that this
+# layer stays cached until requirements.txt itself changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Plain local files: COPY is sufficient (ADD is only for tar extraction / URLs).
+COPY app app
+
+CMD [ "python", "-m", "streamlit", "run", "app/main.py" ]
diff --git a/README.md b/README.md
index faa0cff..986a2b9 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,27 @@
-# Hack4Good - OECD
+# Hack4Good - NLP for policy trend analysis (OECD)
 
 [![Code style](https://img.shields.io/badge/code%20style-black-000000.svg)](pyproject.toml)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](.pre-commit-config.yaml)
 
-* https://hackmd.io/g5AYgepnQrqrk4V26DrQMg
+This project was created for the [Hack4Good 2023](https://www.analytics-club.org/hack4good) hackathon in collaboration with [OECD](https://www.oecd.org/switzerland/).
 
-* https://docs.google.com/spreadsheets/d/1pKb_1Je4hD2X8IfYrFXYqBhWfPg5lgPY/edit?usp=sharing&ouid=110500414719598262605&rtpof=true&sd=true
+## GUI Quickstart
 
-## Getting started
+Environment variables need to be set in order to run the code.
+Create a `.env` file in the root of the repo (you can use `cp .env.default .env`) with the following contents: -### Conda Environment -To run the code in this repo create a [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) environment with the required dependencies: +| Environment Variable | Description | +| --- | --- | +| `ADOBE_CLIENT_ID` | Create an Adobe Developer account and select "Get credentials" [here](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/) | +| `ADOBE_CLIENT_SECRET` | Copy from "Get credentials" [here](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/) as with `ADOBE_CLIENT_ID` | +| `OPENAI_API_KEY` | Get the [OpenAI API key](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key) | + +After setting the environment variables, you can run the code in one of two ways: + +
+Conda Environment + +1. Create a [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) environment with the required dependencies: To **create** a conda environment after cloning the repo: ``` @@ -22,7 +33,36 @@ conda activate hack4good conda deactivate ``` +(Optional) To **update** the conda environment after pulling the latest changes: +``` +conda activate hack4good +conda env update -f environment.yml --prune +``` + +(Optional) To **remove** the conda environment: +``` +conda deactivate +conda env remove -n hack4good +``` + +2. Run the Streamlit app +``` +python -m streamlit run app/main.py +``` +
+ +
+Docker + +1. Pull (or build) the docker image + +To **pull** the latest docker image: +``` +docker pull ghc +``` +
+ ## Contributing 1. Install [pre-commit](https://pre-commit.com/#installation). -2. `pre-commit install` -3. Add changes, commit and pull request to `main` branch. +2. Run `pre-commit install` to apply the repo's pre-commit hooks to your local git repo. +3. Add your changes, commit and create a pull request with `main` branch as the target. diff --git a/environment.yml b/environment.yml index 28efbaf..aafc347 100644 --- a/environment.yml +++ b/environment.yml @@ -5,13 +5,4 @@ dependencies: - pip=23.2.1 - python=3.11.5 - pip: - - pdfplumber - - pdfminer.six - - tqdm - - torch - - nougat-ocr - - streamlit - - langchain - - python-dotenv - - openai - - python-multipart + - -r requirements.txt diff --git a/requirements.in b/requirements.in new file mode 100644 index 0000000..3c941d2 --- /dev/null +++ b/requirements.in @@ -0,0 +1,12 @@ +pdfplumber +pdfminer.six +tqdm +torch +nougat-ocr +streamlit +langchain +python-dotenv +openai +python-multipart +pdfservices-sdk +requests diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..75c0fd8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,469 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile --resolver=backtracking requirements.in +# +aiohttp==3.9.1 + # via + # datasets + # fsspec + # langchain +aiosignal==1.3.1 + # via aiohttp +albumentations==1.3.1 + # via nougat-ocr +altair==5.2.0 + # via streamlit +annotated-types==0.6.0 + # via pydantic +anyio==3.7.1 + # via + # httpx + # langchain + # openai +attrs==23.2.0 + # via + # aiohttp + # jsonschema + # referencing +blinker==1.7.0 + # via streamlit +build==0.9.0 + # via pdfservices-sdk +cachetools==5.3.2 + # via streamlit +certifi==2022.12.7 + # via + # httpcore + # httpx + # pdfservices-sdk + # requests +cffi==1.15.1 + # via + # cryptography + # pdfservices-sdk +chardet==5.2.0 + # via pdfminer-six +charset-normalizer==2.0.12 + # via + # pdfminer-six + # requests +click==8.1.7 + # via 
+ # nltk + # streamlit +cryptography==3.4.6 + # via + # pdfminer-six + # pdfservices-sdk +dataclasses-json==0.6.3 + # via langchain +datasets[vision]==2.16.1 + # via nougat-ocr +dill==0.3.7 + # via + # datasets + # multiprocess +distro==1.9.0 + # via openai +filelock==3.13.1 + # via + # datasets + # huggingface-hub + # torch + # transformers +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec[http]==2023.10.0 + # via + # datasets + # huggingface-hub + # lightning + # pytorch-lightning + # torch +gitdb==4.0.11 + # via gitpython +gitpython==3.1.40 + # via streamlit +h11==0.14.0 + # via httpcore +httpcore==1.0.2 + # via httpx +httpx==0.26.0 + # via openai +huggingface-hub==0.20.2 + # via + # datasets + # tokenizers + # transformers +idna==3.6 + # via + # anyio + # httpx + # requests + # yarl +imageio==2.33.1 + # via scikit-image +importlib-metadata==6.11.0 + # via streamlit +jinja2==3.1.2 + # via + # altair + # pydeck + # torch +joblib==1.3.2 + # via + # nltk + # scikit-learn +jsonpatch==1.33 + # via + # langchain + # langchain-core +jsonpointer==2.4 + # via jsonpatch +jsonschema==4.20.0 + # via altair +jsonschema-specifications==2023.12.1 + # via jsonschema +langchain==0.0.347 + # via -r requirements.in +langchain-core==0.0.11 + # via langchain +langsmith==0.0.79 + # via + # langchain + # langchain-core +lazy-loader==0.3 + # via scikit-image +levenshtein==0.23.0 + # via python-levenshtein +lightning==2.1.3 + # via nougat-ocr +lightning-utilities==0.10.0 + # via + # lightning + # pytorch-lightning + # torchmetrics +markdown-it-py==3.0.0 + # via rich +markupsafe==2.1.3 + # via jinja2 +marshmallow==3.20.2 + # via dataclasses-json +mdurl==0.1.2 + # via markdown-it-py +mpmath==1.3.0 + # via sympy +multidict==6.0.4 + # via + # aiohttp + # yarl +multipart==0.2.4 + # via pdfservices-sdk +multiprocess==0.70.15 + # via datasets +munch==4.0.0 + # via sconf +mypy-extensions==1.0.0 + # via typing-inspect +networkx==3.2.1 + # via + # scikit-image + # torch +nltk==3.8.1 + 
# via nougat-ocr +nougat-ocr==0.1.17 + # via -r requirements.in +numpy==1.26.3 + # via + # albumentations + # altair + # datasets + # imageio + # langchain + # lightning + # opencv-python-headless + # pandas + # pyarrow + # pydeck + # pytorch-lightning + # qudida + # scikit-image + # scikit-learn + # scipy + # streamlit + # tifffile + # torchmetrics + # torchvision + # transformers +openai==1.7.0 + # via -r requirements.in +opencv-python-headless==4.9.0.80 + # via + # albumentations + # nougat-ocr + # qudida +orjson==3.9.10 + # via nougat-ocr +packaging==21.3 + # via + # altair + # build + # datasets + # huggingface-hub + # lightning + # lightning-utilities + # marshmallow + # pdfservices-sdk + # pytorch-lightning + # scikit-image + # streamlit + # torchmetrics + # transformers +pandas==2.1.4 + # via + # altair + # datasets + # streamlit +pdfminer-six==20220319 + # via + # -r requirements.in + # pdfplumber +pdfplumber==0.6.2 + # via -r requirements.in +pdfservices-sdk==2.3.0 + # via -r requirements.in +pep517==0.13.0 + # via + # build + # pdfservices-sdk +pillow==10.2.0 + # via + # datasets + # imageio + # pdfplumber + # scikit-image + # streamlit + # torchvision +polling==0.3.2 + # via pdfservices-sdk +polling2==0.5.0 + # via pdfservices-sdk +protobuf==4.25.1 + # via streamlit +pyarrow==14.0.2 + # via + # datasets + # streamlit +pyarrow-hotfix==0.6 + # via datasets +pycparser==2.21 + # via + # cffi + # pdfservices-sdk +pydantic==2.5.3 + # via + # langchain + # langchain-core + # langsmith + # openai +pydantic-core==2.14.6 + # via pydantic +pydeck==0.8.1b0 + # via streamlit +pygments==2.14.0 + # via + # pdfservices-sdk + # rich +pyjwt==2.4.0 + # via pdfservices-sdk +pyparsing==3.0.9 + # via + # packaging + # pdfservices-sdk +pypdf==3.17.4 + # via nougat-ocr +pypdfium2==4.25.0 + # via + # nougat-ocr + # pdfplumber +python-dateutil==2.8.2 + # via + # pandas + # streamlit +python-dotenv==1.0.0 + # via -r requirements.in +python-levenshtein==0.23.0 + # via nougat-ocr 
+python-multipart==0.0.6 + # via -r requirements.in +pytorch-lightning==2.1.3 + # via lightning +pytz==2023.3.post1 + # via pandas +pyyaml==6.0 + # via + # albumentations + # datasets + # huggingface-hub + # langchain + # lightning + # pdfservices-sdk + # pytorch-lightning + # transformers +qudida==0.0.4 + # via albumentations +rapidfuzz==3.6.1 + # via levenshtein +referencing==0.32.1 + # via + # jsonschema + # jsonschema-specifications +regex==2023.12.25 + # via + # nltk + # transformers +requests==2.27.1 + # via + # -r requirements.in + # datasets + # fsspec + # huggingface-hub + # langchain + # langsmith + # pdfservices-sdk + # requests-toolbelt + # streamlit + # torchvision + # transformers +requests-toolbelt==0.10.1 + # via pdfservices-sdk +rich==13.7.0 + # via streamlit +rpds-py==0.16.2 + # via + # jsonschema + # referencing +ruamel-yaml==0.18.5 + # via sconf +ruamel-yaml-clib==0.2.8 + # via ruamel-yaml +safetensors==0.4.1 + # via transformers +scikit-image==0.22.0 + # via albumentations +scikit-learn==1.3.2 + # via qudida +scipy==1.11.4 + # via + # albumentations + # scikit-image + # scikit-learn +sconf==0.2.5 + # via nougat-ocr +sentencepiece==0.1.99 + # via nougat-ocr +six==1.16.0 + # via + # pdfservices-sdk + # python-dateutil +smmap==5.0.1 + # via gitdb +sniffio==1.3.0 + # via + # anyio + # httpx + # openai +sqlalchemy==2.0.25 + # via langchain +streamlit==1.29.0 + # via -r requirements.in +sympy==1.12 + # via torch +tenacity==8.2.3 + # via + # langchain + # langchain-core + # streamlit +threadpoolctl==3.2.0 + # via scikit-learn +tifffile==2023.12.9 + # via scikit-image +timm==0.5.4 + # via nougat-ocr +tokenizers==0.15.0 + # via transformers +toml==0.10.2 + # via + # pdfservices-sdk + # streamlit +toolz==0.12.0 + # via altair +torch==2.1.2 + # via + # -r requirements.in + # lightning + # pytorch-lightning + # timm + # torchmetrics + # torchvision +torchmetrics==1.2.1 + # via + # lightning + # pytorch-lightning +torchvision==0.16.2 + # via timm 
+tornado==6.4 + # via streamlit +tqdm==4.66.1 + # via + # -r requirements.in + # datasets + # huggingface-hub + # lightning + # nltk + # openai + # pytorch-lightning + # transformers +transformers==4.36.2 + # via nougat-ocr +typing-extensions==4.9.0 + # via + # huggingface-hub + # lightning + # lightning-utilities + # openai + # pydantic + # pydantic-core + # pytorch-lightning + # qudida + # sqlalchemy + # streamlit + # torch + # typing-inspect +typing-inspect==0.9.0 + # via dataclasses-json +tzdata==2023.4 + # via pandas +tzlocal==5.2 + # via streamlit +urllib3==1.26.13 + # via + # pdfservices-sdk + # requests +validators==0.22.0 + # via streamlit +wand==0.6.13 + # via pdfplumber +xxhash==3.4.1 + # via datasets +yarl==1.9.4 + # via aiohttp +zipp==3.17.0 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools