Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Opensourcing the citation-graph repo #1

Merged
merged 7 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
name: CI

# Lint and test on every pull request, on pushes to main, and on demand.
on:
  pull_request:
  push:
    branches: [main]
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads this input yet, so
      # setting it currently has no effect.
      debug_enabled:
        description: "Run the build with tmate debugging enabled"
        required: false

jobs:
  # Static checks: formatting, style, import order, TOML order and typing.
  linting:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout latest commit
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # fetch all history with version tags
      - name: Set up python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Set up pip cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }}
      - name: Set up environment
        # The pinned versions mirror the "dev" extra in pyproject.toml.
        run: |
          pip install --upgrade pip wheel setuptools
          pip install \
            black==24.4.2 \
            flake8==6.0.0 \
            flake8-builtins==2.1.0 \
            flake8-bugbear==22.10.27 \
            flake8-comprehensions==3.10.1 \
            flake8-docstrings==1.6.0 \
            toml-sort==0.23.1 \
            isort==5.12.0 \
            mypy
      - name: Linting check
        run: |
          black --check src/
          flake8 src/
          isort --check src/
          toml-sort --check pyproject.toml
          mypy src/ --ignore-missing-imports

  # Unit tests with coverage, run once per matrix entry.
  unit-tests:
    runs-on: ${{ matrix.os }}
    env:
      # Keep pip's cache inside the workspace so the cache step can store it.
      PIP_CACHE_DIR: .cache/pip
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.10"]
    steps:
      - name: Checkout latest commit
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # fetch all history with version tags
      - name: Set up python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Set up pip cache
        uses: actions/cache@v4
        with:
          path: .cache/pip
          # Key on the matrix entry and on the dependency declaration so the
          # cache is refreshed whenever pyproject.toml changes.
          key: >-
            ${{ matrix.os }}-pip-${{ matrix.python-version }}-${{
            hashFiles('**/pyproject.toml') }}
      - name: Set up environment
        run: |
          pip install --upgrade pip
          pip install ".[dev]"
      - name: Run unit tests
        run: |
          pytest --cov=src --cov-report=html tests
181 changes: 181 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# Data dir except dvc files

data/*.csv
data/*.json
data/*.jsonl
data/*.md
!data/*.dvc
!data/pdfs/*.dvc
!data/author_profiles
!data/author_profiles/*.dvc
!data/bbp_publications
!data/bbp_publications/pdfs/*.dvc
!data/clustering/
!data/clustering/*.dvc
!data/articles
!data/articles/orcid/*.dvc
!data/articles/author_name/*.dvc
!data/author_profiles/orcid/*.dvc
!data/author_profiles/serp/*.dvc
!data/bbp_publications/*.dvc

# Custom entries
/.idea/
.DS_Store
.env*
!.env*.example
.python-version
.vscode
.pypirc

# Version file

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# static files generated from Django application using `collectstatic`
media
static

# Ignore Flask app related files
statuc
yarn.lock
templates
node_modules
72 changes: 72 additions & 0 deletions data_checks/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Configuration for data tests."""

import json
import os

import pandas as pd
import pytest


@pytest.fixture(scope="session")
def articles():
    """Load ``data/articles.csv``, reading the ``pmid`` column as strings."""
    csv_path = os.path.join("data", "articles.csv")
    column_types = {"pmid": str}
    return pd.read_csv(csv_path, dtype=column_types)


@pytest.fixture(scope="session")
def authors():
    """Load ``data/authors.csv`` as a data frame."""
    csv_path = os.path.join("data", "authors.csv")
    return pd.read_csv(csv_path)


@pytest.fixture(scope="session")
def institutions():
    """Load ``data/institutions.csv`` as a data frame."""
    csv_path = os.path.join("data", "institutions.csv")
    return pd.read_csv(csv_path)


@pytest.fixture(scope="session")
def article_cites_article():
    """Load ``data/article_cites_article.csv`` as a data frame."""
    csv_path = os.path.join("data", "article_cites_article.csv")
    return pd.read_csv(csv_path)


@pytest.fixture(scope="session")
def author_wrote_article():
    """Load ``data/author_wrote_article.csv`` as a data frame."""
    csv_path = os.path.join("data", "author_wrote_article.csv")
    return pd.read_csv(csv_path)


@pytest.fixture(scope="session")
def author_affiliated_with_institution():
    """Load ``data/author_affiliated_with_institution.csv`` as a data frame."""
    csv_name = "author_affiliated_with_institution.csv"
    csv_path = os.path.join("data", csv_name)
    return pd.read_csv(csv_path)


@pytest.fixture(scope="session")
def embedded_article_uids():
    """Return the ``article_uid`` of every line in the embeddings file."""
    jsonl_path = os.path.join("data", "articles_embedded.jsonl")
    with open(jsonl_path, "r", encoding="utf-8") as jsonl_file:
        # Each line is parsed as one JSON document; only its uid is kept.
        return [json.loads(record)["article_uid"] for record in jsonl_file]


@pytest.fixture(scope="session")
def clusterings():
    """Return every clustering result, keyed by its file name.

    Only regular, non-hidden files of ``data/clustering`` are parsed.
    DVC pointer files (``*.dvc``) and dot-files such as ``.gitignore``
    may sit next to the JSON data and are not valid JSON, so they are
    skipped instead of making ``json.load`` fail. Sub-directories are
    skipped for the same reason.
    """
    clustering_dir = os.path.join("data", "clustering")
    loaded = {}
    # Sorted so the resulting dict has a reproducible order across systems.
    for file_name in sorted(os.listdir(clustering_dir)):
        clustering_path = os.path.join(clustering_dir, file_name)
        is_metadata = file_name.startswith(".") or file_name.endswith(".dvc")
        if is_metadata or not os.path.isfile(clustering_path):
            continue
        with open(clustering_path, "r", encoding="utf-8") as clustering_file:
            loaded[file_name] = json.load(clustering_file)
    return loaded
71 changes: 71 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Build with setuptools' PEP 517 backend. Naming it explicitly keeps build
# front-ends from falling back to the legacy setuptools code path.
[build-system]
build-backend = "setuptools.build_meta"
requires = ["setuptools"]

# Package metadata and runtime dependencies.
[project]
name = "citations"
authors = [
  {name = "Blue Brain Project, EPFL"}
]
description = "Tools for analysing Blue Brain citations"
readme = "README.md"
requires-python = ">=3.10"
# The version is resolved at build time, see [tool.setuptools.dynamic].
dynamic = ["version"]
# asyncio is deliberately not listed: it is part of the standard library on
# every supported Python, and the PyPI distribution of that name is an
# obsolete backport.
dependencies = [
  "dvc-s3",
  "httpx",
  "pandas",
  "tqdm",
  "pydantic",
  "openai",
  "aiohttp",
  "python-dotenv",
  "scikit-learn",
  "neo4j",
  "serpapi"
]

# Extras installed with `pip install ".[dev]"`: linters, type stubs and the
# test stack. The pinned linter versions are repeated in the linting job of
# .github/workflows/ci.yaml; keep both lists in sync.
[project.optional-dependencies]
dev = [
"black==24.4.2",
"flake8==6.0.0",
"flake8-builtins==2.1.0",
"flake8-bugbear==22.10.27",
"flake8-comprehensions==3.10.1",
"flake8-docstrings==1.6.0",
"toml-sort==0.23.1",
"isort==5.12.0",
"pytest==8.2.1",
"pytest_httpx",
"pytest-cov",
"types-PyYAML",
"validators",
"pandas-stubs",
"types-tqdm",
"mypy"
]

# Console commands created on install; each one calls the `main` function of
# the named module.
[project.scripts]
gather_articles = "citations.scripts.gather_articles:main"
gather_authors = "citations.scripts.gather_authors:main"

# Formatter settings; the line length matches the isort setting below.
[tool.black]
line-length = 79
preview = true

# Import sorting, using isort's black-compatible profile.
[tool.isort]
profile = "black"
line_length = 79

# Default pytest options: measure coverage of src, write an HTML report and
# read the coverage settings from .coveragerc.
[tool.pytest.ini_options]
addopts = "--cov=src --cov-report=html --cov-config=.coveragerc"

# The package version is read from the `__version__` attribute of `citations`.
[tool.setuptools.dynamic]
version = {attr = "citations.__version__"}

# Packages are discovered under src/ (src layout).
[tool.setuptools.packages.find]
where = ["src"]
namespaces = false

# toml-sort rewrites the file in place when it is run without --check.
[tool.tomlsort]
in_place = true
Loading
Loading