From 1d129c1bc0648f30ba16284f21450f67f0afde09 Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Fri, 1 Nov 2024 15:13:28 +0100 Subject: [PATCH 1/7] Added .gitignore --- .gitignore | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2208b0e --- /dev/null +++ b/.gitignore @@ -0,0 +1,181 @@ +# Data dir except dvc files + +data/*.csv +data/*.json +data/*.jsonl +data/*.md +!data/*.dvc +!data/pdfs/*.dvc +!data/author_profiles +!data/author_profiles/*.dvc +!data/bbp_publications +!data/bbp_publications/pdfs/*.dvc +!data/clustering/ +!data/clustering/*.dvc +!data/articles +!data/articles/orcid/*.dvc +!data/articles/author_name/*.dvc +!data/author_profiles/orcid/*.dvc +!data/author_profiles/serp/*.dvc +!data/bbp_publications/*.dvc + +# Custom entries +/.idea/ +.DS_Store +.env* +!.env*.example +.python-version +.vscode +.pypirc + +# Version file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# static files generated from Django application using `collectstatic` +media +static + +# ignore flask app related filess +statuc +yarn.lock +templates +node_modules From c0a6cc02ac372ed5912df48658237f7859415ca0 Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Fri, 1 Nov 2024 16:23:15 +0100 Subject: [PATCH 2/7] Added CI Github workflow --- .github/workflows/ci.yaml | 72 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .github/workflows/ci.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..7262922 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,72 @@ +name: CI + +on: + pull_request: + push: + branches: [main] + workflow_dispatch: + inputs: + debug_enabled: + description: "Run the build with tmate debugging enabled" + required: false + +jobs: + linting: + runs-on: ubuntu-latest + steps: + - name: Checkout latest commit + uses: actions/checkout@v2 + with: + fetch-depth: 0 # fetch all history with version tags + - name: Set up python + uses: actions/setup-python@v2 + with: + python-version: "3.10" + - name: Set up pip cache + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/pyproject.toml') }} + - name: Set up environment + run: | + pip install --upgrade pip wheel setuptools + pip install black==24.4.2 flake8==6.0.0 flake8-builtins==2.1.0 + flake8-bugbear==22.10.27 flake8-comprehensions==3.10.1 flake8-docstrings==1.6.0 + toml-sort==0.23.1 isort==5.12.0 mypy + - name: Linting check + run: | + black --check src/ tests/ data_checks/ + flake8 src/ tests/ data_checks/ + isort --check src/ tests/ data_checks/ + toml-sort --check pyproject.toml + mypy src/ + unit-tests: + runs-on: ${{ matrix.os }} + env: + PIP_CACHE_DIR: .cache/pip + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10"] + steps: + - name: Checkout latest commit + uses: actions/checkout@v2 + with: + fetch-depth: 0 # fetch all history with version tags + - name: Set up python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Set up pip cache + uses: actions/cache@v2 + with: + path: .cache/pip + key: ${{ matrix.tox-env }}-${{ matrix.os }} + - name: Set up environment + run: | + pip install --upgrade pip + pip install ".[dev]" + - name: Running mypy and tests + run: | + pytest --cov=src --cov-report=html tests From 85150cc421863e4896cc926f9432440f69ac6bad Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Fri, 1 Nov 2024 16:24:31 +0100 Subject: [PATCH 3/7] No broken line in workflow --- .github/workflows/ci.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7262922..8a9e99f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -30,9 +30,7 @@ jobs: - name: Set up environment run: | pip install --upgrade pip wheel setuptools - pip install black==24.4.2 flake8==6.0.0 
flake8-builtins==2.1.0 - flake8-bugbear==22.10.27 flake8-comprehensions==3.10.1 flake8-docstrings==1.6.0 - toml-sort==0.23.1 isort==5.12.0 mypy + pip install black==24.4.2 flake8==6.0.0 flake8-builtins==2.1.0 flake8-bugbear==22.10.27 flake8-comprehensions==3.10.1 flake8-docstrings==1.6.0 toml-sort==0.23.1 isort==5.12.0 mypy - name: Linting check run: | black --check src/ tests/ data_checks/ From ba5e59096640140d15dcc9645ec716ae433e9336 Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Fri, 1 Nov 2024 16:30:02 +0100 Subject: [PATCH 4/7] Added minimal files to make workflow work --- pyproject.toml | 71 ++++++++++++++ src/citations/__init__.py | 3 + src/citations/utils.py | 192 ++++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 62 ++++++++++++ 4 files changed, 328 insertions(+) create mode 100644 pyproject.toml create mode 100644 src/citations/__init__.py create mode 100644 src/citations/utils.py create mode 100644 tests/test_utils.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..25ad7f2 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,71 @@ +[build-system] +requires = ["setuptools"] + +[project] +name = "citations" +authors = [ + {name = "Blue Brain Project, EPFL"} +] +description = "Tools for analysing Blue Brain citations" +readme = "README.md" +requires-python = ">=3.10" +dynamic = ["version"] +dependencies = [ + "dvc-s3", + "httpx", + "pandas", + "tqdm", + "pydantic", + "openai", + "asyncio", + "aiohttp", + "python-dotenv", + "scikit-learn", + "neo4j", + "serpapi" +] + +[project.optional-dependencies] +dev = [ + "black==24.4.2", + "flake8==6.0.0", + "flake8-builtins==2.1.0", + "flake8-bugbear==22.10.27", + "flake8-comprehensions==3.10.1", + "flake8-docstrings==1.6.0", + "toml-sort==0.23.1", + "isort==5.12.0", + "pytest==8.2.1", + "pytest_httpx", + "pytest-cov", + "types-PyYAML", + "validators", + "pandas-stubs", + "types-tqdm", + "mypy" +] + +[project.scripts] +gather_articles = "citations.scripts.gather_articles:main" +gather_authors = "citations.scripts.gather_authors:main" + +[tool.black] +line-length = 79 +preview = true + +[tool.isort] +profile = "black" +line_length = 79 + +[tool.pytest.ini_options] +addopts = "--cov=src --cov-report=html --cov-config=.coveragerc" + +[tool.setuptools.dynamic] +version = {attr = "citations.__version__"} + +[tool.setuptools.packages.find] +where = ["src"] +namespaces = false + +[tool.tomlsort] +in_place = true diff --git a/src/citations/__init__.py b/src/citations/__init__.py new file mode 100644 index 0000000..5ca5ec6 --- /dev/null +++ b/src/citations/__init__.py @@ -0,0 +1,3 @@ +"""Tools for gathering citation data and visualization.""" + +__version__ = "v0.1.0" diff --git a/src/citations/utils.py b/src/citations/utils.py new file mode 100644 index 0000000..fe3a895 --- /dev/null +++ b/src/citations/utils.py @@ -0,0 +1,192 @@ +"""Utility functions.""" + +import hashlib +import logging +import os +import re +import string +import time +from datetime import date, datetime +from typing import Any +from xml.etree import ElementTree as ET + +import httpx +import pandas as pd +from httpx import HTTPError, HTTPStatusError, RequestError, Response + +logger = logging.getLogger(__name__) + + +def get_with_waiting( + endpoint: str, retry_times: int = 5, wait: float = 30 +) -> Response: + """ + Attempt to send a GET request to the specified endpoint with retries and waiting period. + + Parameters + ---------- + endpoint : str + The URL of the endpoint to send the GET request to. 
+ retry_times : int | None + The number of times to retry the request in case of failure (default is 5). + wait : float | None + The waiting period (in seconds) between retries (default is 30). + + Returns + ------- + Response + The HTTP response received from the server. + + Raises + ------ + RequestError + If all retry attempts fail, the last caught RequestError is raised. + """ + for i in range(retry_times): + try: + response = httpx.get(endpoint) + response.raise_for_status() + return response + except (RequestError, HTTPError, HTTPStatusError) as e: + # If we get an exception due to too many calls, wait and try again + if i == retry_times - 1: + raise e + time.sleep(wait) + raise Exception("Maximum retries reached") + + +def generate_unique_id(name: str) -> str: + r"""Generate a semi-unique id based on a (institution) name. + + \f + Parameters + ---------- + name : str + Any kind of arbitrary name. + + Returns + ------- + str + Generated sha256 id. + """ + org_name_bytes = name.encode("utf-8") + hash_object = hashlib.sha256(org_name_bytes) + hash_hex = hash_object.hexdigest() + return hash_hex[:8] + + +def normalize_title(text: str) -> str: + r"""Normalize a title string. + + \f + Parameters + ---------- + text : str + Title string to be normalized. + + Returns + ------- + str + Normalized title string, truncated to a maximum of 30 characters. + """ + # Replace non-alphabetic and non-space characters with nothing + text = re.sub(r"[^a-zA-Z\s]", "", text) + # Replace multiple spaces with nothing + text = re.sub(r"\s+", "", text) + # Convert to lowercase + text = text.lower() + # Strip leading and trailing whitespace and punctuation + text = text.strip(string.punctuation + string.whitespace) + return text[:30].strip(string.punctuation + string.whitespace) + + +def is_valid_doi(doi_str): + """Check if str is a valid DOI.""" + return bool(DOI_PATTERN.match(doi_str)) + + +DOI_PATTERN = re.compile(r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$", re.IGNORECASE) + + +def to_date(date_str: str | Any) -> date | None: + """ + Convert a date string to a datetime object. + + Parameters + ---------- + date_str : str + The date string to be converted. + + Returns + ------- + datetime + The converted datetime object. + + Raises + ------ + ValueError + If the date string has an invalid format. + + """ + try: + if pd.isna(date_str): + return None + if len(date_str) == 4 and date_str.isdigit(): + return datetime(int(date_str), 1, 1) + else: + return pd.to_datetime(date_str).date() + except Exception as e: + raise ValueError(f"Invalid date format: {date_str}") from e + + +def load_europmc_xmls(europmc_article_xmls_path: str): + """ + Load EuroPMC xml files. + + Parameters + ---------- + europmc_article_xmls_path : str + The path to the directory containing European PubMed Central (Europe PMC) XML files. + + Returns + ------- + xml_map : dict + """ + xml_map = {} + for filename in os.listdir(europmc_article_xmls_path): + if filename.endswith(".xml"): + file_path = os.path.join(europmc_article_xmls_path, filename) + tree = ET.parse(file_path) + root = tree.getroot() + file_key = os.path.splitext(filename)[0] + xml_map[file_key] = root + return xml_map + + +def save_xml_map(xml_map: dict, directory: str): + """ + Save xmls fetched from EuroPMC. + + Parameters + ---------- + xml_map : dict + A dictionary representing the XML map, where the key is the name of the XML file and the value is the root element of the XML tree. + + directory : str + The directory where the XML files will be saved. 
+ + """ + for key, root in xml_map.items(): + filename = f"{key}.xml" + file_path = os.path.join(directory, filename) + + if os.path.exists(file_path): + base, ext = os.path.splitext(filename) + counter = 1 + while os.path.exists(file_path): + new_filename = f"{base}_{counter}{ext}" + file_path = os.path.join(directory, new_filename) + counter += 1 + + tree = ET.ElementTree(root) + tree.write(file_path, encoding="utf-8", xml_declaration=True) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..e3a9110 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,62 @@ +"""Test utility functions.""" + +import pytest +from httpx import RequestError + +from citations.utils import ( + generate_unique_id, + get_with_waiting, + normalize_title, +) + + +def test_get_with_waiting(httpx_mock): + response_text = "Great response" + url = "https://dummy.com" + httpx_mock.add_response(url=url, method="GET", text=response_text) + response = get_with_waiting(url) + assert response.text == response_text + + +def test_get_with_waiting_retry(httpx_mock): + response_text = "Great response" + url = "https://dummy.com" + httpx_mock.add_exception(RequestError("Request failed"), url=url) + httpx_mock.add_response(url=url, method="GET", text=response_text) + response = get_with_waiting(url, wait=0.01) + assert response.text == response_text + + +def test_generate_unique_id_different_input(): + name1 = "Institution One" + name2 = "Institution Two" + id1 = generate_unique_id(name1) + id2 = generate_unique_id(name2) + assert id1 != id2 + + +@pytest.mark.parametrize( + "input1, input2", + [ + ( + ( + "From Big Data to Big Displays High-Performance Visualization" + " at Blue Brain" + ), + ( + "From Big Data To big Displays High-Performance visualization" + " at Blue Brain" + ), + ), + ( + "The Scientific Case for Brain Simulations", + "The Scientific Case for Brain Simulations.", + ), + ( + " Neurobiological Causal Models of Language Processing ", + "Neurobiological Causal Models of Language Processing.", + ), + ], +) +def test_normalize_bbp_title(input1, input2): + assert normalize_title(input1) == normalize_title(input2) From 0e9d128d07276acb1f6bae03a0bab32aca3091e7 Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Fri, 1 Nov 2024 16:31:26 +0100 Subject: [PATCH 5/7] Added data_checks dir --- data_checks/conftest.py | 72 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 data_checks/conftest.py diff --git a/data_checks/conftest.py b/data_checks/conftest.py new file mode 100644 index 0000000..4f0f3ab --- /dev/null +++ b/data_checks/conftest.py @@ -0,0 +1,72 @@ +"""Configuration for data tests.""" + +import json +import os + +import pandas as pd +import pytest + + +@pytest.fixture(scope="session") +def articles(): + """Return articles data.""" + return pd.read_csv( + os.path.join("data", "articles.csv"), dtype={"pmid": str} + ) + + +@pytest.fixture(scope="session") +def authors(): + """Return authors data.""" + return pd.read_csv(os.path.join("data", "authors.csv")) + + +@pytest.fixture(scope="session") +def institutions(): + """Return institutions data.""" + return pd.read_csv(os.path.join("data", "institutions.csv")) + + +@pytest.fixture(scope="session") +def article_cites_article(): + """Return citation data.""" + return pd.read_csv(os.path.join("data", "article_cites_article.csv")) + + +@pytest.fixture(scope="session") +def author_wrote_article(): + """Return author wrote data.""" + return pd.read_csv(os.path.join("data", 
"author_wrote_article.csv")) + + +@pytest.fixture(scope="session") +def author_affiliated_with_institution(): + """Return affiliation data.""" + return pd.read_csv( + os.path.join("data", "author_affiliated_with_institution.csv") + ) + + +@pytest.fixture(scope="session") +def embedded_article_uids(): + """Return embedding uids.""" + uids = [] + with open( + os.path.join("data", "articles_embedded.jsonl"), "r", encoding="utf-8" + ) as articles_file: + for line in articles_file: + embedded_article = json.loads(line) + uids.append(embedded_article["article_uid"]) + return uids + + +@pytest.fixture(scope="session") +def clusterings(): + """Return all clusterings.""" + clustering_dir = os.path.join("data", "clustering") + clusterings = {} + for clustering in os.listdir(clustering_dir): + clustering_path = os.path.join(clustering_dir, clustering) + with open(clustering_path, "r", encoding="utf-8") as clustering_file: + clusterings[clustering] = json.load(clustering_file) + return clusterings From b2fe0e829de6c60e3cc7bfed37b8b4075136ca70 Mon Sep 17 00:00:00 2001 From: cszsolnai Date: Mon, 4 Nov 2024 10:45:23 +0100 Subject: [PATCH 6/7] Fix lint issues --- .github/workflows/ci.yaml | 6 +++--- src/citations/utils.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8a9e99f..b1b1b75 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,9 +33,9 @@ jobs: pip install black==24.4.2 flake8==6.0.0 flake8-builtins==2.1.0 flake8-bugbear==22.10.27 flake8-comprehensions==3.10.1 flake8-docstrings==1.6.0 toml-sort==0.23.1 isort==5.12.0 mypy - name: Linting check run: | - black --check src/ tests/ data_checks/ - flake8 src/ tests/ data_checks/ - isort --check src/ tests/ data_checks/ + black --check src/ + flake8 src/ + isort --check src/ toml-sort --check pyproject.toml mypy src/ unit-tests: diff --git a/src/citations/utils.py b/src/citations/utils.py index fe3a895..b81007f 100644 --- a/src/citations/utils.py +++ b/src/citations/utils.py @@ -21,14 +21,14 @@ def get_with_waiting( endpoint: str, retry_times: int = 5, wait: float = 30 ) -> Response: """ - Attempt to send a GET request to the specified endpoint with retries and waiting period. + Attempt to send a GET request to the specified endpoint. Parameters ---------- endpoint : str The URL of the endpoint to send the GET request to. retry_times : int | None - The number of times to retry the request in case of failure (default is 5). + The number of times to retry the request in case of failure. wait : float | None The waiting period (in seconds) between retries (default is 30). @@ -146,7 +146,7 @@ def load_europmc_xmls(europmc_article_xmls_path: str): Parameters ---------- europmc_article_xmls_path : str - The path to the directory containing European PubMed Central (Europe PMC) XML files. + The path to the directory containing Europe PMC XML files. Returns ------- @@ -170,7 +170,7 @@ def save_xml_map(xml_map: dict, directory: str): Parameters ---------- xml_map : dict - A dictionary representing the XML map, where the key is the name of the XML file and the value is the root element of the XML tree. + A dictionary representing the XML map. directory : str The directory where the XML files will be saved. 
From 2646d67857ac3c55093d6b32a4f631cf8eecef34 Mon Sep 17 00:00:00 2001
From: cszsolnai
Date: Mon, 4 Nov 2024 10:48:16 +0100
Subject: [PATCH 7/7] mypy should ignore missing type stubs

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index b1b1b75..f43dd57 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -37,7 +37,7 @@ jobs:
           flake8 src/
           isort --check src/
           toml-sort --check pyproject.toml
-          mypy src/
+          mypy src/ --ignore-missing-imports
   unit-tests:
     runs-on: ${{ matrix.os }}
     env: