feat: add journal information to api (#2067)
For JabRef/jabref#10015.

Sample query:
```
query GetJournalByIssn($issn: String) {
  journal(issn: $issn) {
    id
    name
    issn
    scimagoId
    country
    publisher
    areas
    categories
    citationInfo {
      year
      docsThisYear
      docsPrevious3Years
      citableDocsPrevious3Years
      citesOutgoing
      citesOutgoingPerDoc
      citesIncomingByRecentlyPublished
      citesIncomingPerDocByRecentlyPublished
      sjrIndex
    }
    hIndex
  }
}
```
with `issn: 15230864`.
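
For reference, a minimal client sketch for running this query (the endpoint `http://localhost:3000/api` is an assumption based on `server/api/index.ts`; adjust host and path to your deployment):

```python
# Minimal sketch: POST the sample query as a standard GraphQL-over-HTTP request.
import json
import urllib.request

query = '''
query GetJournalByIssn($issn: String) {
  journal(issn: $issn) {
    name
    hIndex
    citationInfo { year sjrIndex }
  }
}
'''

payload = json.dumps({'query': query, 'variables': {'issn': '15230864'}}).encode()
request = urllib.request.Request(
    'http://localhost:3000/api',  # assumed endpoint, see server/api/index.ts
    data=payload,
    headers={'Content-Type': 'application/json'},
)
with urllib.request.urlopen(request) as response:
    print(json.load(response)['data']['journal'])
```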

References:
- https://www.scimagojr.com/help.php
- Example:
https://www.scimagojr.com/journalsearch.php?q=27514&tip=sid&clean=0
- https://docs.openalex.org/api-entities/sources/source-object

---------

Co-authored-by: Nitin Suresh <[email protected]>
tobiasdiez and aqurilla authored Jul 28, 2023
1 parent df824a6 commit 6b3fc73
Showing 30 changed files with 870 additions and 139 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -151,6 +151,8 @@ apollo/introspection.ts
apollo/fragment-masking.ts
apollo/validation.internal.ts

scripts/journal-data/

# Yarn: https://yarnpkg.com/getting-started/qa#which-files-should-be-gitignored
.pnp.*
.yarn/*
2 changes: 2 additions & 0 deletions .vscode/settings.json
@@ -7,10 +7,12 @@
"composables",
"datetime",
"esbuild",
"issn",
"jiti",
"journaltitle",
"Nuxt",
"nuxtjs",
"scimago",
"transpiled",
"tsyringe",
"upsert"
6 changes: 4 additions & 2 deletions graphql.config.json
@@ -21,7 +21,8 @@
"scalars": {
"Date": "Date",
"DateTime": "Date",
"EmailAddress": "string"
"EmailAddress": "string",
"BigInt": "BigInt"
}
}
},
@@ -42,7 +43,8 @@
"scalarSchemas": {
"Date": "z.date()",
"DateTime": "z.date()",
"EmailAddress": "z.string().email()"
"EmailAddress": "z.string().email()",
"BigInt": "z.bigint()"
},
"importFrom": "~/apollo/graphql",
"validationSchemaExportType": "const"
4 changes: 2 additions & 2 deletions nuxt.config.ts
@@ -18,8 +18,8 @@ export default defineNuxtConfig({
},

nitro: {
// Prevent 'reflect-metadata' from being treeshaked (since we don't explicitly use the import it would otherwise be removed)
moduleSideEffects: ['reflect-metadata'],
// Prevent 'reflect-metadata' and 'json-bigint-patch' from being treeshaked (since we don't explicitly use the imports, they would otherwise be removed)
moduleSideEffects: ['reflect-metadata', 'json-bigint-patch'],
prerender: {
// Needed for storybook support (otherwise the file is not created during nuxi generate)
routes: ['/_storybook/external-iframe'],
1 change: 1 addition & 0 deletions package.json
@@ -66,6 +66,7 @@
"graphql": "^16.7.1",
"graphql-passport": "^0.6.5",
"graphql-scalars": "^1.22.2",
"json-bigint-patch": "^0.0.8",
"lodash": "^4.17.21",
"nodemailer": "^6.8.0",
"passport": "^0.6.0",
303 changes: 303 additions & 0 deletions scripts/journaldata.py
@@ -0,0 +1,303 @@
"""
This script downloads data for multiple years from the Scimago Journal Rank website
(https://www.scimagojr.com/journalrank.php), parses the CSV files, and builds a consolidated
dataset over all the years, in JSON format.
The downloaded data includes various metrics for academic journals such as SJR,
h-index, doc counts, citation counts, etc.
Usage:
- Add
```
generator pyclient {
provider = "prisma-client-py"
recursive_type_depth = 5
}
```
to the `schema.prisma` file, and run `yarn generate` to generate the Prisma client.
- Update the `current_year` variable to the latest year of data available.
- Set the environment variable `DATABASE_URL` to the postgres database url (using a .env file is recommended).
- If you want to use the Azure database, add your IP address to the Azure exception list under `jabrefdb | Networking`.
- Run this script with `download` argument to downloads data from the specified start year up to the current year.
- Run this script with `db` (or `json`) argument to dump the consolidated dataset in the database (or `scimagojr_combined_data.json`, respectively).
"""


import asyncio
import csv
import json
import os
import sys
import urllib.request
from pathlib import Path

from prisma import Prisma
from prisma.types import JournalCitationInfoYearlyCreateWithoutRelationsInput

# current_year should be the latest year of data available at https://www.scimagojr.com/journalrank.php
current_year = 2022
start_year = 1999
data_directory = Path('scripts/journal-data')


class JournalInfoYearly:
    def __init__(
        self,
        sjr: float,
        hIndex: int,
        totalDocs: int,
        totalDocs3Years: int,
        totalRefs: int,
        totalCites3Years: int,
        citableDocs3Years: int,
        citesPerDoc2Years: float,
        refPerDoc: float,
    ):
        self.sjr = sjr
        self.hIndex = hIndex
        self.totalDocs = totalDocs
        self.totalDocs3Years = totalDocs3Years
        self.totalRefs = totalRefs
        self.totalCites3Years = totalCites3Years
        self.citableDocs3Years = citableDocs3Years
        self.citesPerDoc2Years = citesPerDoc2Years
        self.refPerDoc = refPerDoc


class JournalInfo:
    def __init__(
        self,
        source_id: int,
        issn: str,
        title: str,
        type: str,
        country: str,
        region: str,
        publisher: str,
        coverage: str,
        categories: str,
        areas: str,
    ):
        self.source_id = source_id
        self.issn = issn
        self.title = title
        self.type = type
        self.country = country
        self.region = region
        self.publisher = publisher
        self.coverage = coverage
        self.categories = categories
        self.areas = areas
        self.yearly: dict[int, JournalInfoYearly] = {}


def journal_url(year: int):
    """Get url to download info for the given year"""
    return f'https://www.scimagojr.com/journalrank.php?year={year}&out=xls'


def parse_float(value: str):
    """Parse float from string, replacing comma with dot"""
    try:
        float_val = float(value.replace(',', '.'))
        return float_val
    except ValueError:
        return 0.0
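# Example: parse_float('1,23') returns 1.23; parse_float('-') falls back to 0.0.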


def parse_int(value: str):
    """Parse int from string"""
    try:
        int_val = int(value)
        return int_val
    except ValueError:
        return 0


def get_data_filepath(year: int):
    """Get filename for the given year"""
    return data_directory / f'scimagojr-journal-{year}.csv'


def download_all_data():
    """Download data for all years"""

    # create data directory if it doesn't exist
    if not os.path.exists(data_directory):
        os.makedirs(data_directory)

    for year in range(start_year, current_year + 1):
        # download file for given year
        print(f'Downloading data for {year}')
        url = journal_url(year)
        filepath = get_data_filepath(year)
        urllib.request.urlretrieve(url, filepath)
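# Note: despite the out=xls parameter, the downloaded files are
# semicolon-separated CSV, which combine_data() below parses with delimiter=';'.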


def combine_data():
    """Iterate over files and return the consolidated dataset"""
    journals: dict[int, JournalInfo] = {}
    for year in range(start_year, current_year + 1):
        print(f'Processing {year}')
        filepath = get_data_filepath(year)
        with open(filepath, mode='r', encoding='utf-8') as csv_file:
            csv_reader = csv.DictReader(csv_file, delimiter=';')
            for row in csv_reader:
                # Columns present in the csv:
                # 'Rank', 'Sourceid', 'Title', 'Type', 'Issn', 'SJR', 'SJR Best Quartile', 'H index',
                # 'Total Docs. (2020)', 'Total Docs. (3years)', 'Total Refs.', 'Total Cites (3years)',
                # 'Citable Docs. (3years)', 'Cites / Doc. (2years)', 'Ref. / Doc.', 'Country', 'Region',
                # 'Publisher', 'Coverage', 'Categories', 'Areas'

                sourceId = parse_int(row['Sourceid'])
                issn = row['Issn']
                if issn == '-':
                    issn = ''
                hIndex = parse_int(row['H index'])
                sjr = parse_float(row['SJR'])
                totalDocs = parse_int(row[f'Total Docs. ({year})'])
                totalDocs3Years = parse_int(row['Total Docs. (3years)'])
                totalRefs = parse_int(row['Total Refs.'])
                totalCites3Years = parse_int(row['Total Cites (3years)'])
                citableDocs3Years = parse_int(row['Citable Docs. (3years)'])
                citesPerDoc2Years = parse_float(row['Cites / Doc. (2years)'])
                refPerDoc = parse_float(row['Ref. / Doc.'])

                if sourceId not in journals:
                    # populate non-varying fields
                    journals[sourceId] = JournalInfo(
                        source_id=sourceId,
                        issn=issn,
                        title=row['Title'],
                        type=row['Type'],
                        country=row['Country'],
                        region=row['Region'],
                        publisher=row['Publisher'],
                        coverage=row['Coverage'],
                        categories=row['Categories'],
                        areas=row['Areas'],
                    )
                # populate yearly varying fields
                info = journals[sourceId]
                info.yearly[year] = JournalInfoYearly(
                    sjr=sjr,
                    hIndex=hIndex,
                    totalDocs=totalDocs,
                    totalDocs3Years=totalDocs3Years,
                    totalRefs=totalRefs,
                    totalCites3Years=totalCites3Years,
                    citableDocs3Years=citableDocs3Years,
                    citesPerDoc2Years=citesPerDoc2Years,
                    refPerDoc=refPerDoc,
                )

    print(f'Number of journals collected: {len(journals)}')
    return journals


def dump_to_json(journals: dict[int, JournalInfo]):
    """Write the consolidated dataset to a json file"""
    print('Writing to json')
    # default=vars serializes the JournalInfo objects via their __dict__
    with open(
        data_directory / 'scimagojr_combined_data.json', 'w', encoding='utf-8'
    ) as fp:
        json.dump(journals, fp, default=vars)


async def dump_into_database(journals: dict[int, JournalInfo]):
    """Save the consolidated dataset to the postgres database"""
    db = Prisma()
    await db.connect()

    # delete all existing yearly data (it's easier than updating); the upsert
    # below can then simply re-create the citationInfo rows in both branches
    await db.journalcitationinfoyearly.delete_many()

    for journal in journals.values():
        citation_info: list[JournalCitationInfoYearlyCreateWithoutRelationsInput] = [
            {
                'year': year,
                'docsThisYear': info.totalDocs,
                'docsPrevious3Years': info.totalDocs3Years,
                'citableDocsPrevious3Years': info.citableDocs3Years,
                'citesOutgoing': info.totalCites3Years,
                'citesOutgoingPerDoc': info.citesPerDoc2Years,
                'citesIncomingByRecentlyPublished': info.totalRefs,
                'citesIncomingPerDocByRecentlyPublished': info.refPerDoc,
                'sjrIndex': info.sjr,
            }
            for year, info in journal.yearly.items()
        ]

        await db.journal.upsert(
            where={'scimagoId': journal.source_id},
            data={
                'create': {
                    'scimagoId': journal.source_id,
                    'isCustom': False,
                    'name': journal.title,
                    'issn': journal.issn.split(','),
                    'country': journal.country,
                    'publisher': journal.publisher,
                    'areas': journal.areas.split(','),
                    'categories': journal.categories.split(','),
                    'hIndex': next(
                        iter(journal.yearly.values())
                    ).hIndex,  # they are constant
                    'citationInfo': {'create': citation_info},
                },
                'update': {
                    'scimagoId': journal.source_id,
                    'isCustom': False,
                    'name': journal.title,
                    'issn': journal.issn.split(','),
                    'country': journal.country,
                    'publisher': journal.publisher,
                    'areas': journal.areas.split(','),
                    'categories': journal.categories.split(','),
                    'hIndex': next(
                        iter(journal.yearly.values())
                    ).hIndex,  # they are constant
                    'citationInfo': {'create': citation_info},
                },
            },
        )

    await db.disconnect()


def find_duplicate_issn():
    """Find journals with duplicate issn"""
    journals = combine_data()
    issn_count: dict[str, list[str]] = {}
    for journal in journals.values():
        for issn in journal.issn.split(','):
            if issn == '':
                continue
            journal_list = issn_count.get(issn, [])
            journal_list.append(journal.title)
            issn_count[issn] = journal_list

    for issn, titles in issn_count.items():
        if len(titles) > 1:
            print(issn, titles)


def main(argv: list[str]):
    """Main function"""
    if len(argv) == 1:
        print("No arguments provided")
    elif argv[1] == "download":
        download_all_data()
    elif argv[1] == "json":
        dump_to_json(combine_data())
    elif argv[1] == "db":
        data = combine_data()
        asyncio.run(dump_into_database(data))
    elif argv[1] == "duplicates":
        find_duplicate_issn()
    else:
        print("Invalid argument provided")


if __name__ == "__main__":
    main(sys.argv)
1 change: 1 addition & 0 deletions server/api/index.ts
@@ -5,6 +5,7 @@ import { startServerAndCreateH3Handler } from '@as-integrations/h3'
import { defineCorsEventHandler } from '@nozomuikuta/h3-cors'
import http from 'http'
import 'reflect-metadata' // Needed for tsyringe
import 'json-bigint-patch' // Needed for bigint support in JSON
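// (json-bigint-patch monkey-patches the global JSON.parse/JSON.stringify to be
// BigInt-aware, so large integer values survive serialization)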
import { buildContext, Context } from '../context'
import { loadSchemaWithResolvers } from '../schema'
