Skip to content

Commit

Permalink
Implement person-specific date ranges when scraping ORCID (#18)
Browse files Browse the repository at this point in the history
* ✨ add start and end dates to each ORCID, allowing papers from before someone joined or after they left to be excluded (implements #13)

* 🔥 remove Marco's papers after 2024-12-31
  • Loading branch information
M4GNV5 authored Jan 24, 2025
1 parent 9ad9458 commit 47dd92f
Showing 1 changed file with 22 additions and 12 deletions.
34 changes: 22 additions & 12 deletions generate-papers-page.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
from datetime import date

#from pyorcid import OrcidAuthentication, Orcid
from pyorcid import OrcidScrapper
Expand Down Expand Up @@ -26,27 +27,32 @@
"""

# generate this using `cat content/_index.md | grep -Eo 'orcid.org/[0-9A-Z-]+'`
# Start and end dates (ISO format, inclusive) are added manually so that publications
# from before someone joined or after someone left the research group can be excluded.
orcids = [
'0000-0002-6930-9271',
'0009-0001-1594-2119',
'0000-0002-5597-3913',
'0000-0003-0439-066X',
'0000-0003-2441-7962',
'0009-0006-4383-5683',
'0009-0005-0529-0996',
'0009-0009-7414-1360',
'0009-0006-7088-8684',
'0000-0002-2871-3905',
('0000-0002-6930-9271', '1970-01-01', '9999-12-01'),
('0009-0001-1594-2119', '1970-01-01', '9999-12-01'),
('0000-0003-0439-066X', '1970-01-01', '9999-12-01'),
('0000-0003-2441-7962', '1970-01-01', '9999-12-01'),
('0009-0005-0529-0996', '1970-01-01', '9999-12-01'),
('0009-0006-7088-8684', '1970-01-01', '9999-12-01'),
('0009-0009-7414-1360', '1970-01-01', '9999-12-01'),
('0009-0003-6679-3672', '1970-01-01', '9999-12-01'),
('0009-0000-8446-7641', '1970-01-01', '9999-12-01'),
('0000-0002-5597-3913', '1970-01-01', '2024-12-31'), # Kevin Mayer
('0009-0006-4383-5683', '1970-01-01', '2024-12-31'), # Marco Michl
]

#orcid_auth = OrcidAuthentication(client_id=client_id, client_secret=client_secret)
#access_token = orcid_auth.get_public_access_token()

known_works = []
papers = []
for orcid in orcids:
for orcid, start_date, end_date in orcids:
#orcid = Orcid(orcid_id=orcid, orcid_access_token=access_token, state="public")
orcid = OrcidScrapper(orcid)
start_date = date.fromisoformat(start_date)
end_date = date.fromisoformat(end_date)
for work in orcid.works()[0]:
date_split = work['publication-date'].split('/')
try:
Expand All @@ -62,13 +68,17 @@
print(f"could not parse date: {date_split} for {work['title']}", file=sys.stderr)
continue

publication_date = date.fromisoformat(f"{year}-{month or 6:02d}-15")
if publication_date > end_date or publication_date < start_date:
continue

if work['url'] in known_works or work['title'] in known_works:
continue

if work['url'] is None:
known_works.append(work['title'])
else:
known_works.append(work["url"])
known_works.append(work['url'])

papers.append({
"order": year * 100 + month,
Expand Down

0 comments on commit 47dd92f

Please sign in to comment.