-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathGetCItationV2.1.py
137 lines (110 loc) · 4.79 KB
/
GetCItationV2.1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
import csv
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def get_paper_details(paper_id):
    """
    Look up a paper on the Semantic Scholar Graph API.

    Only the ``citationCount`` field is requested.

    Args:
        paper_id (str): The paperId to look up.

    Returns:
        dict: The decoded JSON body when the API answers with HTTP 200;
        an empty dict when the status is anything else or the request
        raises (the problem is printed instead of being propagated).
    """
    endpoint = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    query = {"fields": "citationCount"}

    # Stays empty unless the request succeeds and decodes cleanly.
    details = {}
    try:
        reply = requests.get(endpoint, params=query, timeout=10)
        if reply.status_code != 200:
            print(f"Error: Status code {reply.status_code} - {reply.text}")
        else:
            details = reply.json()
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")

    return details
def fetch_citations(paper_id, fields=None, limit=1000, start_offset=0):
    """
    Lazily fetch the citations of a paper from the Semantic Scholar API.

    This is a generator: every iteration performs one paginated request and
    yields that page, so callers can process results as they arrive instead
    of holding every citation in memory.

    Args:
        paper_id (str): The paperId for which to fetch citations.
        fields (str, optional): A comma-separated list of fields to include
            in the response.
        limit (int): Maximum number of citations per API request
            (default is 1000).
        start_offset (int): The offset to start fetching from.

    Yields:
        list: One page of citation dictionaries per request. Iteration ends
        after an empty page or a page shorter than ``limit``, or after a
        non-200 response or request error (which is printed, not raised).
    """
    base_url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations"
    offset = start_offset

    # Retry transient gateway errors and rate-limit responses with backoff.
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504, 429])

    # The context manager closes the session (and its pooled connections)
    # when the generator finishes or is closed early by the caller.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))

        while True:
            params = {"offset": offset, "limit": limit}
            if fields:
                params["fields"] = fields

            try:
                response = session.get(base_url, params=params, timeout=10)
                if response.status_code != 200:
                    print(f"Error: Status code {response.status_code} - {response.text}")
                    return
                # "data" may be absent or null; treat both as an empty page.
                citations = response.json().get("data") or []
            except requests.exceptions.RequestException as e:
                print(f"Error: {e}")
                return

            yield citations
            offset += len(citations)

            # An empty or short page means there is nothing left to fetch.
            # The explicit emptiness check also prevents an endless loop
            # when ``limit`` is zero or negative.
            if not citations or len(citations) < limit:
                return

            # Delay to avoid rate-limiting
            time.sleep(1)
def _or_unknown(value):
    """Return ``value``, or the placeholder "Unknown" when it is None."""
    return "Unknown" if value is None else value


def save_citations_to_csv(citations, filename):
    """
    Append citation information to a CSV file.

    A header row is written first when the file is empty. Each citation
    contributes one row: title, comma-joined author names, publication year.
    Fields that are missing or explicitly ``None`` (JSON ``null``) are
    written as "Unknown"; a missing or ``None`` author list yields an empty
    authors cell.

    Args:
        citations (list): A list of citation dictionaries, each optionally
            holding a "citingPaper" dictionary.
        filename (str): The name of the CSV file to append to.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # In append mode the position starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if file.tell() == 0:
            writer.writerow(["Paper Title", "Authors", "Publication Year"])

        for citation in citations:
            # dict.get's default only covers absent keys, so explicit None
            # values are normalised separately with `or` / _or_unknown.
            citing_paper = citation.get("citingPaper") or {}
            title = _or_unknown(citing_paper.get("title"))
            year = _or_unknown(citing_paper.get("year"))
            authors = ", ".join(
                str(_or_unknown(author.get("name")))
                for author in (citing_paper.get("authors") or [])
            )
            writer.writerow([title, authors, year])
# Main program
if __name__ == "__main__":
    import os  # needed only by the script entry point

    paper_id = "c4c45661501c16064eead6e5d37dcb80d41c7a78"  # Replace with your paperId
    fields = "citingPaper.title,citingPaper.authors,citingPaper.year"
    output_file = "paper/video/mid.csv"

    # Opening a file for append fails if its directory does not exist, so
    # create the parent directory before any citations are fetched.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Get the total citation count for the paper
    paper_details = get_paper_details(paper_id)
    # `or 0` also covers an explicit null citationCount in the response.
    total_citations = paper_details.get("citationCount", 0) or 0
    print(f"Total citations to fetch: {total_citations}")

    # Fetch citations in a paginated manner and save to CSV
    print("Fetching citations...")
    total_fetched = 0
    start_offset = 0

    while total_fetched < total_citations:
        batch_fetched = 0
        fetched_this_pass = 0

        for citations_batch in fetch_citations(paper_id, fields=fields, start_offset=start_offset):
            if not citations_batch:
                break

            print(f"Saving {len(citations_batch)} citations to {output_file}...")
            save_citations_to_csv(citations_batch, output_file)

            batch_size = len(citations_batch)
            total_fetched += batch_size
            batch_fetched += batch_size
            fetched_this_pass += batch_size
            start_offset += batch_size

            if batch_fetched >= 9000:
                print("Reached 9000 citations in this batch, writing to CSV and waiting before continuing...")
                time.sleep(30)  # Wait before continuing to avoid rate limits
                batch_fetched = 0

        # A pass that retrieved nothing (API error, or fewer citations
        # available than citationCount reports) can never reach the target;
        # stop instead of re-requesting the same offset forever.
        if fetched_this_pass == 0:
            print(f"No more citations could be retrieved; stopping at {total_fetched} of {total_citations}.")
            break

    print("Done!")