# fetch_napari_data.py
import os
import re

import requests
import pandas as pd


def fetch_conda(plugin_name):
    """ Fetches Conda info for a given plugin from the npe2api service """
    conda_url = f'https://npe2api.vercel.app/api/conda/{plugin_name}'
    response = requests.get(conda_url)
    if response.status_code != 200:
        print(f"Failed to fetch Conda info for {plugin_name}")
        return None
    return response.json()


def fetch_plugin(url):
    """ Fetches plugin summary from the given URL """
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch {url}")
        return None


def fetch_manifest(plugin_name):
    """ Fetches the manifest data for a given plugin """
    url = f'https://npe2api.vercel.app/api/manifest/{plugin_name}'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Failed to fetch manifest info for {plugin_name}")
        return None
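
# Hedged usage sketch ('napari-svg' is only an illustrative plugin name; the
# exact JSON shapes depend on the live npe2api service):
#   conda_info = fetch_conda('napari-svg')        # dict of Conda metadata, or None
#   manifest_info = fetch_manifest('napari-svg')  # dict of npe2 manifest data, or None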


def flatten_and_merge(original, additional, parent_key=''):
    """ Flattens an additional nested dictionary and merges it into the original dictionary """
    for key, value in additional.items():
        new_key = f"{parent_key}_{key}" if parent_key else key
        if isinstance(value, dict):
            flatten_and_merge(original, value, new_key)
        elif isinstance(value, list) and all(isinstance(elem, dict) for elem in value):
            # Handle a list of dictionaries separately
            for i, elem in enumerate(value):
                flatten_and_merge(original, elem, f"{new_key}_{i}")
        else:
            original.setdefault(new_key, value)
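
# Worked example: flatten_and_merge(d, {'b': {'c': 2}}) on d = {'a': 1}
# leaves d == {'a': 1, 'b_c': 2}. Because setdefault() is used, keys that
# already exist in the original dictionary are never overwritten.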


def build_plugins_dataframe():
    """ Builds a DataFrame with one row per plugin, merging summary, Conda, and manifest data """
    summary_url = 'https://npe2api.vercel.app/api/summary'
    plugin_summary = fetch_plugin(summary_url)
    if not plugin_summary:
        return pd.DataFrame()
    all_plugin_data = []
    for plugin in plugin_summary:
        plugin_data = plugin.copy()
        plugin_name = plugin.get('name')
        # Fetch and flatten Conda info and Manifest
        conda_info = fetch_conda(plugin_name)
        manifest_info = fetch_manifest(plugin_name)
        if conda_info:
            flatten_and_merge(plugin_data, conda_info)
        if manifest_info:
            flatten_and_merge(plugin_data, manifest_info)
        all_plugin_data.append(plugin_data)
    df = pd.DataFrame(all_plugin_data)
    return df
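
# Usage note: build_plugins_dataframe() returns one row per plugin listed by
# the summary endpoint; the columns are the union of the summary keys plus the
# flattened Conda and manifest keys (e.g. 'package_metadata_author_email').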


# Define the function to extract the author's name from the email field
def extract_author_name(email):
    if not isinstance(email, str):
        return ''
    # Split the string by comma to process multiple authors
    authors = email.split(',')
    clean_authors = []
    for author in authors:
        # Use regex to find name patterns before the email
        match = re.match(r'(.*)<.*?>', author)
        if match:
            # Remove surrounding quotation marks using strip
            clean_author = match.group(1).replace('"', '').strip()
            clean_authors.append(clean_author)
        else:
            clean_authors.append(author.replace('"', '').strip())
    # Return the list of clean author names
    return ', '.join(clean_authors)
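
# Worked example:
#   extract_author_name('"Jane Doe" <jane@example.com>, Bob <bob@example.com>')
# returns 'Jane Doe, Bob'. Note the simple comma split assumes commas only
# separate authors, not parts of a quoted name.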


# Function to classify the 'home' column
def classify_website(home_url):
    if pd.notnull(home_url):
        if 'pypi.org' in home_url:
            return 'pypi'
        elif 'github.com' in home_url:
            return 'github'
    return 'other'
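
# Worked example: classify_website('https://github.com/user/repo') -> 'github',
# classify_website('https://pypi.org/project/foo') -> 'pypi', and anything
# else (including NaN) -> 'other'.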


#########################################
## main starts
#########################################

df_plugins = build_plugins_dataframe()

# Drop columns with fewer than 20 non-NaN values
df_plugins.dropna(axis=1, thresh=20, inplace=True)

# Create a dictionary of column names and their non-NaN counts
column_counts = {column: df_plugins[column].count() for column in df_plugins.columns}

# Sort the dictionary by counts and print
for column, count in sorted(column_counts.items(), key=lambda item: item[1], reverse=True):
    print(column, count)

# Make sure the output directory exists, then save the cleaned DataFrame to a CSV file
os.makedirs('./data', exist_ok=True)
df_plugins.to_csv('./data/cleaned_napari_plugins.csv')

Plugin_page_columns = ['display_name', 'version', 'created_at', 'modified_at', 'name',
                       'author',  # 'package_metadata_author'
                       'package_metadata_author_email',
                       'license',  # 'package_metadata_license'
                       'home',  # 'home_page', 'package_metadata_home_page'
                       'summary',  # 'package_metadata_summary'
                       'package_metadata_requires_python',
                       'package_metadata_requires_dist',
                       'package_metadata_description',
                       'package_metadata_classifier',
                       'package_metadata_project_url',
                       'contributions_readers_0_command',
                       'contributions_writers_0_command',
                       'contributions_widgets_0_command',
                       'contributions_sample_data_0_command',
                       'contributions_readers_0_filename_patterns',
                       'contributions_writers_0_filename_extensions',
                       'contributions_writers_1_filename_extensions']

df_plugins = df_plugins[Plugin_page_columns].copy()

# Convert and format 'created_at' and 'modified_at' columns
df_plugins['created_at'] = pd.to_datetime(df_plugins['created_at']).dt.date
df_plugins['modified_at'] = pd.to_datetime(df_plugins['modified_at']).dt.date

# Apply the function to the 'home' column to classify the websites
df_plugins['home_type'] = df_plugins['home'].apply(classify_website)

# Create the new columns based on the classification
df_plugins['home_pypi'] = df_plugins['home'].where(df_plugins['home_type'] == 'pypi', '')
df_plugins['home_github'] = df_plugins['home'].where(df_plugins['home_type'] == 'github', '')
df_plugins['home_other'] = df_plugins['home'].where(df_plugins['home_type'] == 'other', '')

# Drop 'home_type' now that the split columns exist
df_plugins.drop('home_type', axis=1, inplace=True)
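
# Illustrative result: a plugin whose home is 'https://github.com/user/repo'
# ends up with that URL in home_github while home_pypi and home_other are
# empty strings (home_pypi is backfilled with a pypi.org link in the loop below).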

for index, row in df_plugins.iterrows():
    # Replace 'author' when it is NaN or contains quotation marks, using the
    # name extracted from 'package_metadata_author_email'
    if pd.isna(row['author']) or '"' in str(row['author']):
        df_plugins.at[index, 'author'] = extract_author_name(row['package_metadata_author_email'])
    # Shorten verbose license strings to their common short forms
    if pd.notna(row['license']):
        if "BSD 3-Clause" in str(row['license']):
            df_plugins.at[index, 'license'] = "BSD 3-Clause"
        elif "MIT License" in str(row['license']):
            df_plugins.at[index, 'license'] = "MIT"
    # Fill home_pypi with the plugin's PyPI page when no pypi URL was found
    if not row['home_pypi']:
        df_plugins.at[index, 'home_pypi'] = f"https://pypi.org/project/{row['name']}"

df_plugins.to_csv('./data/final_plugins.csv')