-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcsvConverter.py
73 lines (63 loc) · 3.04 KB
/
csvConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sys
import json
import csv
from lxml import html
# Build output.csv with one row per project listed in the file named by
# sys.argv[1].  Each row merges repository facts from
# data/<project>/fromGithub.json with the metrics embedded in
# data/<project>/index.html.  Projects missing either file are reported on
# stdout and skipped.

# Column order of output.csv: fields derived from fromGithub.json first, then
# the metric names expected from the metrics report.
fieldnames = [
    "name", "owner", "url", "version", "creation date", "stars", "watches",
    "forks", "contributors", "languages", "open issues", "closed issues",
    "last year commit #", "description", "readme",
    "Propagation Cost", "Architecture Type", "Core Size", "Central Size",
    "Lines of Code (LOC)", "Comment/Code Ratio", "Classes", "Files",
    "Median LOC per File", "Files > 200 LOC", "Functions > 200 LOC",
    "Median CBO", "CBO > 8", "Median WMC", "WMC > 12", "Median WMC-McCabe",
    "WMC-McCabe > 100", "Median RFC", "RFC > 30",
]


def _most_used_language(languages):
    """Return the key of *languages* that has the largest value.

    *languages* is the mapping stored under "languages" in fromGithub.json
    (presumably language name -> byte count from the GitHub API; confirm
    against the collector).  Ties resolve to the first entry in iteration
    order.  An empty mapping yields "" instead of raising IndexError.
    """
    if not languages:
        return ""
    return max(languages.items(), key=lambda item: item[1])[0]


def _first_prose_paragraph(readme):
    """Return the first README paragraph that looks like a plain sentence.

    Paragraphs are separated by blank lines.  A paragraph qualifies when,
    after stripping whitespace, it is non-empty, does not start with '|',
    '<' or '>' and ends with a period.  Returns None when none qualifies.
    """
    for para in readme.split("\n\n"):
        par = para.strip()
        # Runs of three or more newlines produce empty paragraphs; test for
        # emptiness first so par[0] / par[-1] cannot raise IndexError.
        if par and par[0] not in "|<>" and par[-1] == '.':
            return par
    return None


def _github_fields(data):
    """Map the parsed fromGithub.json document *data* to CSV columns.

    Raises KeyError when an expected key is absent from *data*.
    """
    repo = data["repo"]
    output = {
        "name": repo["name"],
        "owner": repo["owner"]["login"],
        "version": repo["pushed_at"],
        "stars": repo["stargazers_count"],
        "watches": repo["subscribers_count"],
        "forks": repo["forks_count"],
        "contributors": data["contributors_count"],
        "open issues": repo["open_issues_count"],
        "closed issues": data["closed_issues_count"],
        "creation date": repo["created_at"],
        "description": repo["description"],
        # get the most-used language
        "languages": _most_used_language(data["languages"]),
        "last year commit #": sum(
            entry["total"] for entry in data["commit_activity"]),
    }
    output["url"] = (
        "https://github.com/" + output["owner"] + "/" + output["name"])
    readme = _first_prose_paragraph(data["readme"])
    # When no paragraph qualifies the key stays unset and DictWriter writes
    # an empty cell for it.
    if readme is not None:
        output["readme"] = readme
    return output


def _read_understand_metrics(path):
    """Return {metric name: value} parsed from the report at *path*.

    The report carries a "metrics=<JSON array>;" assignment inside its first
    <head> child; every entry except "Project Name" is returned.

    Raises IOError when the file cannot be opened or read.
    """
    with open(path) as htmlfile:
        # grab <head><script>: the first child of the root's first child
        # NOTE(review): under Python 3 lxml returns bytes here and the str
        # split below would fail; this script targets Python 2.
        block = html.tostring(html.parse(htmlfile).getroot()[0][0])
    # Keep only the JSON text between "metrics=" and the next ';'.
    metrics = json.loads(block.split("metrics=")[1].split(';')[0])
    return {m['name']: m['value']
            for m in metrics if m['name'] != "Project Name"}


# read in project names
with open(sys.argv[1]) as projectfile:
    projects = projectfile.read().splitlines()

with open("output.csv", "w") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for proj in projects:
        # A blank line names no project; skip it instead of failing on the
        # split below.
        if not proj.strip():
            continue
        # Entries have the form "<owner>/<name>"; data directories are keyed
        # by the part after the first slash.
        p = proj.split('/')[1]

        try:
            with open("data/" + p + "/fromGithub.json") as jsonfile:
                data = json.load(jsonfile)
        except IOError:
            print("no JSON file for: " + p)
            continue
        output = _github_fields(data)

        # Understand metrics
        try:
            output.update(_read_understand_metrics("data/" + p + "/index.html"))
        except IOError:
            print("no metrics file for: " + p)
            continue

        # NOTE: DictWriter raises ValueError if a metric name is not listed
        # in fieldnames; that is left to surface so new metrics are noticed.
        try:
            writer.writerow(output)
        except UnicodeEncodeError as e:
            print("%s %s" % (p, e))