-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
47 changed files
with
454 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import requests | ||
from bs4 import BeautifulSoup as Soup | ||
import re | ||
import pandas as pd | ||
import os | ||
|
||
# University abbreviation -> two-digit code. The code is interpolated into the
# "topic=19.22.<code>" query parameter of the page requested for each university.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}
|
||
# Create the output folders. The previous chain of os.mkdir() calls stopped at
# the first FileExistsError, so once "data" existed the two sub-folders were
# never created. os.makedirs(..., exist_ok=True) creates missing parents and
# is a no-op for folders that already exist.
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)
|
||
|
||
def _to_int_if_possible(value):
    """Return ``int(value)`` when that succeeds, otherwise ``value`` unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError):
        # None (from a "-" cell) and non-integer text such as "12.5" are kept.
        return value


def _table_rows(lines):
    """Split every line containing "|" into its inner cells (outer borders dropped)."""
    return [line.split("|")[1:-1] for line in lines if "|" in line]


def _label_continues_below(rows, index):
    """Tell whether ``rows[index]`` only carries the first part of a wrapped label.

    That is the case when the row has a text label, all of its value cells are
    empty strings, and a following non-empty row exists to receive the label.
    """
    row = rows[index]
    if index + 1 >= len(rows) or not rows[index + 1]:
        return False
    if len(row) < 2 or not isinstance(row[0], str):
        return False
    return all(cell == "" for cell in row[1:])


def _rows_to_columns(rows):
    """Map each row's first cell to its remaining cells, skipping rows without values."""
    return {row[0]: row[1:] for row in rows if any(cell != "" for cell in row[1:])}


# Labels that reach this script without their first letter, and their repaired form.
_OPTAG_LABEL_FIXES = {"achelor": "Bachelor", "andidat": "Kandidat", "alt": "Total"}

# Finance-act editions to fetch; each value is appended to "BEVPUBL.FL" in the URL.
urls = ["22-1", "18", "13A", "08A"]

for university in universities:
    for url in urls:
        print(f"Working on {url}")
        tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{universities[university]}&searchtype=3"
        # A timeout keeps a stalled connection from hanging the whole run, and
        # raise_for_status() reports HTTP errors instead of parsing an error page.
        response = requests.get(tmp_url, timeout=60)
        response.raise_for_status()
        tmp_decode = response.content.decode('iso8859_10')
        tmp_decode = tmp_decode.replace("<B>", "").replace("</B>", "")
        tmp_list = tmp_decode.split("\r\n")

        # Locate the lines that introduce the two sections of interest. When
        # several lines match, the last match decides (as before).
        optage_num = 0
        virk_num = 0
        for line in tmp_list:
            if "Optagelsesskøn" in line:
                optage_num = tmp_list.index(line)
                print(optage_num)
            elif "Virksomhedsoversigt" in line:
                virk_num = tmp_list.index(line)

        optag = _table_rows(tmp_list[optage_num:virk_num])
        virk = _table_rows(tmp_list[virk_num:])

        if len(virk) < 2 or not virk[0] or not virk[1]:
            # Without the two header rows there is nothing to export for this page.
            print(f"No 'Virksomhedsoversigt' table found for {university} {url}; skipping")
            continue

        # The first cell of the two header rows becomes the column label.
        if len(optag) >= 2 and optag[0] and optag[1]:
            optag[0][0] = "Finanslov"
            optag[1][0] = "År"
        virk[0][0] = "Finanslov"
        virk[1][0] = "År"

        # Rows are cleaned in order and in place: a wrapped label is pushed
        # into the next row before that row is cleaned itself.
        for i, row in enumerate(optag):
            for j, raw in enumerate(row):
                cell = raw.replace(".", "").strip()
                if cell == "-":
                    cell = None  # previously a no-op comparison (`== None`)
                cell = _to_int_if_possible(cell)
                row[j] = _OPTAG_LABEL_FIXES.get(cell, cell)
            # Blank cells are "" after cleaning, so test against "" (the old
            # comparison with " " could never match a multi-cell row).
            if _label_continues_below(optag, i):
                optag[i + 1][0] = row[0] + " " + optag[i + 1][0]

        act_num = 0
        for i, row in enumerate(virk):
            for j, raw in enumerate(row):
                # Drop ".", strip and collapse whitespace, then turn "," into ".".
                cell = " ".join(raw.replace(".", "").split()).replace(",", ".")
                if cell == "-":
                    cell = None
                cell = _to_int_if_possible(cell)
                if cell == "ktiviteter":
                    cell = "Aktiviteter"
                if cell == "Aktiviteter":
                    act_num = i
                row[j] = cell
            if _label_continues_below(virk, i):
                virk[i + 1][0] = row[0] + " " + virk[i + 1][0]
            # NOTE(review): the source had lost its indentation; this check is
            # assumed to run once per row against the row's last cell, as the
            # statement order suggests. Confirm against the original file.
            if row and row[-1] == "alt omsætning (mio kr)":
                row[-1] = "Total omsætning (mio kr)"

        # Rows after the "Aktiviteter" row form the activity table, rows before
        # it the financial overview. NOTE(review): when no such row is found
        # act_num stays 0 and the overview ends up empty - confirm this is intended.
        aktivitet = virk[act_num + 1:]
        virk = virk[:act_num]

        virk_dict = _rows_to_columns(virk)
        optag_dict = _rows_to_columns(optag)
        aktivitet_dict = _rows_to_columns(aktivitet)

        virk_df = pd.DataFrame(virk_dict)
        # TODO: the optag/aktivitet tables are prepared above but not exported yet.
        # optag_df = pd.DataFrame(optag_dict)
        # aktivitet_df = pd.DataFrame(aktivitet_dict)
        # aktivitet_df['År'] = virk_df['År']
        # aktivitet_df['Finanslov'] = virk_df['Finanslov']
        virk_df.to_csv(f"{university}_virk_{url}.csv", index=False, sep=";")
        # optag_df.to_csv(f"{university}_optag_{url}.csv",index=False,sep=";")
        # aktivitet_df.to_csv(f"{university}_aktivitet_{url}.csv",index=False,sep=";")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import requests | ||
from bs4 import BeautifulSoup as Soup | ||
import re | ||
import pandas as pd | ||
import os | ||
|
||
# University abbreviation -> two-digit code. The code is interpolated into the
# "topic=19.22.<code>" query parameter of the page requested for each university.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}
|
||
# Create the output folders. The previous chain of os.mkdir() calls stopped at
# the first FileExistsError, so once "data" existed the two sub-folders were
# never created. os.makedirs(..., exist_ok=True) creates missing parents and
# is a no-op for folders that already exist.
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)
|
||
|
||
def _clean_cell(text):
    """Normalise one raw table cell.

    All "." characters are removed, surrounding whitespace is stripped, inner
    whitespace runs collapse to a single space and "," becomes ".". A lone "-"
    yields None; text that parses as an integer is returned as int; anything
    else is returned as the cleaned string.
    """
    cleaned = " ".join(text.replace(".", "").split()).replace(",", ".")
    if cleaned == "-":
        return None
    try:
        return int(cleaned)
    except ValueError:
        # Not an integer (label text, "", or a decimal such as "12.5").
        return cleaned


def _label_continues_below(rows, index):
    """Tell whether ``rows[index]`` only carries the first part of a wrapped label.

    That is the case when the row has a text label, all of its value cells are
    empty strings, and a following non-empty row exists to receive the label.
    """
    row = rows[index]
    if index + 1 >= len(rows) or not rows[index + 1]:
        return False
    if len(row) < 2 or not isinstance(row[0], str):
        return False
    return all(cell == "" for cell in row[1:])


# Finance-act editions to fetch; each value is appended to "BEVPUBL.FL" in the URL.
urls = ["22-1", "18", "13A", "08A"]

for university in universities:
    for url in urls:
        print(f"Working on {url}")
        tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{universities[university]}&searchtype=3"
        # A timeout keeps a stalled connection from hanging the whole run, and
        # raise_for_status() reports HTTP errors instead of parsing an error page.
        response = requests.get(tmp_url, timeout=60)
        response.raise_for_status()
        tmp_decode = response.content.decode('iso8859_10')
        tmp_decode = tmp_decode.replace("<B>", "").replace("</B>", "")
        tmp_list = tmp_decode.split("\r\n")

        # Find the line that introduces the overview table. When several lines
        # match, the last match decides (as before).
        virk_num = 0
        for line in tmp_list:
            if "Optagelsesskøn" in line:
                print(tmp_list.index(line))
            elif "Virksomhedsoversigt" in line:
                virk_num = tmp_list.index(line)

        # Every line with "|" from that point on is a table row; the outer
        # (border) fragments of the split are dropped.
        virk = [line.split("|")[1:-1] for line in tmp_list[virk_num:] if "|" in line]
        if len(virk) < 2 or not virk[0] or not virk[1]:
            # Without the two header rows there is nothing to export for this page.
            print(f"No 'Virksomhedsoversigt' table found for {university} {url}; skipping")
            continue

        # The first cell of the two header rows becomes the column label.
        virk[0][0] = "Finanslov"
        virk[1][0] = "År"

        # Rows are cleaned in order and in place: a wrapped label is pushed
        # into the next row before that row is cleaned itself.
        act_num = 0
        for i, row in enumerate(virk):
            for j, raw in enumerate(row):
                cell = _clean_cell(raw)
                if cell == "ktiviteter":
                    cell = "Aktiviteter"
                if cell == "Aktiviteter":
                    act_num = i
                row[j] = cell
            if _label_continues_below(virk, i):
                virk[i + 1][0] = row[0] + " " + virk[i + 1][0]
            # NOTE(review): the source had lost its indentation; this check is
            # assumed to run once per row against the row's last cell, as the
            # statement order suggests. Confirm against the original file.
            if row and row[-1] == "alt omsætning (mio kr)":
                row[-1] = "Total omsætning (mio kr)"

        # Keep only the rows above the "Aktiviteter" row. NOTE(review): when no
        # such row is found act_num stays 0 and nothing is kept - confirm intended.
        virk = virk[:act_num]

        # One column per row label; rows whose value cells are all empty are skipped.
        virk_dict = {row[0]: row[1:] for row in virk if any(cell != "" for cell in row[1:])}

        virk_df = pd.DataFrame(virk_dict)
        virk_df.to_csv(f"./data/raw/{university}_virk_{url}.csv", index=False, sep=";")
File renamed without changes.
File renamed without changes.
69 changes: 69 additions & 0 deletions
69
.history/finanslov/merge_years_and_universities_20220415225138.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import pandas as pd | ||
|
||
# University abbreviations; each is used in the raw CSV file name and written
# to the "Universitet" column.
universities = [
    "AU",
    "AAU",
    "CBS",
    "DTU",
    "ITU",
    "KU",
    "RUC",
    "SDU",
]

# (finance-act edition, multiplier applied to every financial column).
# NOTE(review): the multipliers look like 100 / price index for the edition's
# base year - confirm the source of the index values.
years = [
    ("08A", 100 / 89.9),
    ("13A", 100 / 98.6),
    ("18", 100 / 102.6),
    ("22-1", 100 / 105.4),
]

# Raw column label -> harmonised column name. Several raw labels map to the
# same harmonised name.
column_names = {
    "I alt omsætning (mio kr)": "Total Omsætning",
    "alt omsætning (mio kr)": "Total Omsætning",
    "Uddannelsestilskud fra UBST": "Uddannelsestilskud",
    "Uddannelsestilskud fra UI": "Uddannelsestilskud",
    "Heltidsuddannelse": "Uddannelsestaxameter (Heltid)",
    "Deltidsuddannelse": "Uddannelsestaxameter (Deltid)",
    "Basistilskud fra UBST": "Basistilskud",
    "Basistilskud fra UI": "Basistilskud",
}

# Columns holding amounts; these are the ones multiplied per edition.
financial_columns = [
    "Total Omsætning",
    "Uddannelsestilskud",
    "Uddannelsestaxameter (Heltid)",
    "Uddannelsestaxameter (Deltid)",
    "Basistilskud",
    "Tilskudsfinansieret forskning",
]

# Columns of the merged output, in order: identifying columns, then the amounts.
column_list = ["År", "Finanslov", "Universitet"] + financial_columns
|
||
# Empty frame with the output columns; used as the result when nothing is read.
base_dict = {i: [] for i in column_list}
base_df = pd.DataFrame(base_dict)

# Collect one filtered frame per (university, edition) and concatenate once at
# the end. DataFrame.append was removed in pandas 2.0, and a single concat
# also avoids re-copying the growing frame on every iteration.
frames = []
for university in universities:
    for year in years:
        in_df = pd.read_csv(f"./data/raw/{university}_virk_{year[0]}.csv", sep=";", index_col=False)
        in_df["Universitet"] = university
        in_df = in_df.rename(columns=column_names)
        for i in financial_columns:
            # Apply the edition's multiplier and scale by 10**6 (the raw
            # labels read "mio kr").
            in_df[i] = in_df[i] * year[1] * 10 ** 6
        in_df = in_df[column_list]
        if year[0] != "22-1":
            # Rows for 2016 are only taken from the "22-1" edition.
            in_df = in_df[in_df["År"] != 2016]
        # Work on a copy so the column assignment below does not write into a
        # view of the previous frame.
        in_df = in_df.copy()
        in_df["År"] = in_df["År"].astype("int")
        # Only rows whose "Finanslov" value is "R" go into the merged output.
        frames.append(in_df[in_df["Finanslov"] == "R"])

if frames:
    base_df = pd.concat(frames, ignore_index=True)
base_df.to_csv("./data/processed/Universitetsfinansiering_2002-2021.csv", sep=";", index=False)
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import requests | ||
from bs4 import BeautifulSoup as Soup | ||
import re | ||
import pandas as pd | ||
import os | ||
|
||
# University abbreviation -> two-digit code. The code is interpolated into the
# "topic=19.22.<code>" query parameter of the page requested for each university.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}
|
||
# Create the output folders. The previous chain of os.mkdir() calls stopped at
# the first FileExistsError, so once "data" existed the two sub-folders were
# never created. os.makedirs(..., exist_ok=True) creates missing parents and
# is a no-op for folders that already exist.
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)
|
||
|
||
def _clean_cell(text):
    """Normalise one raw table cell.

    All "." characters are removed, surrounding whitespace is stripped, inner
    whitespace runs collapse to a single space and "," becomes ".". A lone "-"
    yields None; text that parses as an integer is returned as int; anything
    else is returned as the cleaned string.
    """
    cleaned = " ".join(text.replace(".", "").split()).replace(",", ".")
    if cleaned == "-":
        return None
    try:
        return int(cleaned)
    except ValueError:
        # Not an integer (label text, "", or a decimal such as "12.5").
        return cleaned


def _label_continues_below(rows, index):
    """Tell whether ``rows[index]`` only carries the first part of a wrapped label.

    That is the case when the row has a text label, all of its value cells are
    empty strings, and a following non-empty row exists to receive the label.
    """
    row = rows[index]
    if index + 1 >= len(rows) or not rows[index + 1]:
        return False
    if len(row) < 2 or not isinstance(row[0], str):
        return False
    return all(cell == "" for cell in row[1:])


# Finance-act editions to fetch; each value is appended to "BEVPUBL.FL" in the URL.
urls = ["22-1", "18", "13A", "08A"]

for university in universities:
    for url in urls:
        print(f"Working on {url}")
        tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{universities[university]}&searchtype=3"
        # A timeout keeps a stalled connection from hanging the whole run, and
        # raise_for_status() reports HTTP errors instead of parsing an error page.
        response = requests.get(tmp_url, timeout=60)
        response.raise_for_status()
        tmp_decode = response.content.decode('iso8859_10')
        tmp_decode = tmp_decode.replace("<B>", "").replace("</B>", "")
        tmp_list = tmp_decode.split("\r\n")

        # Find the line that introduces the overview table. When several lines
        # match, the last match decides (as before).
        virk_num = 0
        for line in tmp_list:
            if "Optagelsesskøn" in line:
                print(tmp_list.index(line))
            elif "Virksomhedsoversigt" in line:
                virk_num = tmp_list.index(line)

        # Every line with "|" from that point on is a table row; the outer
        # (border) fragments of the split are dropped.
        virk = [line.split("|")[1:-1] for line in tmp_list[virk_num:] if "|" in line]
        if len(virk) < 2 or not virk[0] or not virk[1]:
            # Without the two header rows there is nothing to export for this page.
            print(f"No 'Virksomhedsoversigt' table found for {university} {url}; skipping")
            continue

        # The first cell of the two header rows becomes the column label.
        virk[0][0] = "Finanslov"
        virk[1][0] = "År"

        # Rows are cleaned in order and in place: a wrapped label is pushed
        # into the next row before that row is cleaned itself.
        act_num = 0
        for i, row in enumerate(virk):
            for j, raw in enumerate(row):
                cell = _clean_cell(raw)
                if cell == "ktiviteter":
                    cell = "Aktiviteter"
                if cell == "Aktiviteter":
                    act_num = i
                row[j] = cell
            if _label_continues_below(virk, i):
                virk[i + 1][0] = row[0] + " " + virk[i + 1][0]
            # NOTE(review): the source had lost its indentation; this check is
            # assumed to run once per row against the row's last cell, as the
            # statement order suggests. Confirm against the original file.
            if row and row[-1] == "alt omsætning (mio kr)":
                row[-1] = "Total omsætning (mio kr)"

        # Keep only the rows above the "Aktiviteter" row. NOTE(review): when no
        # such row is found act_num stays 0 and nothing is kept - confirm intended.
        virk = virk[:act_num]

        # One column per row label; rows whose value cells are all empty are skipped.
        virk_dict = {row[0]: row[1:] for row in virk if any(cell != "" for cell in row[1:])}

        virk_df = pd.DataFrame(virk_dict)
        virk_df.to_csv(f"./data/raw/{university}_virk_{url}.csv", index=False, sep=";")
File renamed without changes.
File renamed without changes.
Oops, something went wrong.