Skip to content

Commit

Permalink
Minor commit
Browse files Browse the repository at this point in the history
  • Loading branch information
NKofod committed Apr 15, 2022
1 parent 35868f0 commit 1691100
Show file tree
Hide file tree
Showing 47 changed files with 454 additions and 0 deletions.
135 changes: 135 additions & 0 deletions .history/finanslov/fetch_udd_dat_20220415224812.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import requests
from bs4 import BeautifulSoup as Soup
import re
import pandas as pd
import os

# University abbreviation -> code substituted into the ``topic=19.22.<code>``
# query parameter of the request URL.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}

# Values appended to ``BEVPUBL.FL`` in the ``book`` query parameter.
finance_act_books = ["22-1", "18", "13A", "08A"]


def ensure_data_dirs(base="data"):
    """Create ``base``, ``base/raw`` and ``base/processed`` if they are missing.

    ``exist_ok=True`` matters: with a plain ``os.mkdir`` chain, an already
    existing ``base`` raised before the two sub-directories were created.
    """
    for sub in ("raw", "processed"):
        os.makedirs(os.path.join(base, sub), exist_ok=True)


def split_rows(lines):
    """Return the cells between the outer pipes of every line containing ``|``."""
    return [line.split("|")[1:-1] for line in lines if "|" in line]


def label_header_rows(rows):
    """Name the first cell of the two header rows ``Finanslov`` and ``År``.

    Raises:
        ValueError: if the table has fewer than two non-empty rows.
    """
    if len(rows) < 2 or not rows[0] or not rows[1]:
        raise ValueError("expected at least two non-empty header rows in the table")
    rows[0][0] = "Finanslov"
    rows[1][0] = "År"


def clean_optag_cell(raw):
    """Normalise one cell of the "Optagelsesskøn" table.

    Dots are removed, surrounding whitespace is trimmed, ``"-"`` becomes
    ``None``, integer-looking text becomes ``int`` and three labels that
    arrive without their first letter are repaired.
    """
    text = raw.replace(".", "")
    text = re.sub(r"^\s+(.+?)$", r"\1", text)
    text = re.sub(r"\s+$", "", text)
    if text == "-":
        # The snapshot used ``==`` here, so the placeholder was never replaced.
        return None
    try:
        return int(text)
    except ValueError:
        pass
    truncated_labels = {"achelor": "Bachelor", "andidat": "Kandidat", "alt": "Total"}
    return truncated_labels.get(text, text)


def clean_virk_cell(raw):
    """Normalise one cell of the "Virksomhedsoversigt" table.

    Dots are removed, whitespace is trimmed and collapsed, ``,`` becomes
    ``.``, ``"-"`` becomes ``None``, integer-looking text becomes ``int`` and
    the truncated label ``"ktiviteter"`` is repaired.
    """
    text = raw.replace(".", "")
    text = re.sub(r"^\s+(.+?)\s+$", r"\1", text)
    text = re.sub(r"\s+$", "", text)
    text = re.sub(r"\s+", " ", text)
    text = text.replace(",", ".")
    if text == "-":
        return None
    try:
        return int(text)
    except ValueError:
        pass
    if text == "ktiviteter":
        return "Aktiviteter"
    return text


def _merge_label_into_next(rows, i):
    """Prefix the next row's label with row ``i``'s label when row ``i`` has no values."""
    row = rows[i]
    if len(row) > 1 and all(cell == "" for cell in row[1:]):
        # Guard the look-ahead: an all-empty last row used to raise IndexError.
        if i + 1 < len(rows) and rows[i + 1]:
            rows[i + 1][0] = row[0] + " " + rows[i + 1][0]


def clean_optag_rows(rows):
    """Clean every cell of the "Optagelsesskøn" rows in place."""
    for i, row in enumerate(rows):
        for j, raw in enumerate(row):
            row[j] = clean_optag_cell(raw)
        # The snapshot compared against [" "], which a cleaned cell can never
        # equal; the check now matches the one used for the virk table.
        _merge_label_into_next(rows, i)


def clean_virk_rows(rows):
    """Clean every cell of the "Virksomhedsoversigt" rows in place.

    Returns:
        Index of the last row containing an ``"Aktiviteter"`` cell, or 0 when
        there is none.
    """
    act_num = 0
    for i, row in enumerate(rows):
        for j, raw in enumerate(row):
            row[j] = clean_virk_cell(raw)
            if row[j] == "Aktiviteter":
                act_num = i
        # NOTE(review): the snapshot's indentation was lost; the two checks
        # below are assumed to run once per row, after all cells are cleaned.
        _merge_label_into_next(rows, i)
        # As reconstructed this only inspects the last cell of the row, so the
        # row label itself is left untouched - confirm against the real file.
        if row and row[-1] == "alt omsætning (mio kr)":
            row[-1] = "Total omsætning (mio kr)"
    return act_num


def rows_to_columns(rows):
    """Map each row label to its value cells, skipping rows without values."""
    columns = {}
    for row in rows:
        if all(cell == "" for cell in row[1:]):
            continue
        columns[row[0]] = row[1:]
    return columns


def parse_page(page_text):
    """Extract the three tables from one decoded page.

    Returns:
        ``(optag_dict, virk_dict, aktivitet_dict)``; each maps a row label to
        the list of that row's values.

    Raises:
        ValueError: if either table has fewer than two non-empty rows.
    """
    lines = page_text.replace("<B>", "").replace("</B>", "").split("\r\n")
    optage_num = 0
    virk_num = 0
    for line in lines:
        if "Optagelsesskøn" in line:
            print(lines.index(line))
            optage_num = lines.index(line)
        elif "Virksomhedsoversigt" in line:
            virk_num = lines.index(line)

    optag = split_rows(lines[optage_num:virk_num])
    virk = split_rows(lines[virk_num:])
    label_header_rows(optag)
    label_header_rows(virk)

    clean_optag_rows(optag)
    act_num = clean_virk_rows(virk)

    # Rows after the "Aktiviteter" row form the activity table; rows before it
    # form the financial overview.
    aktivitet = virk[act_num + 1:]
    virk = virk[:act_num]
    return rows_to_columns(optag), rows_to_columns(virk), rows_to_columns(aktivitet)


def main():
    """Download every university/finance-act page and write the virk table as CSV."""
    ensure_data_dirs()
    for university, code in universities.items():
        for url in finance_act_books:
            print(f"Working on {url}")
            tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{code}&searchtype=3"
            # The snapshot had an unfinished ``with open`` statement after the
            # request, which made the file unparsable; it is dropped here.
            response = requests.get(tmp_url, timeout=60)
            response.raise_for_status()
            page_text = response.content.decode("iso8859_10")

            optag_dict, virk_dict, aktivitet_dict = parse_page(page_text)

            virk_df = pd.DataFrame(virk_dict)
            # TODO: optag_dict and aktivitet_dict are parsed but not exported;
            # the snapshot had their DataFrame/to_csv lines commented out.
            virk_df.to_csv(f"{university}_virk_{url}.csv", index=False, sep=";")


if __name__ == "__main__":
    main()
90 changes: 90 additions & 0 deletions .history/finanslov/fetch_udd_dat_20220415224955.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import requests
from bs4 import BeautifulSoup as Soup
import re
import pandas as pd
import os

# University abbreviation -> code substituted into the ``topic=19.22.<code>``
# query parameter of the request URL.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}

# Values appended to ``BEVPUBL.FL`` in the ``book`` query parameter.
finance_act_books = ["22-1", "18", "13A", "08A"]


def ensure_data_dirs(base="data"):
    """Create ``base``, ``base/raw`` and ``base/processed`` if they are missing.

    ``exist_ok=True`` matters: with a plain ``os.mkdir`` chain, an already
    existing ``base`` raised before the two sub-directories were created.
    """
    for sub in ("raw", "processed"):
        os.makedirs(os.path.join(base, sub), exist_ok=True)


def clean_cell(raw):
    """Normalise one table cell.

    Dots are removed, whitespace is trimmed and collapsed, ``,`` becomes
    ``.``, ``"-"`` becomes ``None``, integer-looking text becomes ``int`` and
    the truncated label ``"ktiviteter"`` is repaired to ``"Aktiviteter"``.
    """
    text = raw.replace(".", "")
    text = re.sub(r"^\s+(.+?)\s+$", r"\1", text)
    text = re.sub(r"\s+$", "", text)
    text = re.sub(r"\s+", " ", text)
    text = text.replace(",", ".")
    if text == "-":
        return None
    try:
        return int(text)
    except ValueError:
        pass
    if text == "ktiviteter":
        return "Aktiviteter"
    return text


def clean_rows(rows):
    """Clean every cell of ``rows`` in place.

    Returns:
        Index of the last row containing an ``"Aktiviteter"`` cell, or 0 when
        there is none.
    """
    act_num = 0
    for i, row in enumerate(rows):
        for j, raw in enumerate(row):
            row[j] = clean_cell(raw)
            if row[j] == "Aktiviteter":
                act_num = i
        # NOTE(review): the snapshot's indentation was lost; the two checks
        # below are assumed to run once per row, after all cells are cleaned.
        if len(row) > 1 and all(cell == "" for cell in row[1:]):
            # A label-only row is a wrapped label: prefix it to the next row.
            # Guard the look-ahead: an all-empty last row used to raise
            # IndexError.
            if i + 1 < len(rows) and rows[i + 1]:
                rows[i + 1][0] = row[0] + " " + rows[i + 1][0]
        # As reconstructed this only inspects the last cell of the row, so the
        # row label itself is left untouched - confirm against the real file.
        if row and row[-1] == "alt omsætning (mio kr)":
            row[-1] = "Total omsætning (mio kr)"
    return act_num


def rows_to_columns(rows):
    """Map each row label to its value cells, skipping rows without values."""
    columns = {}
    for row in rows:
        if all(cell == "" for cell in row[1:]):
            continue
        columns[row[0]] = row[1:]
    return columns


def parse_overview(page_text):
    """Extract the "Virksomhedsoversigt" table from one decoded page.

    Returns:
        Dict mapping each row label to the list of that row's values, limited
        to the rows before the ``"Aktiviteter"`` row.

    Raises:
        ValueError: if the table has fewer than two non-empty rows.
    """
    lines = page_text.replace("<B>", "").replace("</B>", "").split("\r\n")
    virk_num = 0
    for line in lines:
        if "Optagelsesskøn" in line:
            print(lines.index(line))
        elif "Virksomhedsoversigt" in line:
            virk_num = lines.index(line)

    rows = [line.split("|")[1:-1] for line in lines[virk_num:] if "|" in line]
    if len(rows) < 2 or not rows[0] or not rows[1]:
        raise ValueError("expected at least two non-empty header rows in the table")
    rows[0][0] = "Finanslov"
    rows[1][0] = "År"

    act_num = clean_rows(rows)
    return rows_to_columns(rows[:act_num])


def main():
    """Download every university/finance-act page and write its table as CSV."""
    ensure_data_dirs()
    for university, code in universities.items():
        for url in finance_act_books:
            print(f"Working on {url}")
            tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{code}&searchtype=3"
            # The snapshot had an unfinished ``with open`` statement after the
            # request, which made the file unparsable; it is dropped here.
            response = requests.get(tmp_url, timeout=60)
            response.raise_for_status()
            page_text = response.content.decode("iso8859_10")

            virk_df = pd.DataFrame(parse_overview(page_text))
            virk_df.to_csv(f"./data/raw/{university}_virk_{url}.csv", index=False, sep=";")


if __name__ == "__main__":
    main()
File renamed without changes.
File renamed without changes.
69 changes: 69 additions & 0 deletions .history/finanslov/merge_years_and_universities_20220415225138.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pandas as pd

# University abbreviations; each is part of a raw CSV file name.
universities = [
    "AU",
    "AAU",
    "CBS",
    "DTU",
    "ITU",
    "KU",
    "RUC",
    "SDU",
]

# (finance-act edition, multiplier) pairs. The edition is part of the raw CSV
# file name; the multiplier is applied to every financial column.
# NOTE(review): the multipliers look like price-index adjustments
# (100 / index) - confirm the source of the index values.
years = [
    ("08A", 100/89.9),
    ("13A", 100/98.6),
    ("18", 100/102.6),
    ("22-1", 100/105.4),
]

# Raw column label -> harmonised column name.
column_names = {
    "I alt omsætning (mio kr)": "Total Omsætning",
    "alt omsætning (mio kr)": "Total Omsætning",
    "Uddannelsestilskud fra UBST": "Uddannelsestilskud",
    "Uddannelsestilskud fra UI": "Uddannelsestilskud",
    "Heltidsuddannelse": "Uddannelsestaxameter (Heltid)",
    "Deltidsuddannelse": "Uddannelsestaxameter (Deltid)",
    "Basistilskud fra UBST": "Basistilskud",
    "Basistilskud fra UI": "Basistilskud",
}

# Columns, in output order, of the merged file.
column_list = [
    "År",
    "Finanslov",
    "Universitet",
    "Total Omsætning",
    "Uddannelsestilskud",
    "Uddannelsestaxameter (Heltid)",
    "Uddannelsestaxameter (Deltid)",
    "Basistilskud",
    "Tilskudsfinansieret forskning",
]

# Columns that are scaled by the edition multiplier and by 10 ** 6.
financial_columns = [
    "Total Omsætning",
    "Uddannelsestilskud",
    "Uddannelsestaxameter (Heltid)",
    "Uddannelsestaxameter (Deltid)",
    "Basistilskud",
    "Tilskudsfinansieret forskning",
]

base_dict = {
    i: [] for i in column_list
}
# Empty frame with the output columns; used when there is nothing to merge.
base_df = pd.DataFrame(base_dict)


def prepare_frame(in_df, university, finance_act, factor):
    """Harmonise one raw table and keep only its ``Finanslov == "R"`` rows.

    Args:
        in_df: Raw table as read from ``<university>_virk_<finance_act>.csv``.
            It is not modified.
        university: Value written to the ``Universitet`` column.
        finance_act: Edition label; every edition except ``"22-1"`` has its
            ``År == 2016`` rows dropped.
        factor: Multiplier applied to each financial column before the
            ``10 ** 6`` scaling.

    Returns:
        A new DataFrame with the columns of ``column_list``.

    Raises:
        KeyError: if a column of ``column_list`` is missing after renaming.
    """
    frame = in_df.rename(columns=column_names)
    frame["Universitet"] = university
    for column in financial_columns:
        # The raw labels carry "(mio kr)"; 10 ** 6 rescales those amounts.
        frame[column] = frame[column] * factor * 10 ** 6
    frame = frame[column_list]
    if finance_act != "22-1":
        # NOTE(review): presumably avoids counting 2016 twice across
        # overlapping editions - confirm.
        frame = frame[frame["År"] != 2016]
    # Work on an explicit copy so the dtype change never targets a view of
    # the filtered frame.
    frame = frame.copy()
    frame["År"] = frame["År"].astype("int")
    return frame[frame["Finanslov"] == "R"]


def combine(frames):
    """Stack the prepared frames; return the empty ``base_df`` when there are none.

    ``DataFrame.append`` was removed in pandas 2.0, so the frames are
    concatenated once instead of being appended one by one.
    """
    if not frames:
        return base_df
    return pd.concat(frames, ignore_index=True)


def main():
    """Read every raw CSV, harmonise it and write the merged CSV."""
    frames = []
    for university in universities:
        for finance_act, factor in years:
            in_df = pd.read_csv(
                f"./data/raw/{university}_virk_{finance_act}.csv",
                sep=";",
                index_col=False,
            )
            frames.append(prepare_frame(in_df, university, finance_act, factor))
    merged = combine(frames)
    merged.to_csv("./data/processed/Universitetsfinansiering_2002-2021.csv", sep=";", index=False)


if __name__ == "__main__":
    main()
File renamed without changes.
File renamed without changes.
Empty file added udd_data/checked.txt
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
89 changes: 89 additions & 0 deletions udd_data/fetch_udd_dat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import requests
from bs4 import BeautifulSoup as Soup
import re
import pandas as pd
import os

# University abbreviation -> code substituted into the ``topic=19.22.<code>``
# query parameter of the request URL.
universities = {
    "KU": "01",
    "AU": "05",
    "SDU": "11",
    "RUC": "15",
    "AAU": "17",
    "CBS": "21",
    "DTU": "37",
    "ITU": "45",
}

# Values appended to ``BEVPUBL.FL`` in the ``book`` query parameter.
finance_act_books = ["22-1", "18", "13A", "08A"]


def ensure_data_dirs(base="data"):
    """Create ``base``, ``base/raw`` and ``base/processed`` if they are missing.

    ``exist_ok=True`` matters: with a plain ``os.mkdir`` chain, an already
    existing ``base`` raised before the two sub-directories were created.
    """
    for sub in ("raw", "processed"):
        os.makedirs(os.path.join(base, sub), exist_ok=True)


def clean_cell(raw):
    """Normalise one table cell.

    Dots are removed, whitespace is trimmed and collapsed, ``,`` becomes
    ``.``, ``"-"`` becomes ``None``, integer-looking text becomes ``int`` and
    the truncated label ``"ktiviteter"`` is repaired to ``"Aktiviteter"``.
    """
    text = raw.replace(".", "")
    text = re.sub(r"^\s+(.+?)\s+$", r"\1", text)
    text = re.sub(r"\s+$", "", text)
    text = re.sub(r"\s+", " ", text)
    text = text.replace(",", ".")
    if text == "-":
        return None
    try:
        return int(text)
    except ValueError:
        pass
    if text == "ktiviteter":
        return "Aktiviteter"
    return text


def clean_rows(rows):
    """Clean every cell of ``rows`` in place.

    Returns:
        Index of the last row containing an ``"Aktiviteter"`` cell, or 0 when
        there is none.
    """
    act_num = 0
    for i, row in enumerate(rows):
        for j, raw in enumerate(row):
            row[j] = clean_cell(raw)
            if row[j] == "Aktiviteter":
                act_num = i
        # NOTE(review): the file's indentation was lost in this view; the two
        # checks below are assumed to run once per row, after all cells are
        # cleaned.
        if len(row) > 1 and all(cell == "" for cell in row[1:]):
            # A label-only row is a wrapped label: prefix it to the next row.
            # Guard the look-ahead: an all-empty last row used to raise
            # IndexError.
            if i + 1 < len(rows) and rows[i + 1]:
                rows[i + 1][0] = row[0] + " " + rows[i + 1][0]
        # As reconstructed this only inspects the last cell of the row, so the
        # row label itself is left untouched - confirm against the real file.
        if row and row[-1] == "alt omsætning (mio kr)":
            row[-1] = "Total omsætning (mio kr)"
    return act_num


def rows_to_columns(rows):
    """Map each row label to its value cells, skipping rows without values."""
    columns = {}
    for row in rows:
        if all(cell == "" for cell in row[1:]):
            continue
        columns[row[0]] = row[1:]
    return columns


def parse_overview(page_text):
    """Extract the "Virksomhedsoversigt" table from one decoded page.

    Returns:
        Dict mapping each row label to the list of that row's values, limited
        to the rows before the ``"Aktiviteter"`` row.

    Raises:
        ValueError: if the table has fewer than two non-empty rows.
    """
    lines = page_text.replace("<B>", "").replace("</B>", "").split("\r\n")
    virk_num = 0
    for line in lines:
        if "Optagelsesskøn" in line:
            print(lines.index(line))
        elif "Virksomhedsoversigt" in line:
            virk_num = lines.index(line)

    rows = [line.split("|")[1:-1] for line in lines[virk_num:] if "|" in line]
    if len(rows) < 2 or not rows[0] or not rows[1]:
        raise ValueError("expected at least two non-empty header rows in the table")
    rows[0][0] = "Finanslov"
    rows[1][0] = "År"

    act_num = clean_rows(rows)
    return rows_to_columns(rows[:act_num])


def main():
    """Download every university/finance-act page and write its table as CSV."""
    ensure_data_dirs()
    for university, code in universities.items():
        for url in finance_act_books:
            print(f"Working on {url}")
            tmp_url = f"https://www.oes-cs.dk/bevillingslove/doctopic.cgi?book=BEVPUBL.FL{url}&topic=19.22.{code}&searchtype=3"
            # A timeout keeps a stalled server from hanging the run; an HTTP
            # error is reported as such instead of as a parsing failure.
            response = requests.get(tmp_url, timeout=60)
            response.raise_for_status()
            page_text = response.content.decode("iso8859_10")

            virk_df = pd.DataFrame(parse_overview(page_text))
            virk_df.to_csv(f"./data/raw/{university}_virk_{url}.csv", index=False, sep=";")


if __name__ == "__main__":
    main()
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 1691100

Please sign in to comment.