-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata.py
60 lines (52 loc) · 2.33 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import pandas as pd
import numpy as np
import sys
# Fraction of the label == 0 rows to keep, taken from the first command-line argument.
frac = float(sys.argv[1])

# Column names for the CSV; they are passed to read_csv via names= and the
# file's own first row is skipped.
header = ['label', 'rndm', 'weight']
# boosted frame
header += ['W_B_pt', 'W_B_eta', 'W_B_phi', 'W_B_E', 'W_B_m']
header += ['b_B_pt', 'b_B_eta', 'b_B_phi', 'b_B_E', 'b_B_m']
header += ['Wjj_B_dR', 'tWb_B_dR']  # dR = sqrt(deta^2 + dphi^2)
header += ['Wjj_B_deta', 'Wjj_B_dphi', 'tWb_B_deta', 'tWb_B_dphi']
# lab frame
header += ['W_pt', 'W_eta', 'W_phi', 'W_E', 'W_m']
header += ['b_pt', 'b_eta', 'b_phi', 'b_E', 'b_m']
header += ['Wjj_dR', 'tWb_dR']
header += ['Wjj_deta', 'Wjj_dphi', 'tWb_deta', 'tWb_dphi']
header += ['t_pt', 't_eta', 't_phi', 't_E', 't_m']
# b-tag columns: binary representation of the likelihood of the jet being an actual b-jet
header += ['btag1', 'btag2', 'btag3']

df = pd.read_csv('~/projects/samples/result.csv', delimiter=' ', names=header, skiprows=1)

# Down-sample the label == 0 class to `frac` of its original size so the
# majority class does not dominate training; label == 1 rows are all kept.
pos_class = df[df['label'] == 1]
neg_class = df[df['label'] == 0]
neg_class = neg_class.sample(frac=frac)
final_df = pd.concat([neg_class, pos_class])

# Split into target and features.
# NOTE(review): only 'label' is dropped, so 'rndm' and 'weight' remain in X
# and are treated as features downstream - confirm that is intended.
y = final_df['label']
X = final_df.drop('label', axis=1)
# Per-column scaling parameters; normalize() adds one row for every column it processes.
normalization_data = pd.DataFrame(columns=["column_name", "slope_upper", "slope_lower", "threshold", "minimum"])


def normalize(col):
    """Rescale one column with a two-segment linear map and record its parameters.

    The column's 0.9 quantile is used as a threshold. Values at or below the
    threshold are mapped linearly from [minimum, threshold] onto [0, 0.6];
    values above it are mapped from (threshold, maximum] onto (0.9, 1].

    NOTE(review): the lower segment ends at 0.6 while the upper one starts at
    0.9, so no output falls in (0.6, 0.9]. The constants are kept as they
    were; confirm the gap is intended.

    Side effects: prints the column name and adds a row with the column's
    slopes, threshold and minimum to the module-level ``normalization_data``.

    Args:
        col: a pandas Series of numeric values (one feature column).

    Returns:
        A Series with the same index as ``col`` holding the rescaled values.
    """
    global normalization_data
    print(col.name)

    threshold = col.quantile(0.9)
    mini = col.min()
    upper_span = col.max() - threshold
    lower_span = threshold - mini

    # A zero-width segment (e.g. a constant or mostly-constant binary column)
    # would give an infinite slope and NaN outputs (inf * 0); use a flat
    # segment instead.
    slope_upper = (1 - 0.9) / upper_span if upper_span != 0 else 0.0
    slope_lower = (0.6 - 0) / lower_span if lower_span != 0 else 0.0

    params = {
        "column_name": col.name,
        "slope_upper": slope_upper,
        "slope_lower": slope_lower,
        "threshold": threshold,
        "minimum": mini,
    }
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    normalization_data = pd.concat(
        [normalization_data, pd.DataFrame([params])], ignore_index=True
    )

    # Vectorized form of the piecewise map: take the upper segment where the
    # value exceeds the threshold, the lower segment everywhere else.
    below = 0 + slope_lower * (col - mini)
    above = 0.9 + slope_upper * (col - threshold)
    return above.where(col > threshold, below)
# Normalize every feature column; normalize() is called once per column.
X_norm = X.apply(normalize, axis=0)
print(X_norm.head())

# Put the label column back in front of the normalized features and write
# both the normalized samples and the per-column scaling parameters.
# The output file name is fixed: it does not encode the sampling fraction.
normalized = pd.concat([y, X_norm], axis=1)
normalized.to_csv('~/projects/samples/norm_results.csv', sep=',')
normalization_data.to_csv('~/projects/samples/scaler_data.csv', sep=',')