-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeatures.py
117 lines (93 loc) · 3.72 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import sklearn.preprocessing as pp
import numpy as np
import pandas as pd
import re
def binary_features(df):
    """Append one-hot indicator columns for the 'Deck' and 'Title' columns.

    Every distinct value ``v`` found in a source column ``C`` produces a new
    column named ``C_v``. The source columns themselves are kept.

    Returns a new DataFrame; the frame passed in is not modified.
    """
    for source in ('Deck', 'Title'):
        indicators = pd.get_dummies(df[source])
        indicators = indicators.rename(
            columns=lambda value, prefix=source: prefix + '_' + str(value)
        )
        df = pd.concat([df, indicators], axis=1)
    return df
def simplify_fare(df):
    """Replace the numeric 'Fare' column with labelled fare bands, in place.

    Fares are bucketed into right-closed intervals:
    (-1, 0] -> 'unknown', (0, 7.896] -> 'first', (7.896, 14.454] -> 'second',
    (14.454, 31.275] -> 'third', (31.275, 512.4] -> 'forth'.
    Values outside (-1, 512.4] (and missing values) become NaN.

    Returns the same DataFrame object that was passed in.
    """
    bins = (-1, 0, 7.896, 14.454, 31.275, 512.4)
    tags = ['unknown', 'first', 'second', 'third', 'forth']
    # `labels` must be given by keyword: the third positional parameter of
    # pd.cut is `right`, so a positional list is never used as the labels.
    df['Fare'] = pd.cut(df['Fare'], bins, labels=tags)
    return df
def simplify_ages(df):
    """Replace the numeric 'Age' column with labelled age groups, in place.

    Ages are bucketed into right-closed intervals whose edges are
    -1, 0, 5, 12, 18, 25, 35, 60 and 120; each interval receives the
    matching label below.

    Returns the same DataFrame object that was passed in.
    """
    edges = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = [
        'Unknown',
        'Baby',
        'Child',
        'Teenager',
        'Student',
        'Young Adult',
        'Adult',
        'Senior',
    ]
    age_groups = pd.cut(df['Age'], edges, labels=group_names)
    df['Age'] = age_groups
    return df
def get_title(name):
    """Extract the title (honorific) from a name string.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".

    Returns the title without the trailing period, or "" when the name
    contains no such pattern.
    """
    # Raw string: a bare '\.' in a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""
def fill_features(df):
    """Fill missing values in 'Embarked', 'Fare', 'Age' and 'Cabin', in place.

    * 'Embarked': missing values become 'S'.
    * 'Fare': missing values become the column median.
    * 'Age': missing values are replaced by random integers drawn from
      [int(mean - std), int(mean + std)); the column is then cast to int.
    * 'Cabin': missing values become the placeholder 'U0'.

    Returns the same DataFrame object that was passed in.

    Raises:
        ValueError: if ages are missing but the mean/std of the known ages
            is NaN or yields an empty range.
    """
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: that is chained assignment and does not update
    # the frame under pandas copy-on-write.
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    missing_age = df['Age'].isnull()
    n_missing = int(missing_age.sum())
    if n_missing:
        mu = df['Age'].mean()
        delta = df['Age'].std()
        # randint works on integer bounds; truncate explicitly.
        df.loc[missing_age, 'Age'] = np.random.randint(
            int(mu - delta), int(mu + delta), size=n_missing
        )
    df['Age'] = df['Age'].astype(int)

    # The result was previously discarded, leaving 'Cabin' unfilled.
    df['Cabin'] = df['Cabin'].fillna('U0')
    return df
def add_features(df):
    """Add family, cabin and ticket derived columns to the frame.

    Columns added:
    * 'FamilySize': SibSp + Parch + 1.
    * 'IsAlone': 1 unless FamilySize > 1, in which case 0.
    * 'HasCabin': 0 when 'Cabin' is missing or holds the 'U0' placeholder
      (the value fill_features substitutes for missing cabins), else 1.
    * 'Deck': first character of 'Cabin'; 'U' when the cabin is missing.
    * the ticket columns produced by extract_ticket.

    Returns the DataFrame returned by extract_ticket.
    """
    unknown_cabin = 'U0'

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Treat both a genuinely missing cabin and the fill placeholder as
    # "no cabin", so the flag is correct whether or not the column has
    # already been filled.
    df['HasCabin'] = df['Cabin'].map(
        lambda cabin: 0 if pd.isna(cabin) or cabin == unknown_cabin else 1
    )
    # Read from `df` (the original referenced an undefined global `total`)
    # and guard against NaN, which cannot be indexed.
    df['Deck'] = df['Cabin'].map(
        lambda cabin: unknown_cabin[0] if pd.isna(cabin) else cabin[0]
    )

    df = extract_ticket(df)
    return df
def simplify_features(df):
    """Apply the fare, age and title simplification steps, in that order.

    Each step receives the frame returned by the previous one; the result
    of the final step is returned.

    NOTE(review): simplify_title is not defined in the visible part of this
    module - confirm it is provided elsewhere, otherwise the last step
    raises NameError.
    """
    with_fare_bands = simplify_fare(df)
    with_age_groups = simplify_ages(with_fare_bands)
    return simplify_title(with_age_groups)
def drop_features(df):
    """Remove the 'PassengerId', 'Name' and 'Ticket' columns, in place.

    Returns the same DataFrame object that was passed in.
    """
    # Drops for 'Cabin', 'SibSp' and 'Parch' were commented out in this
    # function, so those columns are kept.
    for column in ('PassengerId', 'Name', 'Ticket'):
        df.drop(column, axis=1, inplace=True)
    return df
def encode_features(df):
    """Label-encode the 'Fare', 'Age', 'Embarked', 'Sex' and 'Title' columns.

    A fresh LabelEncoder is fitted on each column and the column is
    overwritten, in place, with the encoder's transformed values.

    Returns the same DataFrame object that was passed in.
    """
    for column in ('Fare', 'Age', 'Embarked', 'Sex', 'Title'):
        encoder = pp.LabelEncoder().fit(df[column])
        df[column] = encoder.transform(df[column])
    return df
def extract_ticket(df):
    """Derive numeric ticket features from the 'Ticket' column, in place.

    Columns added:
    * 'TicketPrefixId': integer code (from pd.factorize) of the cleaned
      ticket prefix. The prefix is taken from the upper-cased ticket via
      getTicketPrefix, stripped of '.', '?' and '/' characters, with 'STON'
      rewritten to 'SOTON'.
    * 'TicketNumberDigits': length of the string returned by
      getTicketNumber.
    * 'TicketNumberStart': first character of that string, as an integer.

    The intermediate 'TicketPrefix' and 'TicketNumber' columns are removed
    again before returning. Returns the same DataFrame object.
    """
    # Extract and normalise the ticket prefix.
    df['TicketPrefix'] = df['Ticket'].map(lambda x: getTicketPrefix(x.upper()))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub('[.?/?]', '', x))
    df['TicketPrefix'] = df['TicketPrefix'].map(lambda x: re.sub('STON', 'SOTON', x))

    # Factorize the prefix into a numerical categorical variable.
    df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0]

    # Extract the ticket number (kept as a string at this point).
    df['TicketNumber'] = df['Ticket'].map(lambda x: getTicketNumber(x))

    # Use the builtin int: the np.int alias was removed in NumPy 1.24 and
    # raises AttributeError there.
    df['TicketNumberDigits'] = df['TicketNumber'].map(lambda x: len(x)).astype(int)
    df['TicketNumberStart'] = df['TicketNumber'].map(lambda x: x[0:1]).astype(int)

    # The prefix and (probably) number themselves aren't useful.
    df.drop(['TicketPrefix', 'TicketNumber'], axis=1, inplace=True)
    return df
def getTicketPrefix(ticket):
    """Return the first run of letters, '.' or '/' found in a ticket string.

    Returns 'U' when the ticket contains no such characters.
    """
    found = re.search("([a-zA-Z./]+)", ticket)
    if found is None:
        return 'U'
    return found.group()
def getTicketNumber(ticket):
    """Return the run of digits at the end of a ticket string.

    Returns '0' when the ticket does not end in a digit.
    """
    # r"\d" matches digits; the previous pattern "[d]+$" matched the literal
    # letter "d", so numeric tickets always fell through to '0'.
    match = re.search(r"(\d+)$", ticket)
    if match:
        return match.group()
    return '0'