-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfunctions.py
95 lines (65 loc) · 2.67 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
# ---------------- Fill missing values ---------------- #
def fill_age(df):
    """Fill missing ``Age`` values with the mean age of the passenger's group.

    A group is one combination of ``Sex`` and ``Pclass`` found in ``df``, so
    the imputation does not depend on a fixed list of sex labels or on the
    class range 1-3.

    Args:
        df: DataFrame with ``Sex``, ``Pclass`` and ``Age`` columns. It is
            modified in place.

    Returns:
        The same DataFrame. A row keeps its missing ``Age`` when its group has
        no known age at all, or when its ``Sex`` or ``Pclass`` is missing.
    """
    # transform('mean') returns one value per row, aligned with df's index,
    # and skips missing ages when averaging each group.
    group_mean_age = df.groupby(['Sex', 'Pclass'])['Age'].transform('mean')
    df['Age'] = df['Age'].fillna(group_mean_age)
    return df
# ---------------- Create New Features ---------------- #
def create_familysize(df):
    """Add a ``FamilySize`` column equal to ``Parch + SibSp + 1``.

    Args:
        df: DataFrame with ``Parch`` and ``SibSp`` columns. It is modified in
            place.

    Returns:
        The same DataFrame with the new ``FamilySize`` column.
    """
    relatives_aboard = df["Parch"].add(df["SibSp"])
    # The extra 1 presumably counts the passenger themself.
    df['FamilySize'] = relatives_aboard.add(1)
    return df
def create_isalone(df):
    """Add an ``IsAlone`` flag: 1 where ``FamilySize`` equals 1, otherwise 0.

    Args:
        df: DataFrame with a ``FamilySize`` column. It is modified in place.

    Returns:
        The same DataFrame with the new integer ``IsAlone`` column.
    """
    travels_alone = df['FamilySize'] == 1
    df['IsAlone'] = travels_alone.astype('int64')
    return df
def __find_titles__(words):
    """Return the first token that looks like a title, or ``None``.

    A token qualifies when it begins with an uppercase character and ends
    with a period. Tokens are obtained by splitting ``words`` on whitespace.

    Args:
        words: The string to scan.

    Returns:
        The first qualifying token (including its trailing period), or
        ``None`` when no token qualifies.
    """
    # split() never yields empty tokens, so indexing token[0] is safe.
    title_like = (
        token for token in words.split()
        if token.endswith('.') and token[0].isupper()
    )
    return next(title_like, None)
def create_title(df, min_count=9):
    """Add a ``Title`` column extracted from ``Name``, pooling rare titles.

    Each name is passed to ``__find_titles__``. Any title that occurs fewer
    than ``min_count`` times in ``df`` is replaced with ``'Other.'``.

    Args:
        df: DataFrame with a ``Name`` column of strings. It is modified in
            place.
        min_count: Minimum number of occurrences a title needs to be kept
            as-is. Defaults to 9.

    Returns:
        The same DataFrame with the new ``Title`` column. Rows for which no
        title was found keep a missing ``Title``.
    """
    df['Title'] = df['Name'].apply(__find_titles__)

    # value_counts() skips missing titles, so rows without a title map to NaN
    # here; NaN < min_count is False, which leaves those rows untouched.
    occurrences = df['Title'].map(df['Title'].value_counts())
    is_rare = occurrences < min_count
    df.loc[is_rare, 'Title'] = 'Other.'
    return df
def create_primacy(df):
    """Add a ``Primacy`` column computed as ``(Fare + 1) / Pclass``.

    Args:
        df: DataFrame with ``Fare`` and ``Pclass`` columns. It is modified in
            place.

    Returns:
        The same DataFrame with the new ``Primacy`` column.
    """
    shifted_fare = df['Fare'].add(1)
    df['Primacy'] = shifted_fare.div(df['Pclass'])
    return df
# ---------------- Mapping ---------------- #
def mapping(df):
    """Encode ``Sex``, ``Title``, ``Age``, ``Embarked`` and ``Fare`` as integers.

    Encodings applied:
        * ``Sex``: female -> 0, male -> 1.
        * ``Title``: Mr. -> 1, Miss. -> 2, Mrs. -> 3, Master. -> 4,
          Other. -> 5; any other or missing title -> 0.
        * ``Age``: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3,
          >64 -> 4.
        * ``Embarked``: S -> 0, C -> 1, Q -> 2.
        * ``Fare``: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
          >31 -> 3.

    Args:
        df: DataFrame holding the five columns above. It is modified in place.

    Returns:
        The same DataFrame with all five columns integer-typed.

    Raises:
        ValueError: If ``Sex`` or ``Embarked`` contains a missing or unlisted
            value, or ``Age`` or ``Fare`` contains a missing value, because
            the resulting NaN cannot be cast to int.
    """
    df['Sex'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    title_mapping = {"Mr.": 1, "Miss.": 2, "Mrs.": 3, "Master.": 4, "Other.": 5}
    # map() yields NaN (and therefore a float column) for unlisted titles.
    # Casting after fillna keeps Title integer-typed regardless of the data,
    # matching the other encoded columns.
    df['Title'] = df['Title'].map(title_mapping).fillna(0).astype(int)

    # pd.cut bins are right-inclusive, i.e. (lower, upper]; labels=False
    # returns the bin position, which is exactly the ordinal code wanted.
    age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
    df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=False).astype(int)

    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=False).astype(int)
    return df
# ---------------- Create Dummy Values ---------------- #
def dummy_embarked(df):
    """Replace ``Embarked`` with one-hot indicator columns ``C``, ``Q``, ``S``.

    ``Embarked`` may hold either the raw port letters ('C', 'Q', 'S') or the
    integer codes that ``mapping`` in this file assigns (S=0, C=1, Q=2). In
    both cases each indicator column is labeled with the port it represents,
    and all three columns are always produced, even when a port does not
    occur in ``df``.

    Args:
        df: DataFrame with an ``Embarked`` column. It is not modified.

    Returns:
        A new DataFrame without ``Embarked`` and with the indicator columns
        ``C``, ``Q`` and ``S`` appended. Rows whose ``Embarked`` is missing or
        unrecognized get zeros in all three columns.
    """
    ports = ['C', 'Q', 'S']
    # Inverse of the Embarked encoding used by mapping(); values that are not
    # one of these codes (e.g. raw letters, NaN) pass through unchanged.
    code_to_port = {0: 'S', 1: 'C', 2: 'Q'}
    labels = df['Embarked'].map(lambda value: code_to_port.get(value, value))

    # A categorical with fixed categories makes get_dummies emit one column
    # per port in this exact order, instead of one per value that happens to
    # be present, so the column labels can never drift from their meaning.
    categorical = labels.astype(pd.CategoricalDtype(categories=ports))
    dummies = pd.get_dummies(categorical)
    dummies.columns = ports  # plain labels rather than a CategoricalIndex

    df = df.join(dummies)
    df = df.drop('Embarked', axis=1)
    return df
# ---------------- Drop unnecessary columns ---------------- #
def drop_columns(df):
    """Return ``df`` without the columns the later steps do not use.

    Removed columns: ``PassengerId``, ``Pclass``, ``Fare``, ``Name``,
    ``SibSp``, ``Parch``, ``Ticket``, ``FamilySize`` and ``Cabin``.

    Args:
        df: DataFrame containing every column listed above. It is not
            modified.

    Returns:
        A new DataFrame with those columns removed.

    Raises:
        KeyError: If any of the listed columns is absent from ``df``.
    """
    unused = [
        'PassengerId', 'Pclass', 'Fare', 'Name', 'SibSp',
        'Parch', 'Ticket', 'FamilySize', 'Cabin',
    ]
    return df.drop(columns=unused)