#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@version: python3.6
@author: QLMX
@contact: [email protected]
@time: 2019-07-31 20:02
WeChat official account: AI成长社
Zhihu: https://www.zhihu.com/people/qlmx-61/columns
"""
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc
## load data
train_data = pd.read_csv('../../data/train.csv')
test_data = pd.read_csv('../../data/test.csv')
num_round = 1000
## encode categorical features with integer labels
## (fitting on the concatenated train+test frame keeps the mappings consistent)
test_data['label'] = -1
data = pd.concat([train_data, test_data])
cate_feature = ['gender', 'cell_province', 'id_province', 'id_city', 'rate', 'term']
for item in cate_feature:
    data[item] = LabelEncoder().fit_transform(data[item])
train = data[data['label'] != -1]
test = data[data['label'] == -1]
# Clean up memory
del data, train_data, test_data
gc.collect()
## get train feature
del_feature = ['auditing_date', 'due_date', 'label']
features = [i for i in train.columns if i not in del_feature]
train_x = train[features]
train_y = train['label'].astype(int)
test = test[features]
##train and predict
params = {'num_leaves': 38,
          'min_data_in_leaf': 50,
          'objective': 'regression',
          'max_depth': -1,
          'learning_rate': 0.02,
          'min_sum_hessian_in_leaf': 6,
          'boosting': 'gbdt',
          'feature_fraction': 0.9,
          'bagging_freq': 1,
          'bagging_fraction': 0.7,
          'bagging_seed': 11,
          'lambda_l1': 0.1,
          'verbosity': -1,
          'nthread': 4,
          'metric': 'mae',
          'random_state': 2019,
          # 'device': 'gpu'
          }
def mean_absolute_percentage_error(y_true, y_pred):
    # Note: undefined when y_true contains zeros; defined for reference, not used below.
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def smape_func(preds, dtrain):
    # Symmetric MAPE with an epsilon floor to avoid division by near-zero sums.
    label = np.asarray(dtrain.get_label())  # ensure ndarray whether the label was a Series or array
    epsilon = 0.1
    summ = np.maximum(0.5 + epsilon, np.abs(label) + np.abs(preds) + epsilon)
    smape = np.mean(np.abs(label - preds) / summ) * 2
    return 'smape', float(smape), False
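# smape_func follows LightGBM's custom-eval signature (preds, dtrain) ->
# (name, value, is_higher_better), so it could be passed to lgb.train as
# feval=smape_func alongside the built-in 'mae' metric. It is kept for
# reference and not wired into the training loop below.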
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros(train_x.shape[0])
predictions = np.zeros(test.shape[0])
train_y = np.log1p(train_y)  # log1p-transform the skewed target; predictions are inverted with expm1 below
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200,
                    categorical_feature=cate_feature,
                    early_stopping_rounds=200)
    oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    predictions += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
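# Note: passing verbose_eval / early_stopping_rounds directly to lgb.train()
# matches the 2019-era LightGBM API this script targets; in LightGBM >= 4 they
# are supplied via the lgb.log_evaluation and lgb.early_stopping callbacks instead.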
print('mse %.6f' % mean_squared_error(train_y, oof))
print('mae %.6f' % mean_absolute_error(train_y, oof))
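# The metrics above are on the log1p scale because train_y was transformed;
# an original-scale MAE (a sketch using the same variables) can be recovered via expm1:
print('mae (original scale) %.6f' % mean_absolute_error(np.expm1(train_y), np.expm1(oof)))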
result = np.expm1(predictions)  # invert the log1p transform back to the original scale
## plot feature importance
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature").mean()
        .sort_values(by="importance", ascending=False).index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)].sort_values(by='importance', ascending=False)
plt.figure(figsize=(8, 10))
sns.barplot(y="Feature",
            x="importance",
            data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('../../result/lgb_importances.png')
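## save predictions (sketch: the file name and column name are assumptions,
## reusing the ../../result directory from the plot above)
pd.DataFrame({'prediction': result}).to_csv('../../result/lgb_regression_pred.csv', index=False)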