from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
[LightGBM] [Warning] num_threads is set with n_jobs=-1, nthread=-1 will be ignored. Current value: num_threads=-1
accuracy: 93.33%
1.1.2 Save the model with pickle, then load it for prediction
import pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy * 100.0))
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10] valid_0's multi_logloss: 0.96537
accuracy: 93.33%
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)  # multiclass: convert class probabilities to labels
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy * 100.0))
accuracy: 93.33%
3. Task 3: Classification, regression, and ranking tasks
3.1 Train on binary classification data generated with make_classification
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
# n_samples: number of samples, default 100
# n_features: number of features, default 20
# n_informative: number of informative features, default 2
# n_redundant: number of redundant features, default 2
# n_repeated: number of repeated features, default 0
# n_clusters_per_class: number of clusters per class, default 2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
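The cell that generates the data and trains the first model is not shown in this export. Below is a minimal sketch consistent with the log that follows (sklearn interface, binary_logloss, early-stopping patience of 5); the sample counts and hyperparameters are assumptions, not the original values.

X, y = make_classification(n_samples=1000, n_features=20, n_informative=2,
                           n_redundant=2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbm = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=100)
# early_stopping_rounds in fit() is the pre-4.0 API; LightGBM 4.x uses callbacks instead
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='binary_logloss',
        early_stopping_rounds=5)
y_pred = gbm.predict(X_test)
print("accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))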
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[39] valid_0's binary_logloss: 0.255538
accuracy: 88.50%
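The native-API training cell for gbm2 is likewise missing; a sketch under the same assumptions, with num_boost_round=10 to match the "[10]" best iteration in the log below:

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {'objective': 'binary', 'metric': 'binary_logloss',
          'num_leaves': 31, 'learning_rate': 0.05}
# early_stopping_rounds here is the pre-4.0 signature of lgb.train
gbm2 = lgb.train(params, lgb_train, num_boost_round=10,
                 valid_sets=lgb_eval, early_stopping_rounds=5)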
y_pred = gbm2.predict(X_test, num_iteration=gbm2.best_iteration)  # probabilities in [0, 1], a 1-D array
y_pred = [1 if x > 0.5 else 0 for x in y_pred]
accuracy = accuracy_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy * 100.0))
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10] valid_0's binary_logloss: 0.371903
accuracy: 88.00%
3.2 Train on multiclass data generated with make_classification
3.2.1 sklearn interface
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
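Again the training cell is missing. A sketch consistent with the multi_logloss log below (sklearn interface, 10 boosting rounds); the class count and hyperparameters are assumptions:

X, y = make_classification(n_samples=1000, n_features=20, n_informative=4,
                           n_classes=3, n_clusters_per_class=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gbm = LGBMClassifier(objective='multiclass', num_leaves=31,
                     learning_rate=0.05, n_estimators=10)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='multi_logloss',
        early_stopping_rounds=5)  # pre-4.0 API; 4.x uses callbacks
y_pred = gbm.predict(X_test)  # the sklearn interface returns class labels directly
print("accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))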
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10] valid_0's multi_logloss: 0.322024
accuracy: 87.50%
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
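The regression training cell is not shown. A sketch with the native API that would produce l1/l2 validation metrics like the log below; make_regression and every parameter here are assumptions:

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=1000, n_features=20, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {'objective': 'regression', 'metric': ['l1', 'l2'], 'num_leaves': 31}
gbm = lgb.train(params, lgb_train, num_boost_round=100,
                valid_sets=lgb_eval, early_stopping_rounds=5)  # pre-4.0 API
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
print("mse: %.2f" % mean_squared_error(y_test, y_pred))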
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[99] valid_0's l1: 8.13036 valid_0's l2: 119.246
mse: 119.25
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import graphviz
[02:55:18] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
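The tuning loop itself is not shown. A sketch consistent with the logs below, sweeping max_depth on the flight-delay train/test split prepared later in this section; all other parameters are assumptions:

from sklearn import metrics

for max_depth in [3, 5, 6, 9]:
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {'objective': 'binary', 'metric': 'binary_logloss',
              'max_depth': max_depth, 'num_leaves': 31}
    gbm = lgb.train(params, lgb_train, num_boost_round=100,
                    valid_sets=lgb_eval, early_stopping_rounds=5)  # pre-4.0 API
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    pred = [1 if x > 0.5 else 0 for x in y_pred]
    print("max_depth=", max_depth,
          "accuracy: %.2f%%" % (accuracy_score(y_test, pred) * 100.0),
          "auc_score: %.2f%%" % (metrics.roc_auc_score(y_test, y_pred) * 100.0))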
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[98] valid_0's binary_logloss: 0.429334
max_depth= 3 accuracy: 81.90% auc_score: 76.32%
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[60] valid_0's binary_logloss: 0.430826
max_depth= 5 accuracy: 81.98% auc_score: 75.54%
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[65] valid_0's binary_logloss: 0.429341
max_depth= 6 accuracy: 81.69% auc_score: 75.63%
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[52] valid_0's binary_logloss: 0.429146
max_depth= 9 accuracy: 81.94% auc_score: 76.07%
from sklearn.model_selection import train_test_split
# Keep only part of the data; binarize the target: 1 if the arrival delay exceeds 10 minutes
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1
categorical_feats = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
# categorical_feats = [f for f in data.columns if data[f].dtype == 'object']
# Convert the four columns above to categorical features (integer codes, not one-hot)
for f_ in categorical_feats:
    data[f_], _ = pd.factorize(data[f_])
    # Set feature type as categorical
    data[f_] = data[f_].astype('category')
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"],
    random_state=10, test_size=0.25)
categorical_feats
# Feature naming
# num_train, num_feature = X_train.shape  # X_train has 7194 rows and 10 columns; num_feature=10 is the feature count
# feature_name = ['feature_' + str(col) for col in range(num_feature)]  # feature_0 through feature_9
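The training cell is not shown. A sketch of native training that passes the categorical columns explicitly; num_boost_round=10 matches the "[10]" best iteration in the log below, everything else is an assumption:

lgb_train = lgb.Dataset(X_train, y_train,
                        categorical_feature=categorical_feats)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 31}
gbm = lgb.train(params, lgb_train, num_boost_round=10,
                valid_sets=lgb_eval, early_stopping_rounds=5)  # pre-4.0 API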
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)  # probabilities in [0, 1], a 1-D array
pred = [1 if x > 0.5 else 0 for x in y_pred]
accuracy = accuracy_score(y_test, pred)
auc_score = metrics.roc_auc_score(y_test, y_pred)
print("accuracy: %.2f%%" % (accuracy * 100.0), "auc_score: %.2f%%" % (auc_score * 100.0))
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10] valid_0's binary_logloss: 0.424384
accuracy: 81.82% auc_score: 77.52%
# The results are identical even without setting categorical_feature. The reason: the columns
# were cast to pandas 'category' dtype above, and LightGBM's default categorical_feature='auto'
# picks such columns up as categorical automatically.
# Integer-encode the categorical columns (label encoding via cat.codes, not one-hot)
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes + 1
y_pred2 = gbm2.predict(X_test, num_iteration=gbm2.best_iteration)  # probabilities in [0, 1], a 1-D array
pred2 = [1 if x > 0.5 else 0 for x in y_pred2]
accuracy2 = accuracy_score(y_test, pred2)
auc_score2 = metrics.roc_auc_score(y_test, y_pred2)
print("accuracy: %.2f%%" % (accuracy2 * 100.0), "auc_score: %.2f%%" % (auc_score2 * 100.0))
Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[10] valid_0's binary_logloss: 0.424384
accuracy: 81.82% auc_score: 77.52%
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import pandas as pd, numpy as np, time
# Load the data
data = pd.read_csv("https://cdn.coggle.club/kaggle-flight-delays/flights_10k.csv.zip")
# Keep the useful columns
data = data[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT",
             "ORIGIN_AIRPORT", "AIR_TIME", "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY"]]
data.dropna(inplace=True)
# Define the black-box function LGB_bayesian for Bayesian optimization
def LGB_bayesian(
        num_leaves,  # int
        min_data_in_leaf,  # int
        learning_rate,
        min_sum_hessian_in_leaf,
        feature_fraction,
        lambda_l1,
        lambda_l2,
        min_gain_to_split,
        max_depth):  # int
    # LightGBM expects the next three parameters to be integers, so cast them
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
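The rest of the function body (training a model and returning a validation AUC) is not shown, and neither is the optimizer driving it. A sketch of how such a black-box function is typically consumed, assuming the bayes_opt package; the bounds here are illustrative, not the original values:

from bayes_opt import BayesianOptimization

bounds_LGB = {
    'num_leaves': (5, 20),
    'min_data_in_leaf': (5, 20),
    'learning_rate': (0.01, 0.3),
    'min_sum_hessian_in_leaf': (0.00001, 0.01),
    'feature_fraction': (0.05, 0.5),
    'lambda_l1': (0, 5.0),
    'lambda_l2': (0, 5.0),
    'min_gain_to_split': (0, 1.0),
    'max_depth': (3, 15),
}
LGB_BO = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=13)
LGB_BO.maximize(init_points=5, n_iter=10)  # maximizes the AUC returned by LGB_bayesian
print(LGB_BO.max['target'], LGB_BO.max['params'])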
fold 1
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17] valid_0's auc: 0.770586
[0.52559221 0.40000825 0.43907974 ... 0.40122056 0.46515425 0.56678622]
fold 2
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17] valid_0's auc: 0.770586
[0.52559221 0.40000825 0.43907974 ... 0.40122056 0.46515425 0.56678622]
fold 3
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17] valid_0's auc: 0.770586
[0.52559221 0.40000825 0.43907974 ... 0.40122056 0.46515425 0.56678622]
fold 4
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000302 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17] valid_0's auc: 0.770586
[0.52559221 0.40000825 0.43907974 ... 0.40122056 0.46515425 0.56678622]
fold 5
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[17] valid_0's auc: 0.770586
[0.52559221 0.40000825 0.43907974 ... 0.40122056 0.46515425 0.56678622]
CV AUC: 0.81
Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[52] valid_0's binary_logloss: 0.429146
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
# Keep only part of the data; binarize the target: 1 if the arrival delay exceeds 10 minutes
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1
# categorical_feats = ["AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT","ORIGIN_AIRPORT"]
categorical_feats = [f for f in data.columns if data[f].dtype == 'object']
# Convert these columns to categorical features (integer codes, not one-hot)
for f_ in categorical_feats:
    data[f_], _ = pd.factorize(data[f_])
    # Set feature type as categorical
    data[f_] = data[f_].astype('category')
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["ARRIVAL_DELAY"], axis=1), data["ARRIVAL_DELAY"],
    random_state=10, test_size=0.25)
[LightGBM] [Info] Number of positive: 1600, number of negative: 5594
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1406
[LightGBM] [Info] Number of data points in the train set: 7194, number of used features: 7
[1] valid_0's binary_logloss: 0.479157
[2] valid_0's binary_logloss: 0.46882
[3] valid_0's binary_logloss: 0.454724
[4] valid_0's binary_logloss: 0.445913
[5] valid_0's binary_logloss: 0.440924
[6] valid_0's binary_logloss: 0.438309
[7] valid_0's binary_logloss: 0.433886
[8] valid_0's binary_logloss: 0.432747
[9] valid_0's binary_logloss: 0.431001
[10] valid_0's binary_logloss: 0.429621
   feature              importance_gain  importance_split  trn_score
0  AIRLINE                   153.229680                15   0.764829
1  FLIGHT_NUMBER             189.481180                23   0.764829
2  DESTINATION_AIRPORT      1036.401096                23   0.764829
3  ORIGIN_AIRPORT            650.938854                22   0.764829
4  AIR_TIME                  119.763649                17   0.764829
5  DEPARTURE_TIME            994.109417                37   0.764829
6  DISTANCE                   93.170790                13   0.764829
null_imp_df = pd.DataFrame()
nb_runs = 10
import time
start = time.time()
dsp = ''
for i in range(nb_runs):
    # Get the feature importances for this run
    imp_df = get_feature_importances(X_train, X_test, y_train, y_test, shuffle=True)
    imp_df['run'] = i + 1
    # Concatenate the importances across runs
    null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    # Erase the previous status message
    for l in range(len(dsp)):
        print('\b', end='', flush=True)
    # Display current run and time spent
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)
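The get_feature_importances helper called above is not shown in this excerpt. A minimal sketch following the usual null-importance recipe (optionally shuffle the target to break its link with the features, fit a model, record gain/split importances); every parameter here is an assumption:

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def get_feature_importances(X_train, X_test, y_train, y_test, shuffle=False):
    y = y_train.values
    if shuffle:
        # A shuffled target yields the null distribution of importances
        y = np.random.permutation(y)
    dtrain = lgb.Dataset(X_train, y, free_raw_data=False)
    params = {'objective': 'binary', 'metric': 'auc',
              'num_leaves': 31, 'learning_rate': 0.1, 'verbose': -1}
    clf = lgb.train(params, dtrain, num_boost_round=100)
    imp_df = pd.DataFrame()
    imp_df['feature'] = list(X_train.columns)
    imp_df['importance_gain'] = clf.feature_importance(importance_type='gain')
    imp_df['importance_split'] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(X_train))
    return imp_df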
    # Return the last mean / std values
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]
# features = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]
# score_feature_selection(df=data[features], train_features=features, target=data['TARGET'])
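Only the tail of score_feature_selection appears above. A sketch of a body consistent with that fragment, assuming lgb.cv with AUC (older LightGBM versions key the result dict as 'auc-mean'/'auc-stdv', matching the return line); the hyperparameters are illustrative:

import lightgbm as lgb

def score_feature_selection(df, train_features, cat_feats=None, target='ARRIVAL_DELAY'):
    # Fit on the selected features only
    dtrain = lgb.Dataset(df[train_features], df[target],
                         categorical_feature=cat_feats, free_raw_data=False)
    params = {'objective': 'binary', 'metric': 'auc',
              'num_leaves': 31, 'learning_rate': 0.1, 'verbose': -1}
    # Cross-validated AUC per boosting round
    hist = lgb.cv(params, dtrain, num_boost_round=200, nfold=5, seed=17)
    # Return the last mean / std values
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]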
for threshold in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99]:
    split_feats = [_f for _f, _score, _ in correlation_scores if _score >= threshold]
    split_cat_feats = [_f for _f, _score, _ in correlation_scores
                       if (_score >= threshold) & (_f in categorical_feats)]
    gain_feats = [_f for _f, _, _score in correlation_scores if _score >= threshold]
    gain_cat_feats = [_f for _f, _, _score in correlation_scores
                      if (_score >= threshold) & (_f in categorical_feats)]
    print('Results for threshold %3d' % threshold)
    split_results = score_feature_selection(data, train_features=split_feats, cat_feats=split_cat_feats)
    print('\t SPLIT : %.6f +/- %.6f' % (split_results[0], split_results[1]))
    gain_results = score_feature_selection(data, train_features=gain_feats, cat_feats=gain_cat_feats)
    print('\t GAIN : %.6f +/- %.6f' % (gain_results[0], gain_results[1]))
Results for threshold 0
SPLIT : 0.757882 +/- 0.012114
GAIN : 0.757882 +/- 0.012114
Results for threshold 10
SPLIT : 0.756999 +/- 0.011506
GAIN : 0.757882 +/- 0.012114
Results for threshold 20
SPLIT : 0.757959 +/- 0.012558
GAIN : 0.757882 +/- 0.012114
Results for threshold 30
SPLIT : 0.757959 +/- 0.012558
GAIN : 0.757882 +/- 0.012114
Results for threshold 40
SPLIT : 0.745729 +/- 0.013217
GAIN : 0.757882 +/- 0.012114
Results for threshold 50
SPLIT : 0.745729 +/- 0.013217
GAIN : 0.757882 +/- 0.012114
Results for threshold 60
SPLIT : 0.727063 +/- 0.006758
GAIN : 0.757882 +/- 0.012114
Results for threshold 70
SPLIT : 0.727063 +/- 0.006758
GAIN : 0.756999 +/- 0.011506
Results for threshold 80
SPLIT : 0.727063 +/- 0.006758
GAIN : 0.756999 +/- 0.011506
Results for threshold 90
SPLIT : 0.727063 +/- 0.006758
GAIN : 0.756999 +/- 0.011506
Results for threshold 99
SPLIT : 0.727063 +/- 0.006758
GAIN : 0.757959 +/- 0.012558
# Custom objective: for positive samples predicted with probability below 0.1
# (labelled positive but predicted probability < 0.1), double the gradient.
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))  # raw scores -> probabilities via sigmoid
    grad = [(p - l) if p >= 0.1 else 2 * (p - l) for (p, l) in zip(preds, labels)]
    hess = [p * (1. - p) if p >= 0.1 else 2 * p * (1. - p) for p in preds]
    return grad, hess  # gradient and hessian, as LightGBM's custom-objective contract requires
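A sketch of how this objective could be plugged into training. In LightGBM versions before 4.0, lgb.train takes fobj= directly; in 4.x the callable goes into params['objective'] instead. Note that with a custom objective, predict() returns raw scores, so a sigmoid is applied before thresholding; the data names reuse X_train/y_train from earlier cells and the parameters are assumptions:

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {'boosting_type': 'gbdt', 'metric': 'auc',
          'num_leaves': 31, 'learning_rate': 0.05, 'verbose': -1}
gbm = lgb.train(params, lgb_train, num_boost_round=100,
                valid_sets=lgb_eval,
                fobj=loglikelihood)  # pre-4.0 signature

y_raw = gbm.predict(X_test)          # raw scores, not probabilities
y_prob = 1. / (1. + np.exp(-y_raw))  # map to [0, 1] with the sigmoid
pred = [1 if p > 0.5 else 0 for p in y_prob]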