# 1,364 社区成员  (appears to be page residue from the web export; not part of the notebook code)




# Import the required libraries and apply notebook-wide configuration
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV # cross-validation, grid search
pd.options.display.max_columns = None # remove the limit on the number of displayed columns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")# suppress warnings to keep the output clean
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
## 1) Load the training and test sets (space-separated CSV files).
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

# Working frames: the raw data without the SaleID column.
Train = Train_data.drop(columns=['SaleID'])
Test = Test_data.drop(columns=['SaleID'])
Train
Test

# Overview of the training set, then of the test set.
Train.info()
Test.info()

# notRepairedDamage uses '-' as a placeholder: turn it into NaN and cast the
# column to float64 in one chained step.
Train['notRepairedDamage'] = (
    Train['notRepairedDamage'].replace('-', np.nan).astype('float64')
)
Test['notRepairedDamage'] = (
    Test['notRepairedDamage'].replace('-', np.nan).astype('float64')
)

# Confirm the conversion by listing the distinct values in both frames.
Train['notRepairedDamage'].unique(), Test['notRepairedDamage'].unique()
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
# Load the training data again for a stand-alone XGBoost grid search.
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')

# Feature engineering: discard the columns not used in this experiment and
# encode notRepairedDamage as 0/1 (the '-' placeholder is mapped to 0).
unused_columns = ['SaleID', 'name', 'regDate', 'model', 'brand', 'regionCode', 'seller', 'offerType']
train_data = train_data.drop(columns=unused_columns)
damage_codes = {'0.0': 0, '-': 0, '1.0': 1}
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].map(damage_codes)

# Separate the target from the features.
y = train_data['price']
X = train_data.drop(columns='price')

# Hold out 30% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Grid search, scored by negative MAE with 3-fold cross-validation.
params = {
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.1],
    'subsample': [0.5, 0.8],
}
model = GridSearchCV(
    XGBRegressor(),
    params,
    scoring='neg_mean_absolute_error',
    cv=3,
)
model.fit(X_train, y_train)

# Report the best parameters, score and estimator.
for label, value in (
    ('最佳参数为:\n', model.best_params_),
    ('最佳分数为:\n', model.best_score_),
    ('最佳模型为:\n', model.best_estimator_),
):
    print(label, value)
# Summary statistics of the numeric columns: test set first, then training set.
Test.describe()
Train.describe()

# Inspect the seller value counts, then remove the column from both frames.
Train['seller'].value_counts()
Train = Train.drop(columns=['seller'])
Test = Test.drop(columns=['seller'])

# Same treatment for offerType.
Train['offerType'].value_counts()
Train = Train.drop(columns=['offerType'])
Test = Test.drop(columns=['offerType'])
Train
Test
# Count the power values above 600; these are treated as invalid and replaced
# below (the original note reports 143 such values).
Train[Train['power'] > 600]['power'].count()
Test[Test['power'] > 600]['power'].count()


def _cap_power(power, limit=600):
    """Return *power* with every value above *limit* replaced by the median.

    The median is taken over the whole input series (outliers included) and
    is computed once up front; the original recomputed it inside the lambda
    for every row, which made the replacement quadratic in the row count.
    """
    median = power.median()
    return power.map(lambda value: median if value > limit else value)


# Each frame uses its own median.
Train['power'] = _cap_power(Train['power'])
Test['power'] = _cap_power(Test['power'])

# Check the result with a histogram of each column.
Train['power'].plot.hist()
Test['power'].plot.hist()
# Linear correlation of every feature with the sale price, strongest first.
Train.corr().unstack()['price'].sort_values(ascending=False)

# Feature selection rationale (translated from the original notes):
#   1. Start from features whose |correlation| with price is below 0.1, then
#      judge each one by its real-world meaning rather than correlation alone.
#   2. v_2, v_6, v_1, v_14, v_13, v_7 are continuous; with a near-zero linear
#      correlation they are dropped to reduce noise and over-fitting.
#   3. regionCode and brand are not continuous, so a low linear correlation
#      says little about their influence on price; they are kept.
#   4. name (listing name): the notes report 99662 distinct values in the
#      training set and consider it irrelevant to price; dropped.
#   5. creatDate (listing date): the notes report the range
#      [20150618, 20160407] and a correlation of -0.001293 with regDate;
#      considered to have little influence on price; dropped.
low_value_columns = ['v_2', 'v_6', 'v_1', 'v_14', 'v_13', 'v_7', 'name', 'creatDate']
Train = Train.drop(columns=low_value_columns)
Test = Test.drop(columns=low_value_columns)
Train.shape, Test.shape

# Correlations with price again, now on the reduced feature set.
Train.corr().unstack()['price'].sort_values(ascending=False)
# Columns with missing values in the training set, with their counts.
train_missing = Train.isnull().sum()
train_missing[train_missing > 0]

# Same for the test set.
test_missing = Test.isnull().sum()
test_missing[test_missing > 0]

# Rows of the training set whose model code is missing.
Train[Train['model'].isnull()]

# The model code generally depends on brand, bodyType, gearbox and power:
# look at the model codes of cars sharing those four values with the
# incomplete row and pick the most frequent one.
Train[(Train['brand'] == 37) &
      (Train['bodyType'] == 6.0) &
      (Train['gearbox'] == 1.0) &
      (Train['power'] == 190)]['model'].value_counts()

# Fill the missing model code with 157.0 (the value chosen from the query
# above). The row is selected by its null mask instead of the hard-coded label
# 38424: `.loc` with a label that does not exist would silently append a new
# row. The original also ran this assignment twice; once is enough.
# NOTE(review): this assumes every row with a missing model shares the
# attributes queried above (the original notebook filled a single row).
Train.loc[Train['model'].isnull(), 'model'] = 157.0

# Check the result.
Train.info()
# How many bodyType values are missing in each frame?
Train['bodyType'].isnull().value_counts()
Test['bodyType'].isnull().value_counts()

# Few values are missing, so look at how bodyType relates to price before
# deciding whether to drop it (regression plot, similar to a scatter plot).
sns.regplot(x='bodyType', y='price', data=Train)

# Original conclusion: prices differ noticeably between body types, so the
# feature is kept and its gaps are filled. Show the value distribution first.
print(Train['bodyType'].value_counts())
print('\n')
print(Test['bodyType'].value_counts())

# Per the original note, body type 0.0 is the most frequent in both frames,
# so it is used as the fill value.
Train['bodyType'] = Train['bodyType'].fillna(0.0)
Test['bodyType'] = Test['bodyType'].fillna(0.0)
# How many fuelType values are missing in each frame?
print(Train['fuelType'].isnull().value_counts())
print('\n')
print(Test['fuelType'].isnull().value_counts())

# Few values are missing, so look at how fuelType relates to price before
# deciding whether to drop it (regression plot, similar to a scatter plot).
# (The original call was missing its closing parenthesis.)
sns.regplot(x='fuelType', y='price', data=Train)

# Hypothesis from the original notes: fuel type is related to body type.
# Record, for each body type, the most frequent fuelType in each frame.
body_types = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
dict_enu_train, dict_enu_test = {}, {}
for i in body_types:
    dict_enu_train[i] = Train[Train['bodyType'] == i]['fuelType'].mode()[0]
    dict_enu_test[i] = Test[Test['bodyType'] == i]['fuelType'].mode()[0]
# The original notes observe that both dictionaries hold the same content,
# which is why only dict_enu_train is used for filling below.

# For the rows whose fuelType is missing, collect the index labels per body type.
dict_index_train, dict_index_test = {}, {}
for bodytype in body_types:
    dict_index_train[bodytype] = Train[(Train['bodyType'] == bodytype) & (Train['fuelType'].isnull())].index.tolist()
    dict_index_test[bodytype] = Test[(Test['bodyType'] == bodytype) & (Test['fuelType'].isnull())].index.tolist()

# Fill fuelType per body type. A single `.loc[rows, column]` assignment is
# used on purpose: chained indexing may assign to a temporary copy and fail
# silently. The original repeated this loop a second time; the repeat had no
# effect and has been removed.
for bt, ft in dict_enu_train.items():
    Train.loc[dict_index_train[bt], 'fuelType'] = ft
    Test.loc[dict_index_test[bt], 'fuelType'] = ft
# Few gearbox values are missing, so look at how gearbox relates to price
# before deciding whether to drop it (regression plot, similar to a scatter plot).
sns.regplot(x='gearbox', y='price', data=Train)

# Original conclusion: the gearbox type does not change the price much.
# Dropping incomplete rows might be viable, but to avoid shrinking the sample
# the feature is kept and its gaps are filled. Show the gearbox distribution.
print(Train['gearbox'].value_counts())
print('\n')
print(Test['gearbox'].value_counts())

# Training set: fill the gaps with 0.0.
Train['gearbox'] = Train['gearbox'].fillna(0.0)

# Test set: no row may be removed, otherwise predictions would be incomplete.
# The original note reports 1910 missing gearbox values here and fills them
# with the dominant value 0.0.
Test['gearbox'] = Test['gearbox'].fillna(0.0)

# Verify the fill.
Train.info()
Test.info()
# Missing-value counts for notRepairedDamage (the original notes remark that
# the share of missing values is not small in either frame).
print(Train['notRepairedDamage'].isnull().value_counts())
print('\n')
print(Test['notRepairedDamage'].isnull().value_counts())

# Value distribution in both frames.
print(Train['notRepairedDamage'].value_counts())
print('\n')
print(Test['notRepairedDamage'].value_counts())

# Linear correlation with price.
Train[['notRepairedDamage', 'price']].corr()['price']

# Regression plot of the feature against price (similar to a scatter plot).
sns.regplot(x='notRepairedDamage', y='price', data=Train)

# Original observation: across the training set, cars with unrepaired damage
# sell for more than cars whose damage was repaired. To keep things simple,
# every gap is still filled with the most frequent value, 0.0.
Train['notRepairedDamage'] = Train['notRepairedDamage'].fillna(0.0)
Test['notRepairedDamage'] = Test['notRepairedDamage'].fillna(0.0)

# Final check of the filled frames.
Train.info()
Test.info()
# Histogram of the raw target.
plt.hist(Train['price'], orientation='vertical', histtype='bar')
plt.show()

# Histogram of the log-transformed target; the original notes find this
# distribution more even, so the models are trained on log(price).
log_price = np.log(Train['price'])
plt.hist(log_price, orientation='vertical', histtype='bar')
plt.show()
Train['price'] = log_price
Train

# Persist the cleaned frames and load them back from disk.
Train.to_csv('Train1.csv', index=False)
Test.to_csv('Test1.csv', index=False)
train = pd.read_csv('Train1.csv', sep=',')
train
test = pd.read_csv('Test1.csv', sep=',')
test

# Split the target column off the feature frame.
train_y = train.pop('price')
train_y
train

from sklearn.model_selection import train_test_split

## Hold out 30% of the rows for validation (no fixed seed).
x_train, x_val, y_train, y_val = train_test_split(train, train_y, test_size=0.3)
x_train.shape
x_val.shape
import time

begin = time.time()

## Fit a random forest on log(price) and score it on the validation split.
print('Predict RF...')
model_rf = RandomForestRegressor(n_estimators=250, n_jobs=-1).fit(x_train, y_train)
val_rf = model_rf.predict(x_val)

# MAE is computed on the original price scale, hence the exp() on both sides.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_rf))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间:", end - begin)

# Test-set predictions, converted back to the price scale.
subA_rf = np.exp(model_rf.predict(test))
import time

begin = time.time()

# Learning curve: validation MAE (price scale) as a function of n_estimators.
axisx = np.arange(230, 270, 10)
rs = []
for i in axisx:
    # The original ignored the loop variable and always trained with
    # n_estimators=260, so every point of the curve measured the same setting.
    # Loop-local names are used so the sweep does not overwrite model_rf /
    # val_rf, which the stacking step reads later.
    candidate_rf = RandomForestRegressor(n_estimators=int(i), n_jobs=-1)
    candidate_rf.fit(x_train, y_train)
    candidate_val = candidate_rf.predict(x_val)
    rs.append(mean_absolute_error(np.exp(y_val), np.exp(candidate_val)))

# Best setting and its MAE.
print(axisx[rs.index(min(rs))], min(rs))

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="green", label="RF")  # legend label corrected from "XGB"
plt.legend()
plt.show()

end = time.time()
print("运行时间:", end - begin)
import time

begin = time.time()

## Fit XGBoost on log(price) and score it on the validation split.
print('Predict XGB...')
# 'reg:linear' is the deprecated name of the squared-error objective and is
# rejected by recent XGBoost releases; 'reg:squarederror' is its replacement.
model_xgb = XGBRegressor(base_score=0.5, booster='gbtree', learning_rate=0.13999999999999996, n_estimators=250,
                         n_jobs=1, objective='reg:squarederror', random_state=1, reg_alpha=0, reg_lambda=1,
                         subsample=0.7999999999999999)
model_xgb = model_xgb.fit(x_train, y_train)
val_xgb = model_xgb.predict(x_val)

# MAE is computed on the original price scale, hence the exp() on both sides.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_xgb))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间:", end - begin)

# Test-set predictions, converted back to the price scale.
subA_xgb = np.exp(model_xgb.predict(test))
import time

begin = time.time()

## Fit gradient-boosted trees on log(price) and score them on the validation split.
print('Predict GBDT...')
model_gbdt = GradientBoostingRegressor(
    loss='squared_error',
    subsample=0.85,
    max_depth=5,
    n_estimators=250,
).fit(x_train, y_train)
val_gbdt = model_gbdt.predict(x_val)

# MAE is computed on the original price scale, hence the exp() on both sides.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_gbdt))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间:", end - begin)

# Test-set predictions, converted back to the price scale.
subA_gbdt = np.exp(model_gbdt.predict(test))
import time

begin = time.time()

# Fit LightGBM on log(price) and score it on the validation split.
model_lgb = lgb.LGBMRegressor(
    n_estimators=14000,
    learning_rate=0.04,
    num_leaves=60,
    subsample=0.85,
).fit(x_train, y_train)
val_lgb = model_lgb.predict(x_val)

# MAE is computed on the original price scale, hence the exp() on both sides.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_lgb))

end = time.time()
print("运行时间:", end - begin)

# Test-set predictions on the price scale; the MAE is reported last.
subA_lgb = np.exp(model_lgb.predict(test))
print('MAE_Weighted:', MAE_Weighted)
# Learning curve: validation MAE (price scale) as a function of subsample.
import time

begin = time.time()
axisx = [0.6, 0.7, 0.8, 0.85, 0.9]
rs = []
for i in axisx:
    # The original ignored the loop variable and always used subsample=0.85.
    # subsample_freq=1 is required as well: LightGBM only performs bagging
    # when the bagging frequency is non-zero, so without it the subsample
    # value has no effect. Loop-local names keep model_lgb / val_lgb intact.
    candidate_lgb = lgb.LGBMRegressor(n_estimators=14000, learning_rate=0.04, num_leaves=60,
                                      subsample=i, subsample_freq=1)
    candidate_lgb.fit(x_train, y_train)
    candidate_val = candidate_lgb.predict(x_val)
    rs.append(mean_absolute_error(np.exp(y_val), np.exp(candidate_val)))

# Best setting and its MAE.
print(axisx[rs.index(min(rs))], min(rs))

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="green", label="LGB")  # legend label corrected from "XGB"
plt.legend()
plt.show()

end = time.time()
print("运行时间:", end - begin)
## Stacking
## Layer 1: the base models' predictions become the meta-features.
## Every meta-feature must be on the same (log-price) scale in all three frames.
train_rf_pred = model_rf.predict(x_train)
train_gbdt_pred = model_gbdt.predict(x_train)
train_xgb_pred = model_xgb.predict(x_train)

Strak_X_train = pd.DataFrame()
Strak_X_train['Method_1'] = train_rf_pred
Strak_X_train['Method_2'] = train_gbdt_pred
Strak_X_train['Method_3'] = train_xgb_pred

Strak_X_val = pd.DataFrame()
Strak_X_val['Method_1'] = val_rf
Strak_X_val['Method_2'] = val_gbdt
Strak_X_val['Method_3'] = val_xgb

# subA_* were already converted to the price scale with exp(); bring them back
# to the log scale so the test meta-features match the train/val ones. The
# original fed price-scale values (and then exp() of them) to the meta-model.
Strak_X_test = pd.DataFrame()
Strak_X_test['Method_1'] = np.log(subA_rf)
Strak_X_test['Method_2'] = np.log(subA_gbdt)
Strak_X_test['Method_3'] = np.log(subA_xgb)

Strak_X_train
y_train

## Layer 2: the meta-model. This refits the existing model_lgb object in place,
## so model_lgb and model_lgb_Stacking are the same estimator afterwards.
## NOTE: the printed labels say "LR" although the meta-model is LightGBM.
model_lgb_Stacking = model_lgb.fit(Strak_X_train, y_train)

## Training split (MAE on the price scale).
train_pre_Stacking = model_lgb_Stacking.predict(Strak_X_train)
print('MAE of Stacking-LR:', mean_absolute_error(np.exp(y_train), np.exp(train_pre_Stacking)))

## Validation split (MAE on the price scale).
val_pre_Stacking = model_lgb_Stacking.predict(Strak_X_val)
print('MAE of Stacking-LR:', mean_absolute_error(np.exp(y_val), np.exp(val_pre_Stacking)))

## Test set: predict in log space, then convert the result to the price scale.
print('Predict Stacking-LR...')
subA_Stacking = np.exp(model_lgb_Stacking.predict(Strak_X_test))
from xgboost import XGBRegressor
# Repeat of the XGBoost grid search (negative MAE, 3-fold cross-validation).
params = {'n_estimators': [150, 200, 250],
          'learning_rate': [0.1],
          'subsample': [0.5, 0.8]}
model = GridSearchCV(estimator=XGBRegressor(),
                     param_grid=params,
                     scoring='neg_mean_absolute_error',
                     cv=3)
# X_train still holds the feature split made from X/y, but y_train has since
# been rebound to the target of a different random split (log-price of the
# cleaned frame). Fitting on that pair would silently mismatch rows and
# targets, so the target is re-derived from y using X_train's own index.
model.fit(X_train, y.loc[X_train.index])

# Report the best parameters, score and estimator.
print('最佳参数为:\n', model.best_params_)
print('最佳分数为:\n', model.best_score_)
print('最佳模型为:\n', model.best_estimator_)
import pandas as pd
# Load the training and test sets.
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

# Columns treated as categorical.
cat_cols = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

# Take the target out of the training frame before dropping the unused
# columns (the original re-read the whole CSV just to get this column).
y_train = train_data.pop('price')
dropped_cols = ['name', 'seller', 'offerType', 'creatDate']
train_data = train_data.drop(dropped_cols, axis=1)
test_data = test_data.drop(dropped_cols, axis=1)

# Encode each categorical column with ONE code table shared by both frames.
# The original encoded train and test independently, so the same raw value
# could get different integer codes whenever the two frames did not contain
# exactly the same set of values. Missing values are encoded as -1, as before.
n_train = len(train_data)
for cat_col in cat_cols:
    combined = pd.concat([train_data[cat_col], test_data[cat_col]], ignore_index=True)
    codes = combined.astype('category').cat.codes.to_numpy()
    train_data[cat_col] = codes[:n_train]
    test_data[cat_col] = codes[n_train:]

# Train LightGBM and predict.
# NOTE(review): SaleID is not dropped above, so it is part of the feature
# matrix for both frames — confirm that this is intended.
import lightgbm as lgb
X_train = train_data
X_test = test_data
model_lgb = lgb.LGBMRegressor(num_leaves=127, objective='regression', learning_rate=0.05, n_estimators=1000, max_depth=-1)
model_lgb.fit(X_train, y_train)
subA_lgb = model_lgb.predict(X_test)

# Save the predictions.
sub = pd.DataFrame()
sub['SaleID'] = test_data['SaleID']
sub['price'] = subA_lgb
sub.to_csv('predictions.csv', index=False)