阿里云天池学习赛——二手车价格预测

人工智能技术应用一班-邓铭 2023-05-22 07:06:52

# 导入相关库及配置

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV # 交叉验证，网格搜索

pd.options.display.max_columns = None # 取消最大列显示限制

%matplotlib inline

import warnings

warnings.filterwarnings("ignore")# 过滤警告信息，保证清爽输出

from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import cross_val_score

import lightgbm as lgb

## 1) Load the training and test sets.
Train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

# Work on copies without the SaleID column.
Train = Train_data.drop(columns=['SaleID'])
Test = Test_data.drop(columns=['SaleID'])

Train

Test

# Overview - training set
Train.info()

# Overview - test set
Test.info()

# Turn the '-' placeholder into NaN and make the column numeric.
for frame in (Train, Test):
    cleaned = frame['notRepairedDamage'].replace('-', np.nan)
    frame['notRepairedDamage'] = cleaned.astype('float64')

# Check that the conversion worked.
Train['notRepairedDamage'].unique(), Test['notRepairedDamage'].unique()

import pandas as pd

from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV

# Read the raw training data again for a quick baseline.
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')

# Feature engineering: discard columns not used by this baseline.
unused_cols = ['SaleID', 'name', 'regDate', 'model', 'brand', 'regionCode', 'seller', 'offerType']
train_data = train_data.drop(columns=unused_cols)

# Encode notRepairedDamage; the '-' placeholder is mapped to 0.
damage_codes = {'0.0': 0, '-': 0, '1.0': 1}
train_data['notRepairedDamage'] = train_data['notRepairedDamage'].map(damage_codes)

y = train_data['price']
X = train_data.drop(columns='price')

# Hold out 30% of the rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Grid search for the best parameters (3-fold CV, scored by negative MAE).
params = {
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.1],
    'subsample': [0.5, 0.8],
}
model = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=3,
)
model.fit(X_train, y_train)

# Report the best configuration found.
print('最佳参数为：\n', model.best_params_)
print('最佳分数为：\n', model.best_score_)
print('最佳模型为：\n', model.best_estimator_)

# Descriptive statistics - test set
Test.describe()

# Descriptive statistics - training set
Train.describe()

# Inspect the value distribution of `seller`, then drop it from both sets.
Train['seller'].value_counts()
Train = Train.drop(columns=['seller'])
Test = Test.drop(columns=['seller'])

# Same treatment for `offerType`.
Train['offerType'].value_counts()
Train = Train.drop(columns=['offerType'])
Test = Test.drop(columns=['offerType'])

Train

Test

# Author's note: 143 values are invalid (power > 600) and need replacing.
Train[Train['power'] > 600]['power'].count()

Test[Test['power'] > 600]['power'].count()

# Replace out-of-range power values with the column median.
# The median is computed once per data set, before any replacement, instead of
# being recomputed inside the lambda for every offending row.
train_power_median = Train['power'].median()
test_power_median = Test['power'].median()
Train['power'] = Train['power'].map(lambda x: train_power_median if x > 600 else x)
Test['power'] = Test['power'].map(lambda x: test_power_median if x > 600 else x)

# Check that the replacement worked.
Train['power'].plot.hist()

Test['power'].plot.hist()

# Linear correlation of every feature with the sale price.
Train.corr().unstack()['price'].sort_values(ascending=False)

# Author's reasoning for the features removed below:
# - Step 1: shortlist features whose |correlation| with price is below 0.1.
#   Step 2: set correlation aside and consider each feature's real-world
#   effect on price.
# - v_2, v_6, v_1, v_14, v_13, v_7: continuous variables, so the correlation is
#   meaningful; being very low, they are dropped to reduce noise/overfitting.
# - regionCode, brand: not continuous, so a low linear correlation does not
#   show that their values are irrelevant to price; kept.
# - name: listing name with 99662 distinct values in the training set;
#   judged not to affect price; dropped.
# - creatDate: listing date within [20150618, 20160407], a short interval,
#   with correlation of only -0.001293 to regDate; judged to have little
#   effect on price; dropped.
discarded_cols = ['v_2', 'v_6', 'v_1', 'v_14', 'v_13', 'v_7', 'name', 'creatDate']
for frame in (Train, Test):
    frame.drop(discarded_cols, axis=1, inplace=True)

Train.shape, Test.shape

# Re-check the correlations after dropping.
Train.corr().unstack()['price'].sort_values(ascending=False)

# Columns with missing values - training set
Train.isnull().sum()[Train.isnull().sum() > 0]

# Columns with missing values - test set
Test.isnull().sum()[Test.isnull().sum() > 0]

Train[Train['model'].isnull()]

# `model` (vehicle model code) is assumed to depend on brand, bodyType, gearbox
# and power: look at cars sharing those four values with the row that lacks a
# model and take the most frequent model code.
Train[(Train['brand'] == 37) &
      (Train['bodyType'] == 6.0) &
      (Train['gearbox'] == 1.0) &
      (Train['power'] == 190)]['model'].value_counts()

# Fill the missing value with 157.0 (the assignment used to be duplicated).
Train.loc[38424, 'model'] = 157.0

# Check the result of the fill.
Train.info()

# Count missing bodyType values.
Train['bodyType'].isnull().value_counts()

Test['bodyType'].isnull().value_counts()

# Few bodyType values are missing; look at its relationship with price before
# deciding whether to drop the feature.
# Regression plot of the feature against price (scatter-like).
sns.regplot(x='bodyType', y='price', data=Train)

# Author's reading of the plot: prices differ noticeably between body types,
# so the feature is kept and its gaps are filled.
# Distribution of body types in both sets.
print(Train['bodyType'].value_counts())
print('\n')
print(Test['bodyType'].value_counts())

# Body type 0.0 (limousine) is the most common in both sets, so it is used to
# fill the missing values.
Train['bodyType'] = Train['bodyType'].fillna(0.0)
Test['bodyType'] = Test['bodyType'].fillna(0.0)

# Count missing fuelType values.
print(Train['fuelType'].isnull().value_counts())
print('\n')
print(Test['fuelType'].isnull().value_counts())

# Few fuelType values are missing; look at its relationship with price before
# deciding whether to drop the feature.
# Regression plot of the feature against price (scatter-like).
# The closing parenthesis was missing, which made the cell a syntax error.
sns.regplot(x='fuelType', y='price', data=Train)

# Hypothesis: fuel type is related to body type (e.g. a limousine is more
# likely petrol or electric, while a mixer truck is mostly diesel).
# Record the most frequent fuelType for each bodyType and use it to fill the
# missing fuelType values.
body_types = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]

dict_enu_train, dict_enu_test = {}, {}
for i in body_types:
    dict_enu_train[i] = Train[Train['bodyType'] == i]['fuelType'].mode()[0]
    dict_enu_test[i] = Test[Test['bodyType'] == i]['fuelType'].mode()[0]

# Author's observation: dict_enu_train and dict_enu_test hold the same content,
# so the training-set modes are used for both data sets below.

# For rows with a missing fuelType, collect the row labels per bodyType.
dict_index_train, dict_index_test = {}, {}
for bodytype in body_types:
    dict_index_train[bodytype] = Train[(Train['bodyType'] == bodytype) & (Train['fuelType'].isnull())].index.tolist()
    dict_index_test[bodytype] = Test[(Test['bodyType'] == bodytype) & (Test['fuelType'].isnull())].index.tolist()

# Fill fuelType for the rows of each bodyType. A single .loc[rows, column]
# assignment is used because chained indexing may assign to a temporary copy.
# (This loop used to appear twice; the second pass had nothing left to fill.)
for bt, ft in dict_enu_train.items():
    Train.loc[dict_index_train[bt], 'fuelType'] = ft
    Test.loc[dict_index_test[bt], 'fuelType'] = ft

# Few gearbox values are missing; look at its relationship with price before
# deciding whether to drop the feature.
# Regression plot of the feature against price (scatter-like).
sns.regplot(x='gearbox', y='price', data=Train)

# Author's reading: gearbox type does not strongly affect price. Dropping the
# rows with missing values might be workable, but to avoid overfitting from a
# smaller sample the feature is kept and its gaps are filled.
# Distribution of gearbox types in both sets.
print(Train['gearbox'].value_counts())
print('\n')
print(Test['gearbox'].value_counts())

# Training set: fill with 0.0.
Train['gearbox'] = Train['gearbox'].fillna(0.0)

# Test set: no rows may be dropped, or the predictions would be incomplete.
# Author's note: only 1910 gearbox values are missing there; they are filled
# with the dominant value 0.0 (manual).
Test['gearbox'] = Test['gearbox'].fillna(0.0)

# Check that the fill worked.
Train.info()

Test.info()

# Count missing notRepairedDamage values.
# Author's note: the share of missing values is not small in either set.
print(Train['notRepairedDamage'].isnull().value_counts())
print('\n')
print(Test['notRepairedDamage'].isnull().value_counts())

# Value distribution.
print(Train['notRepairedDamage'].value_counts())
print('\n')
print(Test['notRepairedDamage'].value_counts())

# Linear correlation with price.
Train[['notRepairedDamage', 'price']].corr()['price']

# Regression plot of the feature against price (scatter-like).
sns.regplot(x='notRepairedDamage', y='price', data=Train)

# Author's reading: across the training set, cars with unrepaired damage sell
# for more than cars whose damage was repaired.
# To keep things simple, all missing values are filled with the most common
# value, 0.0.
Train['notRepairedDamage'] = Train['notRepairedDamage'].fillna(0.0)
Test['notRepairedDamage'] = Test['notRepairedDamage'].fillna(0.0)

# Final check of the fill results.
Train.info()

Test.info()

# Histogram of the target (price).
plt.hist(Train['price'], orientation='vertical', histtype='bar')
plt.show()

# The log-transformed price is more evenly distributed, so the models are
# trained on log(price).
log_price = np.log(Train['price'])
plt.hist(log_price, orientation='vertical', histtype='bar')
plt.show()

Train['price'] = log_price

Train

# Persist the cleaned data, then read it back for modelling.
Train.to_csv('Train1.csv', index=False)
Test.to_csv('Test1.csv', index=False)

train = pd.read_csv('Train1.csv', sep=',')

train

test = pd.read_csv('Test1.csv', sep=',')

test

# Separate the (log-scale) target from the feature matrix.
train_y = train.pop('price')

train_y

train

from sklearn.model_selection import train_test_split

## Split the training data into a fitting part and a 30% validation part.
# random_state is fixed (matching the earlier train_test_split call) so that
# the validation MAE values reported below are reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(train, train_y, test_size=0.3, random_state=0)

x_train.shape

x_val.shape

import time

begin = time.time()

## Train a random forest on log(price) and score it on the validation split.
print('Predict RF...')
model_rf = RandomForestRegressor(n_estimators=250, n_jobs=-1).fit(x_train, y_train)
val_rf = model_rf.predict(x_val)

# MAE is measured on the original price scale, hence the exp() on both sides.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_rf))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间：", end - begin)

# Test-set predictions, converted back to the price scale.
subA_rf = np.exp(model_rf.predict(test))

import time

begin = time.time()

# Learning curve: validation MAE of the random forest vs. number of trees.
axisx = np.arange(230, 270, 10)
rs = []
for i in axisx:
    # Use the swept value; previously n_estimators was hard-coded to 260, so
    # every iteration trained the same configuration.
    # The last iteration (260 trees) leaves model_rf / val_rf in the same
    # configuration as before for the cells that follow.
    model_rf = RandomForestRegressor(n_estimators=int(i), n_jobs=-1)
    model_rf = model_rf.fit(x_train, y_train)
    val_rf = model_rf.predict(x_val)
    # MAE on the original price scale.
    MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_rf))
    rs.append(MAE_Weighted)

# Best tree count and its MAE.
print(axisx[rs.index(min(rs))], min(rs))

plt.figure(figsize=(20, 5))
# The curve belongs to the random forest (legend used to say "XGB").
plt.plot(axisx, rs, c="green", label="RF")
plt.legend()
plt.show()

end = time.time()
print("运行时间：", end - begin)

import time

begin = time.time()

## Train XGBoost on log(price) and score it on the validation split.
print('Predict XGB...')
# 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current name
# of the same squared-error objective.
model_xgb = XGBRegressor(base_score=0.5, booster='gbtree', learning_rate=0.13999999999999996, n_estimators=250,
                         n_jobs=1, objective='reg:squarederror', random_state=1, reg_alpha=0, reg_lambda=1,
                         subsample=0.7999999999999999)
model_xgb = model_xgb.fit(x_train, y_train)
val_xgb = model_xgb.predict(x_val)

# MAE on the original price scale.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_xgb))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间：", end - begin)

# Test-set predictions, converted back to the price scale.
subA_xgb = np.exp(model_xgb.predict(test))

import time

begin = time.time()

## Train gradient boosting on log(price) and score it on the validation split.
print('Predict GBDT...')
model_gbdt = GradientBoostingRegressor(
    loss='squared_error',
    subsample=0.85,
    max_depth=5,
    n_estimators=250,
).fit(x_train, y_train)
val_gbdt = model_gbdt.predict(x_val)

# MAE on the original price scale.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_gbdt))
print('MAE_Weighted:', MAE_Weighted)

end = time.time()
print("运行时间：", end - begin)

# Test-set predictions, converted back to the price scale.
subA_gbdt = np.exp(model_gbdt.predict(test))

import time

begin = time.time()

# Train LightGBM on log(price) and score it on the validation split.
model_lgb = lgb.LGBMRegressor(
    n_estimators=14000,
    learning_rate=0.04,
    num_leaves=60,
    subsample=0.85,
).fit(x_train, y_train)
val_lgb = model_lgb.predict(x_val)

# MAE on the original price scale.
MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(val_lgb))

end = time.time()
print("运行时间：", end - begin)

# Test-set predictions, converted back to the price scale.
subA_lgb = np.exp(model_lgb.predict(test))

print('MAE_Weighted:', MAE_Weighted)

#学习曲线

import time

begin = time.time()

# Learning curve: validation MAE of LightGBM vs. the subsample ratio.
axisx = [0.6, 0.7, 0.8, 0.85, 0.9]
rs = []
for i in axisx:
    # Use the swept value; previously subsample was hard-coded to 0.85, so every
    # iteration trained the same configuration. subsample_freq=1 is set because
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero.
    # Loop-local names are used so that model_lgb / val_lgb from the previous
    # cell keep their configuration for the cells that follow.
    candidate = lgb.LGBMRegressor(n_estimators=14000, learning_rate=0.04, num_leaves=60,
                                  subsample=i, subsample_freq=1)
    candidate.fit(x_train, y_train)
    candidate_val = candidate.predict(x_val)
    # MAE on the original price scale.
    MAE_Weighted = mean_absolute_error(np.exp(y_val), np.exp(candidate_val))
    rs.append(MAE_Weighted)

# Best subsample ratio and its MAE.
print(axisx[rs.index(min(rs))], min(rs))

plt.figure(figsize=(20, 5))
# The curve belongs to LightGBM (legend used to say "XGB").
plt.plot(axisx, rs, c="green", label="LGB")
plt.legend()
plt.show()

end = time.time()
print("运行时间：", end - begin)

## Stacking
## Layer 1: the base models' log-price predictions become the meta-features.
train_rf_pred = model_rf.predict(x_train)
train_gbdt_pred = model_gbdt.predict(x_train)
train_xgb_pred = model_xgb.predict(x_train)

# NOTE(review): these are in-sample predictions of the base models; out-of-fold
# predictions would give the meta-model a less optimistic training signal.
Strak_X_train = pd.DataFrame()
Strak_X_train['Method_1'] = train_rf_pred
Strak_X_train['Method_2'] = train_gbdt_pred
Strak_X_train['Method_3'] = train_xgb_pred

Strak_X_val = pd.DataFrame()
Strak_X_val['Method_1'] = val_rf
Strak_X_val['Method_2'] = val_gbdt
Strak_X_val['Method_3'] = val_xgb

# The test meta-features must be on the same (log) scale and come from the same
# fitted models as the train/validation meta-features. Previously the already
# exponentiated subA_* price predictions were used here.
Strak_X_test = pd.DataFrame()
Strak_X_test['Method_1'] = model_rf.predict(test)
Strak_X_test['Method_2'] = model_gbdt.predict(test)
Strak_X_test['Method_3'] = model_xgb.predict(test)

Strak_X_train

y_train

## Layer 2: LightGBM meta-model.
# fit() returns the estimator itself, so this refits model_lgb in place and
# model_lgb_Stacking is the same object.
model_lgb_Stacking = model_lgb.fit(Strak_X_train, y_train)

## Training set
train_pre_Stacking = model_lgb_Stacking.predict(Strak_X_train)
print('MAE of Stacking-LR:', mean_absolute_error(np.exp(y_train), np.exp(train_pre_Stacking)))

## Validation set
val_pre_Stacking = model_lgb_Stacking.predict(Strak_X_val)
print('MAE of Stacking-LR:', mean_absolute_error(np.exp(y_val), np.exp(val_pre_Stacking)))

## Test set
print('Predict Stacking-LR...')
# Predict in log space, then convert the OUTPUT back to the price scale
# (exp() used to be applied to the meta-model's inputs instead).
subA_Stacking = np.exp(model_lgb_Stacking.predict(Strak_X_test))

from xgboost import XGBRegressor

# Repeat of the XGBoost grid search on the baseline feature matrix X_train.
params = {
    'n_estimators': [150, 200, 250],
    'learning_rate': [0.1],
    'subsample': [0.5, 0.8],
}
model = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=3,
)

# y_train was overwritten by the later split of the cleaned data (log-price,
# different rows), so it no longer lines up with X_train. Select the target
# rows that belong to X_train from the original price series `y` instead.
model.fit(X_train, y.loc[X_train.index])

# Report the best configuration found.
print('最佳参数为：\n', model.best_params_)
print('最佳分数为：\n', model.best_score_)
print('最佳模型为：\n', model.best_estimator_)

import pandas as pd

# Read the raw training and test sets.
train_data = pd.read_csv('used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv('used_car_testB_20200421.csv', sep=' ')

# Columns treated as categorical.
cat_cols = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

# Take the target before dropping it, instead of re-reading the CSV later.
y_train = train_data['price']

# Drop the columns this pipeline does not use.
train_data = train_data.drop(['name', 'seller', 'offerType', 'creatDate', 'price'], axis=1)
test_data = test_data.drop(['name', 'seller', 'offerType', 'creatDate'], axis=1)

# Encode each categorical column with ONE code table shared by both data sets.
# Encoding train and test separately can assign different integers to the same
# category whenever the two sets do not contain exactly the same values.
n_train = len(train_data)
for cat_col in cat_cols:
    combined = pd.concat([train_data[cat_col], test_data[cat_col]], ignore_index=True)
    codes = combined.astype('category').cat.codes
    train_data[cat_col] = codes.iloc[:n_train].to_numpy()
    test_data[cat_col] = codes.iloc[n_train:].to_numpy()

# Train and predict with LightGBM.
import lightgbm as lgb

# SaleID is a row identifier, not a feature: it stays in test_data for the
# submission file but is excluded from the model inputs, as in the other
# pipelines of this notebook.
X_train = train_data.drop(columns=['SaleID'])
X_test = test_data.drop(columns=['SaleID'])

model_lgb = lgb.LGBMRegressor(num_leaves=127, objective='regression', learning_rate=0.05, n_estimators=1000, max_depth=-1)
model_lgb.fit(X_train, y_train)
subA_lgb = model_lgb.predict(X_test)

# Save the predictions.
sub = pd.DataFrame()
sub['SaleID'] = test_data['SaleID']
sub['price'] = subA_lgb
sub.to_csv('predictions.csv', index=False)

...全文

400 回复打赏收藏转发到动态举报

写回复

用AI写文章

切换为时间正序

请发表友善的回复…

发表回复

天池竞赛二手车价格预测项目源码+项目说明+数据集（高分项目）天池竞赛二手车价格预测项目源码+项目说明+数据集（高分项目）天池竞赛二手车价格预测项目源码+项目说明+数据集（高分项目）天池竞赛二手车价格预测项目...

数据可以在官网下载方案与文件

赛题源自阿里云天池平台的“零基础入门数据挖掘 – 二手车交易价格预测”，它为初学者提供了实践数据挖掘技术的机会。首先，我们要处理的数据可能包含多个特征，如车辆的品牌、型号、年份、里程、颜色等，这些特征...

阿里云天池竞赛——二手车交易价格预测（个人练习+源代码）

三、赛题数据数据链接：零基础入门数据挖掘 - 二手车交易价格预测_学习赛_天池大赛-阿里云天池的赛制 (aliyun.com)https://tianchi.aliyun.com/competition/entrance/231784/information 赛题以预测二手车的交易...

柳州职业技术学院

1,365

社区成员

581

社区内容

发帖

与我相关

我的任务

社区管理员

加入社区

近7日
近30日
至今

加载中

查看更多榜单

社区公告

各位加入社区的同学，请完善社区信息，把社区昵称改为【班级-姓名】，社区签名改为【班级-学号-姓名】的格式

如【社区昵称】20计应1班张某某（班级用简称）

【社区签名】2020级计算机应用技术1班 20201234567 张某某（班级用全称）

试试用AI创作助手写篇文章吧

+ 用AI写文章