Python实战：链家二手房房价预测模型

kaggle.com/ruiqurm/lianjia

1 分析目的

2 数据收集

Kaggle提供的数据共计26个字段、318852行。

3 数据清洗

3.1 选择子集

3.2 删除重复值

Excel删除26条重复值。

3.3 数据一致化

drawingRoom和floor两列明显存在数据错位，需要替换修正。

floor这里，为了方便后续建模，把floor分列floorType，floorNum，高-H-1，中-M-2，低-L-3，底-D-4，顶-U-5，钢混结构和混合结构和楼层无关，全改为nan。floor改为数值格式。

3.4 删除异常值

2010以前的数据很少，基本不对价格造成什么影响，予以删除。

3.5 导入数据

``````#必须导入的包
import pandas as pd
import numpy as np

# 图形
import matplotlib.pyplot as plt
import seaborn as sns

# 统计
from scipy import stats
from scipy.stats import skew, norm, boxcox_normmax
from scipy.special import boxcox1p
from subprocess import check_output

# 分类 / 预处理
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, scale, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# 模型
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor  # required by the rf model in section 5.3
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor

#导入数据集

#数据集指标检查
erhouse.columns``````

3.6 数据观察

``print('The data size is : {} '.format(erhouse.shape))``

``erhouse.head(5)``

price销售价格是需要预测的变量。

price的分布

``sns.distplot(erhouse['price'])``

``````#skewness and kurtosis
print('Skewness: %f' % erhouse['price'].skew())
print('Kurtosis: %f' % erhouse['price'].kurt())``````

3.7 异常值处理

``````corrmat = erhouse.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);``````

price和特征1-5的散点图

``````sns.set()
cols = [ 'tradeTime', 'DOM', 'followers', 'totalPrice', 'square', 'price']
sns.pairplot(erhouse[cols], size = 2.5)
plt.show();``````

price和DOM

``````var = 'DOM'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

``````erhouse.sort_values(by = 'DOM', ascending = False)[:2]
erhouse = erhouse.drop(erhouse[erhouse['DOM'] == 1677].index) ``````

``````var = 'DOM'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

price和followers

``````var = 'followers'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

price和totalPrice

``````var = 'totalPrice'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

``````erhouse.sort_values(by = 'totalPrice', ascending = False)[:2]
erhouse = erhouse.drop(erhouse[erhouse['totalPrice'] == 18130].index) ``````

``````var = 'totalPrice'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

price和square

``````var = 'square'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

``````erhouse.sort_values(by = 'square', ascending = False)[:2]
erhouse = erhouse.drop(erhouse[erhouse['square'] == 906].index)
erhouse = erhouse.drop(erhouse[erhouse['square'] ==922.7].index) ``````

``````var = 'square'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

price和特征6-11的散点图

``````sns.set()
cols = [ 'livingRoom', 'drawingRoom', 'kitchen', 'bathRoom', 'floorType', 'floorNum', 'price']
sns.pairplot(erhouse[cols], size = 2.5)
plt.show();``````

price和特征12-16的散点图

``````sns.set()
cols = [ 'buildingType', 'constructionTime', 'renovationCondition', 'buildingStructure', 'ladderRatio', 'price']
sns.pairplot(erhouse[cols], size = 2.5)
plt.show();``````

``````var = 'ladderRatio'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

``````erhouse.sort_values(by = 'ladderRatio', ascending = False)[:2]``````

``````var = 'ladderRatio'
data = pd.concat([erhouse['price'], erhouse[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,150000));``````

price和特征17-21的散点图

``````sns.set()
cols = [ 'elevator', 'fiveYearsProperty', 'subway', 'district', 'communityAverage', 'price']
sns.pairplot(erhouse[cols], size = 2.5)
plt.show();``````

``erhouse.shape``

3.8 目标变量处理

``````sns.distplot(erhouse['price'], fit=norm);
fig = plt.figure()``````

``erhouse['price'] = np.log(erhouse['price'])``

``````sns.distplot(erhouse['price'], fit=norm);
fig = plt.figure()``````

``````erhouse_labels = erhouse['price'].reset_index(drop=True)
features = erhouse.drop(['price'], axis=1)``````

``features.shape``

3.9 缺失值处理

``````features_na = (features.isnull().sum() / len(features)) * 100
features_na = features_na.drop(features_na[features_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :features_na})``````

``````#用平均值替换掉缺失值。
#DOM为上架时间，可以用平均时间28.82319154替换
features['DOM'] = features['DOM'] .fillna(28.82319154)
#buildingType为建筑类型，可以用常见的4（板楼）替换
features['buildingType'] = features['buildingType'] .fillna(4)
#floorType为所在楼层类型，可以用常见 2
features['floorType'] = features['floorType'] .fillna('2')
#drawingRoom为厅数量，可以用平均时间1替换
features['drawingRoom'] = features['drawingRoom'] .fillna(1)

#用0替换掉缺失值。
#livingRoom代表没有卧室，很迷啊，但是也不能随意增加，还是用0代替吧
features['livingRoom'] = features['livingRoom'].fillna(0)
#elevator，fiveYearsProperty，subway分别代表没有电梯，没有满五年，没有地铁，用0代替。
for col in ('elevator', 'fiveYearsProperty', 'subway'):
features[col] = features[col].fillna(0)

#在原始数据改
#communityAverage为区域均价，这个还涉及到一个不同年份价格的影响不同的问题，这里，我采取，按当年份区域均价补充的方法，
#介于代码不好写，继续在原始数据改，改为重新run
#缺失值删除``````

``````features_na = (features.isnull().sum() / len(features)) * 100
features_na = features_na.drop(features_na[features_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :features_na})``````

4 特征工程

“数据决定了机器学习的上限，而算法只是尽可能逼近这个上限”，这里的数据指的就是经过特征工程得到的数据。特征工程指的是把原始数据通过特征构建、特征处理等操作转变为模型的训练数据的过程，它的目的就是获取更好的训练数据特征，使得机器学习模型逼近这个上限。特征工程能使得模型的性能得到提升，有时甚至在简单的模型上也能取得不错的效果。

4.1 数据转换

``````numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in features.columns:
if features[i].dtype in numeric_dtypes:
numeric.append(i)``````

``````skew_features = features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print('There are {} numerical features with Skew > 0.5 :'.format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})``````

``````for i in skew_index:
features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))``````

``````skew_features = features[numeric].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
print('There are {} numerical features with Skew > 0.5 :'.format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})``````

4.2 增加特征

``````#通过加总的特征
#卧室，厨房，卫生间等全部相加
features['TotalNum'] = features['livingRoom'] +features['kitchen']+features['bathRoom']
#建筑类型，装修情况，建筑结构类型，是否满五年，是否有电梯，是否地铁沿线等全部相加
features['TotalYN'] = (features['buildingType'] + features['renovationCondition'] +features['buildingStructure']
+features['fiveYearsProperty']+features['elevator']+features['subway'])

#通过相乘的特征
#市场价=区域价格*面积
features['TotaMprice'] = features['communityAverage'] * features['square'] ``````

4.3 特征转换

``````#通过对数处理获得新的特征
def logs(res, ls):
m = res.shape[1]
for l in ls:
res = res.assign(newcol=pd.Series(np.log(1.01+res[l])).values)
res.columns.values[m] = l + '_log'
m += 1
return res

log_features = ['DOM','followers','totalPrice','square','livingRoom','kitchen',
'district','communityAverage']

features = logs(features, log_features)

log_features =  ['DOM','followers','totalPrice','square','livingRoom','kitchen',
'district','communityAverage']

features = logs(features, log_features)
#通过平方转换获得新的特征
def squares(res, ls):
m = res.shape[1]
for l in ls:
res = res.assign(newcol=pd.Series(res[l]*res[l]).values)
res.columns.values[m] = l + '_sq'
m += 1
return res

squared_features = ['DOM','followers','totalPrice','square','livingRoom','kitchen',
'district','communityAverage']

features = squares(features, squared_features)``````

``features.shape``

4.4 字符型特征的独热编码

``features.info()``

floorType为文本，转换成数值

``features['floorType'] = features['floorType'].astype(int)``

``features.info()``

4.5 特征降维

5 数据建模

5.1.1 数据准备

``erhouse_labels.shape, features.shape``

5.1.2 主成分分析（PCA）

PCA的思想是通过坐标轴转换，寻找数据分布的最优子空间，从而达到降维、去相关的目的。

``````pca_model = PCA(n_components=63)
Features= pca_model.fit_transform(features)``````

5.2 设置交叉验证并定义错误度量

1）将训练集分成5份
2）不重复地每次取其中一份做测试，用其他四份做训练，之后计算该模型在测试集上的rmse
3）将5次的rmse平均得到最后的rmse

``````# 设置交叉验证
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# 定义错误度量
def rmsle(y, y_pred):
return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model, X=Features):
rmse = np.sqrt(-cross_val_score(model, X, erhouse_labels, scoring="neg_mean_squared_error", cv=kf))
return (rmse)``````

5.3 设置模型

``````lightgbm = LGBMRegressor(objective='regression',
num_leaves=6,
learning_rate=0.01,
n_estimators=7000,
max_bin=200,
bagging_fraction=0.8,
bagging_freq=4,
bagging_seed=8,
feature_fraction=0.2,
feature_fraction_seed=8,
min_sum_hessian_in_leaf = 11,
verbose=-1,
random_state=42)

xgboost = XGBRegressor(learning_rate=0.01,
n_estimators=6000,
max_depth=4,
min_child_weight=0,
gamma=0.6,
subsample=0.7,
colsample_bytree=0.7,
objective='reg:linear',
scale_pos_weight=1,
seed=27,
reg_alpha=0.00006,
random_state=42)

rf = RandomForestRegressor(n_estimators=1200,
max_depth=15,
min_samples_split=5,
min_samples_leaf=5,
max_features=None,
oob_score=True,
random_state=42)``````

5.4 模型得分

``````scores = {}

score = cv_rmse(lightgbm)
print('lightgbm: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
scores['lgb'] = (score.mean(), score.std())``````
``````score = cv_rmse(xgboost)
print('xgboost: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
scores['xgb'] = (score.mean(), score.std())``````
``````score = cv_rmse(rf)
print('rf: {:.4f} ({:.4f})'.format(score.mean(), score.std()))
scores['rf'] = (score.mean(), score.std())``````

5.5 拟合模型

``````print('lightgbm')
lgb_model_full_data = lightgbm.fit(Features,erhouse_labels)``````
``````print('xgboost')
xgb_model_full_data = xgboost.fit(Features,erhouse_labels)``````
``````print('RandomForest')
rf_model_full_data = rf.fit(Features,erhouse_labels)``````

5.6 加权模型结果得到预测值

``````def sumed_predictions(Features):
return ((0.3 * xgb_model_full_data.predict(Features)) +
(0.4 * lgb_model_full_data.predict(Features)) +
(0.3 * rf_model_full_data.predict(Features))))``````

``````sumed_score = rmsle(erhouse_labels, sumed_predictions(Features))
scores['sumed'] = (sumed_score, 0)
print('RMSLE score on data:')
print(sumed_score)``````

5.7 确定性能最佳的模型

``````sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

ax = sns.pointplot(x=list(scores.keys()), y=[score for score, _ in scores.values()], markers=['o'], linestyles=['-'])
for i, score in enumerate(scores.values()):
ax.text(i, score[0] + 0.002, '{:.6f}'.format(score[0]), horizontalalignment='left', size='large', color='black', weight='semibold')

plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)

plt.title('Scores of Models', size=20)

plt.show()``````

6 封装模型

``model = sumed_predictions(Features)``

#模型封装

``````from sklearn.externals import joblib
joblib.dump(model,'model.pkl')

7 调用模型