### 手把手教你用Python进行回归（附代码、学习资料）

THU数据派 2018-04-14 16:34:56

https://www.analyticsvidhya.com/blog/2015/08/comprehensive-guide-regression/

• 了解数据
• 简单回顾线性回归
• 多项式回归：对线性回归的改进
• 理解样条回归及其实现
• 分段阶梯函数
• 基函数
• 分段多项式
• 约束和样条
• 三次样条和自然三次样条
• 确定节点的数量和位置
• 比较样条回归和多项式回归

# Import the required packages.
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# %matplotlib inline
# NOTE: the line above is an IPython/Jupyter magic, not valid plain Python;
# it is kept as a comment so this script also runs outside a notebook.

# Read the data (the "Wage" dataset from ISLR: age/wage observations).
data = pd.read_csv("Wage.csv")
data.head()

data_x = data['age']
data_y = data['wage']

# Split the data into training and validation sets.
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(
    data_x, data_y, test_size=0.33, random_state=1)

# Visualize the relationship between age and wage.
# (matplotlib.pyplot was already imported above as plt; the re-import is
# harmless but redundant and has been dropped.)
plt.scatter(train_x, train_y, facecolor='None', edgecolor='k', alpha=0.3)
plt.show()

from sklearn.linear_model import LinearRegression

# Fit a simple linear regression model.
# FIX: pandas removed Series.reshape (deprecated 0.19, removed 0.25);
# reshape the underlying NumPy array instead.
x = train_x.values.reshape(-1, 1)
model = LinearRegression()
model.fit(x, train_y)
print(model.coef_)       # -> array([0.72190831])
print(model.intercept_)  # -> 80.65287740759283

# Predict on the validation set.
valid_x = valid_x.values.reshape(-1, 1)
pred = model.predict(valid_x)

# Visualization: use 70 evenly-spaced points between the min and max of
# valid_x for a smooth prediction line.
xp = np.linspace(valid_x.min(), valid_x.max(), 70)
xp = xp.reshape(-1, 1)
pred_plot = model.predict(xp)

plt.scatter(valid_x, valid_y, facecolor='None', edgecolor='k', alpha=0.3)
plt.plot(xp, pred_plot)
plt.show()

# Evaluate the linear fit with root-mean-squared error on the validation set.
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(valid_y, pred))
print(rms)  # -> 40.436

# Fit polynomial-regression weights with degree=2.
weights = np.polyfit(train_x, train_y, 2)
print(weights)  # -> array([ -0.05194765, 5.22868974, -10.03406116])

# Build a callable model from the fitted coefficients.
model = np.poly1d(weights)

# Predict on the validation set.
pred = model(valid_x)

# Plot using only 70 evenly-spaced points for a smooth curve.
xp = np.linspace(valid_x.min(), valid_x.max(), 70)
pred_plot = model(xp)
plt.scatter(valid_x, valid_y, facecolor='None', edgecolor='k', alpha=0.3)
plt.plot(xp, pred_plot)
plt.show()

• 分段阶梯函数

# Bin the training ages into 4 equal-width intervals (piecewise-constant
# step function).
df_cut, bins = pd.cut(train_x, 4, retbins=True, right=True)
df_cut.value_counts(sort=False)
# -> (17.938, 33.5]    504
#    (33.5, 49.0]      941
#    (49.0, 64.5]      511
#    (64.5, 80.0]       54
#    Name: age, dtype: int64

df_steps = pd.concat([train_x, df_cut, train_y],
                     keys=['age', 'age_cuts', 'wage'], axis=1)

# One-hot encode the age bins as dummy variables.
df_steps_dummies = pd.get_dummies(df_cut)
df_steps_dummies.head()

df_steps_dummies.columns = ['17.938-33.5', '33.5-49', '49-64.5', '64.5-80']

# Fit a generalized linear model on the binned (step-function) features.
fit3 = sm.GLM(df_steps.wage, df_steps_dummies).fit()

# Map validation ages into the same 4 bins.
bin_mapping = np.digitize(valid_x, bins)
X_valid = pd.get_dummies(bin_mapping)
# Drop dummy column 5: np.digitize assigns values beyond the last bin edge
# to an extra overflow bucket that the model was never trained on.
# (The original comment called this "removing outliers".)
X_valid = pd.get_dummies(bin_mapping).drop([5], axis=1)

# Predict and compute RMSE on the validation set.
pred2 = fit3.predict(X_valid)
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(valid_y, pred2))
print(rms)  # -> 39.9

# Plot using only 70 evenly-spaced points.
xp = np.linspace(valid_x.min(), valid_x.max() - 1, 70)
bin_mapping = np.digitize(xp, bins)
X_valid_2 = pd.get_dummies(bin_mapping)
pred2 = fit3.predict(X_valid_2)

# Visualize the piecewise-constant fit.
fig, (ax1) = plt.subplots(1, 1, figsize=(12, 5))
fig.suptitle('Piecewise Constant', fontsize=14)
ax1.scatter(train_x, train_y, facecolor='None', edgecolor='k', alpha=0.3)
ax1.plot(xp, pred2, c='b')
ax1.set_xlabel('age')
ax1.set_ylabel('wage')
plt.show()

• 基函数

• 分段多项式

• 约束和样条

• 三次样条和自然三次样条

from patsy import dmatrix
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Build a cubic spline basis with 3 knots (25, 40, 60).
transformed_x = dmatrix(
    "bs(train, knots=(25,40,60), degree=3, include_intercept=False)",
    {"train": train_x}, return_type='dataframe')
# Fit a GLM on the spline basis.
fit1 = sm.GLM(train_y, transformed_x).fit()

# Build a cubic spline basis with 4 knots (25, 40, 50, 65).
transformed_x2 = dmatrix(
    "bs(train, knots=(25,40,50,65),degree =3, include_intercept=False)",
    {"train": train_x}, return_type='dataframe')
fit2 = sm.GLM(train_y, transformed_x2).fit()

# Predict with both spline models on the validation set.
# NOTE: the first design matrix omits degree=3 explicitly; patsy's bs()
# defaults to degree 3, so it matches the training basis.
pred1 = fit1.predict(dmatrix(
    "bs(valid, knots=(25,40,60), include_intercept=False)",
    {"valid": valid_x}, return_type='dataframe'))
pred2 = fit2.predict(dmatrix(
    "bs(valid, knots=(25,40,50,65),degree =3, include_intercept=False)",
    {"valid": valid_x}, return_type='dataframe'))

# Compute RMSE for both fits.
rms1 = sqrt(mean_squared_error(valid_y, pred1))
print(rms1)  # -> 39.4
rms2 = sqrt(mean_squared_error(valid_y, pred2))
print(rms2)  # -> 39.3

# Plot using 70 evenly-spaced points.
xp = np.linspace(valid_x.min(), valid_x.max(), 70)
pred1 = fit1.predict(dmatrix(
    "bs(xp, knots=(25,40,60), include_intercept=False)",
    {"xp": xp}, return_type='dataframe'))
pred2 = fit2.predict(dmatrix(
    "bs(xp, knots=(25,40,50,65),degree =3, include_intercept=False)",
    {"xp": xp}, return_type='dataframe'))

# Plot both spline curves over the scatter of the full dataset.
plt.scatter(data.age, data.wage, facecolor='None', edgecolor='k', alpha=0.1)
plt.plot(xp, pred1, label='Specifying degree =3 with 3 knots')
plt.plot(xp, pred2, color='r', label='Specifying degree =3 with 4 knots')
plt.legend()
plt.xlim(15, 85)
plt.ylim(0, 350)
plt.xlabel('age')
plt.ylabel('wage')
plt.show()

# Build a natural cubic spline basis (patsy's cr() with 3 degrees of freedom;
# natural splines are linear beyond the boundary knots).
transformed_x3 = dmatrix("cr(train,df = 3)", {"train": train_x},
                         return_type='dataframe')
fit3 = sm.GLM(train_y, transformed_x3).fit()

# Predict on the validation set.
pred3 = fit3.predict(dmatrix("cr(valid, df=3)", {"valid": valid_x},
                             return_type='dataframe'))

# Compute the RMSE.
rms = sqrt(mean_squared_error(valid_y, pred3))
print(rms)  # -> 39.44

# Plot using 70 evenly-spaced points.
xp = np.linspace(valid_x.min(), valid_x.max(), 70)
pred3 = fit3.predict(dmatrix("cr(xp, df=3)", {"xp": xp},
                             return_type='dataframe'))

# Plot the natural-spline curve over the full dataset.
plt.scatter(data.age, data.wage, facecolor='None', edgecolor='k', alpha=0.1)
plt.plot(xp, pred3, color='g', label='Natural spline')
plt.legend()
plt.xlim(15, 85)
plt.ylim(0, 350)
plt.xlabel('age')
plt.ylabel('wage')
plt.show()

• 如何确定节点的数量和位置

• 取走一部分数据
• 选择一定数量的节点使样条能拟合剩下的这些数据
• 再用样条去预测之前取走的那部分数据

• 对样条回归和多项式回归进行比较

0