# "(译文)线性和多项式回归中的学习曲线"

## Learning Curves in Linear & Polynomial Regression

Posted by xuepro on May 8, 2018

译注：我没有完全照着原文翻译，省略了一些字句，使内容尽量简洁。

## 学习曲线介绍 Introduction to Learning Curves

• If a model is balanced, both errors converge to small values as the training sample size increases.

• If a model has high bias, it ends up underfitting the data. As a result, both errors fail to decrease no matter how many examples there are in the training set.

• If a model has high variance, it ends up overfitting the training data. In that case, increasing the training sample size decreases the training error but it fails to decrease the validation error.

## Problem Definition and Dataset

import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt # we'll need this later
import scipy.io as sio

# Load the exercise data (MATLAB .mat format).
# NOTE(review): the loadmat call was missing from the original snippet;
# "ex5data1.mat" is the file used in the Andrew Ng ML course exercise this
# post follows — confirm the actual path/filename.
dataset = sio.loadmat("ex5data1.mat")

x_train = dataset["X"]
x_val = dataset["Xval"]
x_test = dataset["Xtest"]

# squeeze the target variables into one dimensional arrays
y_train = dataset["y"].squeeze()
y_val = dataset["yval"].squeeze()
y_test = dataset["ytest"].squeeze()


• The training sample consists of x_train and y_train.

• The validation sample consists of x_val and y_val.

• The test sample consists of x_test and y_test.

# Scatter plot of the raw training sample.
fig, ax = plt.subplots()
ax.scatter(x_train, y_train, marker="x", s=40, c='red')
ax.set_xlabel("change in water level", fontsize=14)
ax.set_ylabel("water flowing out of the dam", fontsize=14)
ax.set_title("Training sample", fontsize=16)
plt.show()


## 游戏计划 The Game Plan

### Linear Regression

fmin_cg需要2个函数参数：一个函数计算假设的代价cost，另一个函数计算这个cost函数关于未知参数的梯度。我们可以利用前一篇博文的代码

• Cost函数可以直接拿过来

def cost(theta, X, y):
    """Mean squared-error cost J(theta) = sum((X@theta - y)^2) / (2m)."""
    residuals = X @ theta - y
    return np.sum(residuals ** 2) / (2 * len(y))

def cost_gradient(theta, X, y):
    """Gradient of the unregularized least-squares cost w.r.t. theta.

    NOTE(review): the `def` header was lost in the original post's
    formatting; the name `cost_gradient` is confirmed by the fmin_cg call
    in train_linear_regression.
    """
    predictions = X @ theta
    return X.transpose() @ (predictions - y) / len(y)

def train_linear_regression(X, y):
    """Fit theta by conjugate-gradient minimization of the squared-error cost."""
    initial_theta = np.zeros(X.shape[1])  # start from the zero vector
    return opt.fmin_cg(cost, initial_theta, fprime=cost_gradient,
                       args=(X, y), disp=False)


def insert_ones(x):
    """Prepend a column of ones (the intercept term) to the 2-D array x."""
    intercept = np.ones((x.shape[0], 1))
    return np.hstack((intercept, x))


# Fit a straight line to the training sample and draw it on the scatter plot.
X_train = insert_ones(x_train)
theta = train_linear_regression(X_train, y_train)
hypothesis = X_train @ theta
# Column 0 of X_train is the intercept; column 1 is the raw feature.
ax.plot(X_train[:, 1], hypothesis, linewidth=2)
# Bare `fig` re-displays the figure in a notebook cell.
fig


## 线性回归的学习曲线 Learning Curves for Linear Regression

def learning_curves(X_train, y_train, X_val, y_val):
    """Plot training and validation error as a function of training-set size.

    For each subset size 2..m, a model is trained on the first examples only;
    its error is measured on that subset and on the full validation sample.
    """
    m = len(y_train)
    train_err = np.zeros(m)
    val_err = np.zeros(m)
    # Index 0 stays unused: curves start at two training examples.
    for i in range(1, m):
        X_sub = X_train[0:i + 1, :]
        y_sub = y_train[0:i + 1]
        theta = train_linear_regression(X_sub, y_sub)
        train_err[i] = cost(theta, X_sub, y_sub)
        val_err[i] = cost(theta, X_val, y_val)
    sizes = range(2, m + 1)
    plt.plot(sizes, train_err[1:], c="r", linewidth=2)
    plt.plot(sizes, val_err[1:], c="b", linewidth=2)
    plt.xlabel("number of training examples", fontsize=14)
    plt.ylabel("error", fontsize=14)
    plt.legend(["training", "validation"], loc="best")
    plt.axis([2, m, 0, 100])
    plt.grid()


# Add the intercept column to the validation features, then plot the curves.
X_val = insert_ones(x_val)
plt.title("Learning Curves for Linear Regression", fontsize=16)
learning_curves(X_train, y_train, X_val, y_val)


## Polynomial Regression

### Feature Mapping

def poly_features(x, degree):
    """Map each scalar sample in x to its powers [x, x^2, ..., x^degree].

    Accepts a 1-D array or an (n, 1) column; returns an (n, degree) float
    array.  (The original used an explicit loop with a stray semicolon;
    broadcasting does the same in one vectorized step.)
    """
    exponents = np.arange(1, degree + 1)
    # (n, 1) base broadcast against (degree,) exponents -> (n, degree).
    return np.asarray(x, dtype=float).reshape(-1, 1) ** exponents


# Expand each 1-D sample into polynomial features of degree 8.
x_train_poly = poly_features(x_train, 8)
x_val_poly = poly_features(x_val, 8)
x_test_poly = poly_features(x_test, 8)


### 特征规范化 Feature Normalization

# Inspect the raw (unnormalized) polynomial features — note the huge value ranges.
print(x_train_poly[:4, :])

[[ -1.59367581e+01   2.53980260e+02  -4.04762197e+03   6.45059724e+04
-1.02801608e+06   1.63832436e+07  -2.61095791e+08   4.16102047e+09]
[ -2.91529792e+01   8.49896197e+02  -2.47770062e+04   7.22323546e+05
-2.10578833e+07   6.13900035e+08  -1.78970150e+10   5.21751305e+11]
[  3.61895486e+01   1.30968343e+03   4.73968522e+04   1.71527069e+06
6.20748719e+07   2.24646160e+09   8.12984311e+10   2.94215353e+12]
[  3.74921873e+01   1.40566411e+03   5.27014222e+04   1.97589159e+06
7.40804977e+07   2.77743990e+09   1.04132297e+11   3.90414759e+12]]


• 每一列特征减去这一列的平均值
• 将每一列的值除以该列的标准差（standard deviation），使各特征的取值范围大致相同

# Standardization statistics are computed on the TRAINING set only.
train_means = x_train_poly.mean(axis=0)
train_stdevs = np.std(x_train_poly, axis=0, ddof=1)  # ddof=1: sample std

# Apply the same training-set statistics to all three samples so they live
# in the same feature space.
x_train_poly = (x_train_poly - train_means) / train_stdevs
x_val_poly = (x_val_poly - train_means) / train_stdevs
x_test_poly = (x_test_poly - train_means) / train_stdevs

# Prepend the intercept column after normalization.
X_train_poly = insert_ones(x_train_poly)
X_val_poly = insert_ones(x_val_poly)
X_test_poly = insert_ones(x_test_poly)


def plot_fit(min_x, max_x, means, stdevs, theta, degree):
    """Draw the learned polynomial hypothesis over [min_x - 5, max_x + 5].

    The grid is mapped through the same feature pipeline as the training
    data: polynomial expansion, normalization with the training statistics,
    then the intercept column.
    """
    grid = np.linspace(min_x - 5, max_x + 5, 1000)
    features = (poly_features(grid, degree) - means) / stdevs
    plt.plot(grid, insert_ones(features) @ theta, linewidth=2)
    plt.show()

# Fit the degree-8 polynomial model and visualize it over the training scatter.
theta = train_linear_regression(X_train_poly, y_train)
plt.scatter(x_train, y_train, marker="x", s=40, c='red')
plt.xlabel("change in water level", fontsize=14)
plt.ylabel("water flowing out of the dam", fontsize=14)
plt.title("Polynomial Fit", fontsize=16)
plot_fit(min(x_train), max(x_train), train_means, train_stdevs, theta, 8)


# Learning curves for the (still unregularized) polynomial model.
plt.title("Learning Curves for Polynomial Regression", fontsize=16)
learning_curves(X_train_poly, y_train, X_val_poly, y_val)


## Regularized Polynomial Regression正则化多项式回归

Regularization（正则化）允许我们在训练时对过大的 $$\theta$$ 施加更大的惩罚，从而避免过拟合。

$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}(\theta_j^2)$

$$\frac{\partial J(\theta)}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})x_0^{(i)}$$ for $$j=0$$

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)} +\frac{\lambda}{m}\theta_j$$ for $$j \neq 0$$

def cost(theta, X, y, lamb=0):
    """Regularized least-squares cost; the intercept theta[0] is not penalized."""
    errors = X @ theta - y
    penalty = lamb * np.sum(theta[1:] ** 2)
    return (np.sum(errors ** 2) + penalty) / (2 * len(y))

def cost_gradient(theta, X, y, lamb=0):
    """Gradient of the regularized least-squares cost w.r.t. theta.

    The intercept component carries no regularization term.
    NOTE(review): the `def` header was lost in the original post's
    formatting; the signature mirrors cost(theta, X, y, lamb=0) as required
    by the fmin_cg call in train_linear_regression.
    """
    predictions = X @ theta
    gradient = X.transpose() @ (predictions - y)
    regularization = lamb * theta
    regularization[0] = 0  # don't penalize the intercept term
    return (gradient + regularization) / len(y)

train_linear_regression 和 learning_curves也要做相应的修改

def train_linear_regression(X, y, lamb=0):
    """Conjugate-gradient fit of the regularized least-squares model."""
    initial_theta = np.zeros(X.shape[1])  # start from the zero vector
    return opt.fmin_cg(cost, initial_theta, fprime=cost_gradient,
                       args=(X, y, lamb), disp=False)

def learning_curves(X_train, y_train, X_val, y_val, lamb=0):
    """Plot training/validation error vs. training-set size, with training
    regularized by lamb.

    Both reported errors are computed WITHOUT the regularization term
    (cost is called with its default lamb=0), as is conventional for
    learning curves.
    """
    m = len(y_train)
    train_err = np.zeros(m)
    val_err = np.zeros(m)
    # Index 0 stays unused: curves start at two training examples.
    for i in range(1, m):
        X_sub = X_train[0:i + 1, :]
        y_sub = y_train[0:i + 1]
        theta = train_linear_regression(X_sub, y_sub, lamb)
        train_err[i] = cost(theta, X_sub, y_sub)
        val_err[i] = cost(theta, X_val, y_val)
    sizes = range(2, m + 1)
    plt.plot(sizes, train_err[1:], c="r", linewidth=2)
    plt.plot(sizes, val_err[1:], c="b", linewidth=2)
    plt.xlabel("number of training examples", fontsize=14)
    plt.ylabel("error", fontsize=14)
    plt.legend(["Training", "Validation"], loc="best")
    plt.axis([2, m, 0, 100])
    plt.grid()


# Learning curves for the polynomial model trained with lambda = 1.
plt.title("Learning Curves for Regularized Polynomial Regression", fontsize=16)
learning_curves(X_train_poly, y_train, X_val_poly, y_val, 1)


## Choosing the Optimal Regularization Parameter

# Sweep candidate regularization strengths and compare validation errors.
lambda_values = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
val_err = []
for lamb in lambda_values:
    theta = train_linear_regression(X_train_poly, y_train, lamb)
    # Validation error is measured without the regularization term.
    val_err.append(cost(theta, X_val_poly, y_val))
plt.plot(lambda_values, val_err, c="b", linewidth=2)
# x-limit is the largest lambda tried.  The original hard-coded
# len(lambda_values), which equals max(lambda_values) only by coincidence.
plt.axis([0, max(lambda_values), 0, val_err[-1] + 1])
plt.grid()
plt.xlabel("lambda", fontsize=14)
plt.ylabel("error", fontsize=14)
plt.title("Validation Curve", fontsize=16)
plt.show()


## 评估测试误差Evaluating Test Errors

# Compare generalization error of the three models on the held-out test set.
X_test = insert_ones(x_test)
theta = train_linear_regression(X_train, y_train)
test_error = cost(theta, X_test, y_test)
print("Test Error =", test_error, "| Linear Regression")

theta = train_linear_regression(X_train_poly, y_train)
test_error = cost(theta, X_test_poly, y_test)
print("Test Error =", test_error, "| Polynomial Regression")

# lambda = 3 is chosen based on the validation-curve sweep.
theta = train_linear_regression(X_train_poly, y_train, 3)
test_error = cost(theta, X_test_poly, y_test)
print("Test Error =", test_error, "| Regularized Polynomial Regression (at lambda = 3)")

Test Error = 32.5057492449 | Linear Regression
Test Error = 16.9349317906 | Polynomial Regression
Test Error = 3.85988781599 | Regularized Polynomial Regression (at lambda = 3)