# Increase degree of linear

# regression polynomial order

for d in range(1, degrees+1):

print("Degree: %s" % d)

# Create the model, split the sets and fit it

polynomial_features = PolynomialFeatures(

degree=d, include_bias=False


linear_regression = LinearRegression()

model = Pipeline([

("polynomial_features", polynomial_features),

("linear_regression", linear_regression)


X_train, X_test, y_train, y_test = train_test_split(

X, y, test_size=0.5, random_state=i


model.fit(X_train, y_train)

# Calculate the test MSE and append to the

# dictionary of all test curves

y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)

sample_dict["seed_%s" % i].append(test_mse)

# Convert these lists into numpy

# arrays to perform averaging

sample_dict["seed_%s" % i] = np.array(

sample_dict["seed_%s" % i]




# Create the "average test MSE" series by averaging the

# test MSE for each degree of the linear regression model,

# across all random samples

sample_dict["avg"] = np.zeros(degrees)

for i in range(1, random_seeds+1):

sample_dict["avg"] += sample_dict["seed_%s" % i]

sample_dict["avg"] /= float(random_seeds)

return sample_dict

We can use Matplotlib to plot the data returned in this dictionary. To do so, we import pylab and then create a function

that plots the test error curves:

