# ML With R (2): Regularized Linear Regression

Code in R for Coursera-ML and CMPUT466/551 in University of Alberta

This post refers to programming exercise 5 in Coursera-ML. All the code can be found in this repo.

setwd("~/Coding/R/CMPUT466/mlclass-ex5")
library(ggplot2)
library(R.matlab)
library(devtools)
source_url("https://raw.githubusercontent.com/ggrothendieck/gsubfn/master/R/list.R")

### Regularized Linear Regression

#### Visualizing the dataset

Use a scatter plot to visualize the data.

# Load the dataset from the MATLAB file. It contains a training set (X, y),
# a test set (Xtest, ytest), and a cross-validation set (Xval, yval).
data <- readMat("ex5data1.mat")
X <- data$X
y <- data$y
Xtest <- data$Xtest
ytest <- data$ytest
Xval <- data$Xval
yval <- data$yval

# Scatter plot of the training data.
df <- data.frame(X = X, y = y)
ggplot(df, aes(X, y, col = "Training data")) + geom_point(shape = I(4), size = I(3)) +
  labs(x = "Change in water level (x)", y = "Water flowing out of the dam (y)") +
  scale_color_manual(guide = FALSE, values = c("Red"))

#### Regularized linear regression cost function

Compute the cost function with a regularized term:

$J(\theta)=\frac{1}{2m}\left(\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})^2\right)+\frac{\lambda}{2m}\left(\sum_{j=1}^n\theta_j^2\right)$

where $$\lambda$$ is a regularization parameter which controls the degree of regularization.

# Regularized linear-regression cost:
#   J(theta) = (1/(2m)) * sum((X %*% theta - y)^2) + (lambda/(2m)) * sum(theta_j^2)
# where the regularization sum skips the intercept term (theta[1]).
#
# theta  - parameter vector; first entry is the intercept
# X      - m x n design matrix (first column assumed to be all ones)
# y      - response vector of length m
# lambda - regularization strength (0 disables the penalty)
#
# Returns the cost as a 1x1 matrix (crossprod keeps the original shape).
computeCost <- function(theta, X, y, lambda) {
  m <- length(y)
  # theta[-1] drops the intercept from the penalty. Unlike theta[2:n],
  # it is also correct when theta has a single element (2:1 would index
  # backwards and pick up an NA).
  sum((X %*% theta - y)^2) / (2 * m) + lambda / (2 * m) * crossprod(theta[-1])
}

# Sanity check: evaluate the cost at theta = (1, 1) with lambda = 1.
# The expected value from the Coursera exercise is about 303.993.
myTheta <- c(1., 1.)
computeCost(myTheta, cbind(1, X), y, 1)
##          [,1]
## [1,] 303.9932

Compute gradient of the cost function with a regularized term:

# Gradient of the regularized linear-regression cost with respect to theta.
# The intercept (first parameter) is not regularized, so its penalty
# contribution is zeroed out before being added.
#
# Returns an n x 1 matrix of partial derivatives.
computeGrad <- function(theta, X, y, lambda) {
  m <- length(y)
  penalty <- theta
  penalty[1] <- 0  # do not regularize the intercept term
  1. / m * t(X) %*% (X %*% theta - y) + lambda / m * penalty
}

# Sanity check: evaluate the gradient at theta = (1, 1) with lambda = 1.
# Expected values from the Coursera exercise: about (-15.30, 598.25).
computeGrad(myTheta, cbind(1, X), y, 1)
##           [,1]
## [1,] -15.30302
## [2,] 598.25074

#### Fitting linear regression

Use the optim function to fit the linear regression and plot the fitted line.

# Fit regularized linear regression by minimizing computeCost with conjugate
# gradient (stats::optim, method = "CG"), starting from theta = 0.
# Extra arguments (X, y, lambda) are forwarded by optim to the cost and
# gradient functions. Returns the fitted parameter vector.
trainLinearReg <- function(X, y, lambda, maxit) {
  n <- dim(X)[2]
  theta <- matrix(0, n, 1)
  res <- optim(theta, computeCost, computeGrad, X, y, lambda,
               method = "CG", control = list(maxit = maxit))
  res$par
}

# Fit with no regularization and compute predictions on the training inputs.
# (The original source had these statements fused onto one line, which is not
# valid as a plain R script.)
lambda <- 0
theta <- trainLinearReg(cbind(1, X), y, lambda, 10000)
df$y.pred <- cbind(1, X) %*% theta
# Plot the training data with the fitted regression line overlaid.
ggplot(df, aes(X, y, col = "Training data")) + geom_point(shape = I(4), size = I(3)) +
  labs(x = "Change in water level (x)", y = "Water flowing out of the dam (y)", title = "Linear Regression Fit") +
  geom_line(aes(X, y.pred, col = "Regularized linear regression")) +
  scale_color_manual(guide = FALSE, values = c("Blue", "Red"))

### Bias-variance

#### Learning curves

Learning curves, which plot training and validation error as a function of training-set size, are useful for debugging learning algorithms.

# Compute training and cross-validation error as a function of training-set
# size. For each i, fit on the first i training examples (with the given
# lambda), then record the *unregularized* cost (lambda = 0) on that subset
# and on the full validation set.
learningCurve <- function(X, y, X_val, y_val, lambda, maxit) {
  m <- length(y)
  error.train <- matrix(0, m, 1)
  error.valid <- matrix(0, m, 1)
  for (i in seq_len(m)) {
    # drop = FALSE keeps X a matrix even when i == 1; the original evaluated
    # the cost on X[1:i,] without it, collapsing a single row to a vector.
    X_sub <- X[1:i, , drop = FALSE]
    theta <- trainLinearReg(X_sub, y[1:i], lambda, maxit)
    error.train[i] <- computeCost(theta, X_sub, y[1:i], 0)
    error.valid[i] <- computeCost(theta, X_val, y_val, 0)
  }
  list(error.train = error.train, error.valid = error.valid)
}
# The gsubfn-style list[...] destructuring (sourced from a raw GitHub URL
# above) is fragile; plain base-R extraction does the same job.
lc <- learningCurve(cbind(1, X), y, cbind(1, Xval), yval, lambda, 10000)
error.train <- lc$error.train
error.valid <- lc$error.valid

m <- length(y)
df_lc <- data.frame(x = seq_len(m), error.train = error.train, error.valid = error.valid)
ggplot(df_lc, aes(x)) + geom_line(aes(y = error.train, col = "training error")) +
  geom_line(aes(y = error.valid, col = "validation error")) +
  labs(x = "Number of training examples", y = "Error", title = "Learning curve for linear regression") +
  scale_color_discrete(name = "")

### Polynomial regression

For polynomial regression, our hypothesis has the form:

$h_\theta(x) = \theta_0+\theta_1 x + \theta_2 x^2 + \dots + \theta_p x^p$

#### Learning Polynomial Regression

Even though we have polynomial terms in our feature vector, we are still solving a linear regression optimization problem. The polynomial terms have simply turned into features that we can use for linear regression. As a result, we can use the function written before.

# Expand a column of raw values into a degree-p polynomial feature matrix:
# column i holds X^i, for i = 1..p.
polyFeatures <- function(X, p) {
  X_poly <- matrix(0, length(X), p)
  for (degree in seq_len(p)) {
    X_poly[, degree] <- X^degree
  }
  X_poly
}

p <- 8

# Build degree-p polynomial features for the training set and standardize
# them (zero mean, unit variance). Keep the training-set statistics so the
# identical transform can be applied to the other splits.
X_poly <- scale(polyFeatures(X, p))
mu <- attr(X_poly, "scaled:center")
sigma <- attr(X_poly, "scaled:scale")
X_poly <- cbind(1, X_poly)

# Normalize the test and validation features with the *training* mu/sigma
# (center, then divide by scale), and prepend the intercept column.
X_poly_test <- cbind(1, sweep(sweep(polyFeatures(Xtest, p), 2, mu), 2, sigma, "/"))
X_poly_val <- cbind(1, sweep(sweep(polyFeatures(Xval, p), 2, mu), 2, sigma, "/"))

# Fit the polynomial model without regularization. (The original also
# computed n = dim(X_poly)[2], which was never used.)
lambda <- 0
theta <- trainLinearReg(X_poly, y, lambda, 200)

# Evaluate the fitted polynomial on a dense grid extended slightly beyond the
# observed range; the grid features must be mapped with the training-set
# mu/sigma so they live in the same normalized feature space.
idx <- seq(min(X) - 15, max(X) + 15, length.out = 100)
X_poly_fit <- polyFeatures(idx, p)
X_poly_fit <- sweep(X_poly_fit, 2, mu)
X_poly_fit <- sweep(X_poly_fit, 2, sigma, "/")
X_poly_fit <- cbind(1, X_poly_fit)
df_fit <- data.frame(x = idx, y = X_poly_fit %*% theta)
ggplot(df, aes(X, y, col = "Training data")) + geom_point(shape = I(4), size = I(3)) +
  labs(x = "Change in water level (x)", y = "Water flowing out of the dam (y)",
       title = sprintf("Polynomial Regression Fit (lambda = %.1f)", lambda)) +
  geom_line(aes(x, y, col = "Regularized polynomial regression"), df_fit, linetype = "dashed") +
  scale_color_manual(guide = FALSE, values = c("Blue", "Red"))

Also, plot the learning curve for polynomial regression.

# Base-R extraction instead of the remote-sourced gsubfn list[...] helper.
lc_poly <- learningCurve(X_poly, y, X_poly_val, yval, lambda, 200)
df_lc <- data.frame(x = seq_len(m), error.train = lc_poly$error.train, error.valid = lc_poly$error.valid)
ggplot(df_lc, aes(x)) + geom_line(aes(y = error.train, col = "training error")) +
  geom_line(aes(y = error.valid, col = "validation error")) +
  labs(x = "Number of training examples", y = "Error", title = sprintf("Polynomial Regression Learning Curve (lambda = %.1f)", lambda)) +
  scale_color_discrete(name = "")

#### Select $$\lambda$$ using a cross validation set

The value of $$\lambda$$ can significantly affect the results of regularized polynomial regression on the training and validation set. We use a validation set to evaluate how good each $$\lambda$$ value is. After selecting the best $$\lambda$$ value using the validation set, we can then evaluate the model on the test set to estimate how well the model will perform on actual unseen data.

# Sweep a grid of candidate lambda values. For each one, train on the full
# training set with that lambda, then record the unregularized (lambda = 0)
# cost on both the training set and the validation set.
validationCurve <- function(X, y, Xval, yval, maxit) {
  lambda_vec <- c(0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10)
  error.train <- matrix(0, length(lambda_vec), 1)
  error.valid <- matrix(0, length(lambda_vec), 1)
  for (i in seq_along(lambda_vec)) {
    theta <- trainLinearReg(X, y, lambda_vec[i], maxit)
    error.train[i] <- computeCost(theta, X, y, 0)
    error.valid[i] <- computeCost(theta, Xval, yval, 0)
  }
  list(lambda_vec = lambda_vec, error.train = error.train, error.valid = error.valid)
}
# Base-R extraction instead of the remote-sourced gsubfn list[...] helper.
vc <- validationCurve(X_poly, y, X_poly_val, yval, 200)

df_cv <- data.frame(lambda_vec = vc$lambda_vec, error.train = vc$error.train, error.valid = vc$error.valid)
ggplot(df_cv, aes(lambda_vec)) + geom_line(aes(y = error.train, col = "training error")) +
  geom_line(aes(y = error.valid, col = "validation error")) +
  labs(x = "Lambda", y = "Error") +
  scale_color_discrete(name = "")