# ML With R (3): Logistic Regression

R code for Coursera-ML and CMPUT466/551 at the University of Alberta.

This post refers to programming exercise 2 in Coursera-ML. All the code can be found in this repo.

setwd("~/Coding/R/CMPUT466/mlclass-ex2")
library(ggplot2)
library(R.matlab)
library(devtools)
source_url("https://raw.githubusercontent.com/ggrothendieck/gsubfn/master/R/list.R")

### Logistic Regression

#### Visualizing the data

Use a scatter plot to visualize the data.

```r
data <- read.csv("ex2data1.txt", header=FALSE)
X <- as.matrix(data[,c("V1", "V2")])
y <- as.matrix(data$V3)
ggplot(data, aes(x=V1, y=V2, col=factor(V3))) +
  geom_point(shape=I(3), size=I(3)) +
  labs(x="Exam1 score", y="Exam2 score") +
  scale_color_manual(name="", labels=c("Not admitted", "Admitted"),
                     values=c("blue", "red"))
```

#### Cost function and gradient

The cost function in logistic regression is

$J(\theta)=\frac1m\sum_{i=1}^m[-y^{(i)}\log(h_\theta(x^{(i)}))-(1-y^{(i)})\log(1-h_\theta(x^{(i)}))],$

and the gradient of the cost is a vector of the same length as $$\theta$$ where the $$j^{th}$$ element (for $$j=0,1,\dots,n$$) is defined as follows:

$\frac{\partial J(\theta)}{\partial \theta_j}=\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}$

```r
costFunction <- function(theta, X, y) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))                        # sigmoid hypothesis
  J <- 1./m * (-t(y) %*% log(h) - t(1-y) %*% log(1-h))
  grad <- 1./m * t(X) %*% (h-y)
  list(J, grad)
}
m <- dim(X)[1]
n <- dim(X)[2]
X <- cbind(1, X)                                       # add intercept column
initial_theta <- matrix(0, n+1, 1)
list[cost, grad] <- costFunction(initial_theta, X, y)
cost
##           [,1]
## [1,] 0.6931472
```

#### Learning parameters

Use the `optim` function, choosing BFGS (in order to do unconstrained optimization), to fit the logistic regression and plot the decision boundary. Since `optim` takes the objective and its gradient as separate functions, `costFunction` is split into `computeCost` and `computeGrad`.

```r
computeCost <- function(theta, X, y) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))
  1./m * (-t(y) %*% log(h) - t(1-y) %*% log(1-h))
}
computeGrad <- function(theta, X, y) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))
  1./m * t(X) %*% (h-y)
}
res <- optim(initial_theta, computeCost, computeGrad, X, y,
             method="BFGS", control=list(maxit = 400))
theta <- res$par
```
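
As a side check (my addition, not part of the original exercise), the analytic gradient from `computeGrad` can be compared against a central finite-difference approximation of `computeCost`; the two should agree to many decimal places:

```r
# Sanity-check sketch (not in the original exercise): compare the analytic
# gradient to a central finite-difference approximation at initial_theta.
numGrad <- function(f, theta, eps = 1e-4) {
  g <- numeric(length(theta))
  for (k in seq_along(theta)) {
    e <- numeric(length(theta))
    e[k] <- eps
    g[k] <- (f(theta + e) - f(theta - e)) / (2 * eps)
  }
  g
}
max(abs(numGrad(function(t) computeCost(t, X, y), initial_theta) -
          computeGrad(initial_theta, X, y)))   # expect a value near 0
```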
```r
computeCost(theta, X, y)
##           [,1]
## [1,] 0.2034985
```
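
The straight-line boundary is where the model is indifferent between the two classes, i.e. where the argument of the sigmoid is zero. Setting $$\theta_0+\theta_1x_1+\theta_2x_2=0$$ and solving for $$x_2$$ gives the line plotted below:

$x_2=-\frac{1}{\theta_2}\left(\theta_1x_1+\theta_0\right)$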
```r
plot_x <- c(min(X[,2])-10, max(X[,2])+10)
plot_y <- (-1./theta[3]) * (theta[2] * plot_x + theta[1])
df_db <- data.frame(x=plot_x, y=plot_y)
ggplot(df_db, aes(x,y)) + geom_line(color="cyan3") +
  geom_point(aes(V1, V2, col=factor(V3)), data, size=I(3), shape=I(3)) +
  labs(x="Exam1 score", y="Exam2 score") +
  coord_cartesian(xlim=c(30, 100), ylim=c(30, 100))
```

#### Evaluating logistic regression

Predict the admission probability for a student with an Exam 1 score of 45 and an Exam 2 score of 85, then measure the training accuracy of the classifier.

```r
sigmoid <- function(x) 1./(1+exp(-x))
prob <- sigmoid(c(1, 45, 85) %*% theta)    # prepend 1 for the intercept term
prob
##           [,1]
## [1,] 0.7756949
y_prob <- sigmoid(X %*% theta)
p <- y_prob > 0.5                          # classify at the 0.5 threshold
mean(p==y) * 100.0                         # training accuracy in percent
## [1] 89
```
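
Accuracy alone does not show where the misclassifications fall; a quick cross-tabulation of predictions against labels (a small addition, not part of the original exercise) makes the breakdown visible:

```r
# Hypothetical addition: confusion table of predicted vs. actual classes.
table(predicted = as.integer(p), actual = as.vector(y))
```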

### Regularized Logistic Regression

#### Visualizing the data

Use a scatter plot to visualize the data.

```r
data <- read.csv("ex2data2.txt", header=FALSE)
X <- as.matrix(data[,c("V1", "V2")])
y <- as.matrix(data$V3)
ggplot(data, aes(x=V1, y=V2, col=factor(V3))) +
  geom_point(shape=I(3), size=I(3)) +
  labs(x="Microchip Test 1", y="Microchip Test 2") +
  scale_color_manual(name="", labels=c("y = 0", "y = 1"),
                     values=c("blue", "red"))
```

#### Feature mapping

One way to fit the data better is to create more features from each data point. Here we map the features into all polynomial terms of $$x_1$$ and $$x_2$$ up to the sixth power. As a result of this mapping, our vector of features is transformed into a $$28$$-dimensional vector.

```r
mapFeature <- function(X1, X2) {
  degree <- 6
  out <- matrix(1, length(X1), 1)          # bias column
  for(i in 1:degree) {
    for(j in 0:i) {
      out <- cbind(out, (X1^(i-j)) * (X2^j))
    }
  }
  out
}
X <- mapFeature(X[,1], X[,2])
```

#### Cost function and gradient

The regularized cost function in logistic regression is

$J(\theta)=\left[\frac1m\sum_{i=1}^m[-y^{(i)}\log(h_\theta(x^{(i)}))-(1-y^{(i)})\log(1-h_\theta(x^{(i)}))]\right]+\frac{\lambda}{2m}\sum_{j=1}^n\theta_j^2.$

Note that you should not regularize the parameter $$\theta_0$$. The gradient of the cost is a vector where the $$j^{th}$$ element (for $$j=0,1,\dots,n$$) is defined as follows:

$\begin{split} &\frac{\partial J(\theta)}{\partial \theta_0}=\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_0^{(i)} & \mathrm{for}\ j=0\\ &\frac{\partial J(\theta)}{\partial \theta_j}=\left(\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}\right) + \frac{\lambda}m\theta_j \quad&\mathrm{for}\ j\ge1 \end{split}$

```r
costFunctionReg <- function(theta, X, y, lambda) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))
  theta0 <- theta
  theta0[1] <- 0                           # exclude theta_0 from the penalty
  J <- 1./m*(-t(y) %*% log(h) - t(1-y) %*% log(1-h)) +
    lambda/(2*m)*t(theta0)%*%theta0
  grad <- 1./m*t(X) %*% (h-y) + lambda/m*theta0
  list(J, grad)
}
initial_theta <- matrix(0, dim(X)[2], 1)
lambda <- 1
list[cost, grad] <- costFunctionReg(initial_theta, X, y, lambda)
cost
##           [,1]
## [1,] 0.6931472
```

#### Learning parameters

Fit the regularized logistic regression. Then plot the non-linear decision boundary by computing the classifier's predictions on an evenly spaced grid and drawing a contour plot of where the predictions change from $$y = 0$$ to $$y = 1$$.

```r
computeCostReg <- function(theta, X, y, lambda) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))
  theta0 <- theta
  theta0[1] <- 0
  1./m * (-t(y) %*% log(h) - t(1-y) %*% log(1-h)) + lambda/(2*m)*t(theta0)%*%theta0
}
computeGradReg <- function(theta, X, y, lambda) {
  m <- length(y)
  h <- 1./(1+exp(-X %*% theta))
  theta0 <- theta
  theta0[1] <- 0
  1./m * t(X) %*% (h-y) + lambda/m*theta0
}
res <- optim(initial_theta, computeCostReg, computeGradReg, X, y, lambda,
             method="BFGS", control=list(maxit = 400))
theta <- res$par
```

Evaluate $$z=\theta^Tx$$ for the mapped features on the grid; the decision boundary is the $$z=0$$ level set of this surface.

```r
u <- seq(-1, 1.5, length=50)
v <- seq(-1, 1.5, length=50)
z <- matrix(0, length(u), length(v))
idx <- z
idy <- z
for(i in 1:length(u)) {
  for(j in 1:length(v)) {
    z[i,j] <- mapFeature(u[i], v[j]) %*% theta
    idx[i, j] <- u[i]
    idy[i, j] <- v[j]
  }
}
df_contour <- data.frame(x=c(idx), y=c(idy), z=c(z))
# bins=1 draws a single contour line approximating the z = 0 boundary
ggplot(df_contour, aes(x,y,z=z)) + geom_contour(color="cyan3", bins=1) +
  geom_point(data=data, aes(V1, V2, color=factor(V3), z=NULL), size=I(3), shape=I(3)) +
  labs(x="Microchip Test 1", y="Microchip Test 2") +
  scale_color_manual(name="", labels=c("y = 0", "y = 1"), values=c("blue", "red"))
```

Finally, compute the training accuracy on the mapped features.

```r
y_prob <- sigmoid(X %*% theta)
p <- y_prob > 0.5
mean(p==y) * 100.0    # training accuracy in percent
## [1] 83.05085
```
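
To see how the regularization strength shapes the fit, the same pipeline can be re-run with other values of $$\lambda$$. This sketch (my addition, not part of the post's original code) compares final training costs; smaller $$\lambda$$ drives the training cost lower (risking overfitting), while larger $$\lambda$$ keeps the fit smoother:

```r
# Exploration sketch (not in the original exercise code): refit with
# different regularization strengths and compare the final training cost.
for (l in c(0, 1, 100)) {
  r <- optim(initial_theta, computeCostReg, computeGradReg, X, y, l,
             method="BFGS", control=list(maxit = 400))
  cat("lambda =", l, " final cost =", r$value, "\n")
}
```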