Billy Ian's Short Leisure-time Wander

into language, learning, intelligence and beyond

ML With R (3): Logistic Regression

| Comments

Code in R for Coursera-ML and CMPUT466/551 in University of Alberta

This post refers to programming exercise 2 in Coursera-ML. All the code can be found in this repo.

# Machine-specific working directory containing the exercise data files.
setwd("~/Coding/R/CMPUT466/mlclass-ex2")
library(ggplot2)   # plotting
library(R.matlab)  # reading MATLAB .mat files
library(devtools)  # provides source_url()
# Pulls in gsubfn's list.R, which enables the list[a, b] <- f() destructuring
# assignment used later in this script. NOTE(review): requires network access.
source_url("https://raw.githubusercontent.com/ggrothendieck/gsubfn/master/R/list.R")

Logistic Regression

Visualizing the data

Use a scatter plot to visualize the data.

# Load exam scores (V1, V2) and admission label (V3); the file has no header,
# so read.csv assigns the default V1..V3 column names.
data <- read.csv("ex2data1.txt", header=FALSE)
X <- as.matrix(data[,c("V1", "V2")])  # feature matrix: the two exam scores
y <- as.matrix(data$V3)               # labels: 0 = not admitted, 1 = admitted
# Scatter plot of the two exam scores, colored by admission outcome.
ggplot(data, aes(x=V1, y=V2, col=factor(V3))) + 
  geom_point(shape=I(3), size=I(3)) +
  labs(x="Exam1 score", y="Exam2 score") +
  scale_color_manual(name="", labels=c("Not admitted", "Admitted"), values = c("blue", "red"))

Cost function and gradient

The cost function in logistic regression is

\[J(\theta)=\frac1m\sum_{i=1}^m[-y^{(i)}\log(h_\theta(x^{(i)}))-(1-y^{(i)})\log(1-h_\theta(x^{(i)}))],\]

and the gradient of the cost is a vector of the same length as \(\theta\) where the \(j^{th}\) element (for \(j=0,1,\dots,n\)) is defined as follows:

\[\frac{\partial J(\theta)}{\partial \theta_j}=\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}\]

# Logistic-regression cost J(theta) and its gradient, returned together as a
# two-element list (consumed via gsubfn's list[cost, grad] idiom).
#
# theta: (n+1) x 1 parameter vector (intercept first)
# X:     m x (n+1) design matrix whose first column is all ones
# y:     m x 1 vector of 0/1 labels
costFunction <- function(theta, X, y) {
  m <- length(y)
  # Hypothesis: elementwise sigmoid of the linear predictor X * theta.
  hyp <- 1./(1 + exp(-(X %*% theta)))
  # Average cross-entropy loss over the m examples.
  cost <- 1./m * (-(t(y) %*% log(hyp)) - t(1 - y) %*% log(1 - hyp))
  # Gradient of the loss with respect to theta: (1/m) X' (h - y).
  gradient <- 1./m * (t(X) %*% (hyp - y))
  list(cost, gradient)
}

# Dataset dimensions: m examples, n raw features.
m <- dim(X)[1]
n <- dim(X)[2]
# Prepend the intercept column of ones.
X <- cbind(1, X)
# Start optimization from the zero vector (n+1 parameters incl. intercept).
initial_theta <- matrix(0, n+1, 1)

# Destructure the (cost, gradient) pair; use <- (not =) for assignment,
# consistent with the regularized section below.
list[cost, grad] <- costFunction(initial_theta, X, y)
cost
##           [,1]
## [1,] 0.6931472

Learning parameters

Use the optim function choosing BFGS (in order to do unconstrained optimization) to fit the logistic regression and plot the decision boundary.

# Cost-only objective for optim(): average cross-entropy of the logistic
# hypothesis sigmoid(X * theta) against the 0/1 labels y.
computeCost <- function(theta, X, y) {
  m <- length(y)
  pred <- 1./(1 + exp(-(X %*% theta)))
  1./m * (-(t(y) %*% log(pred)) - t(1 - y) %*% log(1 - pred))
}
# Gradient of the logistic cost for optim(): (1/m) X' (h - y), where h is the
# sigmoid hypothesis. Returns an (n+1) x 1 matrix matching theta's shape.
computeGrad <- function(theta, X, y) {
  m <- length(y)
  pred <- 1./(1 + exp(-(X %*% theta)))
  1./m * (t(X) %*% (pred - y))
}
# Minimize the cost with BFGS (unconstrained quasi-Newton), supplying the
# analytic gradient; the extra args X and y are forwarded to both functions.
res <- optim(initial_theta, computeCost, computeGrad, X, y, method="BFGS", control=list(maxit = 400))
theta <- res$par
computeCost(theta, X, y)
##           [,1]
## [1,] 0.2034985
# Linear decision boundary theta1 + theta2*x + theta3*y = 0, solved for y at
# two x endpoints just outside the data range.
plot_x <- c(min(X[,2])-10, max(X[,2])+10)
plot_y <- (-1./theta[3]) * (theta[2] * plot_x + theta[1])
df_db <- data.frame(x=plot_x, y=plot_y)
# Overlay the boundary line on the original scatter plot.
ggplot(df_db, aes(x,y)) + geom_line(color="cyan3") + 
  geom_point(aes(V1, V2, col=factor(V3)), data, size=I(3), shape=I(3)) + 
  labs(x="Exam1 score", y="Exam2 score") +
  scale_color_manual(name="", labels=c("Not admitted", "Admitted"), values = c("blue", "red")) +
  coord_cartesian(xlim=c(30, 100), ylim=c(30, 100))

Evaluating logistic regression

# Logistic (sigmoid) function, applied elementwise to scalars, vectors,
# or matrices.
sigmoid <- function(x) {
  1. / (1. + exp(-x))
}
# Predicted admission probability for a student scoring 45 and 85
# (leading 1 is the intercept term).
prob <- sigmoid(c(1, 45, 85) %*% theta)
prob
##           [,1]
## [1,] 0.7756949
# Training-set accuracy: threshold probabilities at 0.5 and compare to labels.
y_prob <- sigmoid(X %*% theta)
p <- y_prob > 0.5
mean(p==y) * 100.0
## [1] 89

Regularized Logistic Regression

Visualizing the data

Use a scatter plot to visualize the data.

# Load the second dataset: two microchip test scores (V1, V2) and a 0/1
# quality label (V3); again no header row.
data <- read.csv("ex2data2.txt", header=FALSE)
X <- as.matrix(data[,c("V1", "V2")])  # feature matrix: the two test scores
y <- as.matrix(data$V3)               # labels: 0 or 1
# Scatter plot of the two test scores, colored by label.
ggplot(data, aes(x=V1, y=V2, col=factor(V3))) + 
  geom_point(shape=I(3), size=I(3)) +
  labs(x="Microchip Test 1", y="Microchip Test 2") +
  scale_color_manual(name="", labels=c("y = 0", "y = 1"), values = c("blue", "red"))

Feature mapping

One way to fit the data better is to create more features from each data point. Here we map the features into all polynomial terms of \(x_1\) and \(x_2\) up to the sixth power. As a result of this mapping, our vector of features has been transformed into a \(28\)-dimensional vector.

# Map two features to all polynomial terms of X1 and X2 up to `degree`,
# bias column included. For degree 6 this yields 28 columns in the order:
# 1, X1, X2, X1^2, X1*X2, X2^2, X1^3, ..., X1*X2^5, X2^6.
#
# Fix: the original declared `degree <- 6` but looped over the literal 1:6,
# leaving the variable dead; `degree` is now a real parameter (default 6,
# so existing two-argument calls behave identically).
#
# X1, X2:  numeric vectors of equal length (one value per example).
# degree:  highest total polynomial power to generate.
# Returns: a length(X1) x choose(degree + 2, 2) feature matrix.
mapFeature <- function(X1, X2, degree = 6) {
  out <- matrix(1, length(X1), 1)
  for(i in seq_len(degree)) {
    for(j in 0:i) {
      out <- cbind(out, (X1^(i-j)) * (X2^j))
    }
  }
  out
}
# Replace the two raw features with their degree-6 polynomial expansion;
# the bias column of ones is added inside mapFeature.
X <- mapFeature(X[,1], X[,2])

Cost function and gradient

The regularized cost function in logistic regression is

\[J(\theta)=\left[\frac1m\sum_{i=1}^m[-y^{(i)}\log(h_\theta(x^{(i)}))-(1-y^{(i)})\log(1-h_\theta(x^{(i)}))]\right]+\frac{\lambda}{2m}\sum_{j=1}^n\theta_j^2.\]

Note that you should not regularize the parameter \(\theta_0\). The gradient of the cost is a vector where the \(j^{th}\) element (for \(j=0,1,\dots,n\)) is defined as follows:

\[\begin{split} &\frac{\partial J(\theta)}{\partial \theta_0}=\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)} & \mathrm{for}\ j=0\\ &\frac{\partial J(\theta)}{\partial \theta_j}=\left(\frac1m\sum_{i=1}^m(h_\theta(x^{(i)})-y^{(i)})x_j^{(i)}\right) + \frac{\lambda}m\theta_j \quad&\mathrm{for}\ j\ge1 \end{split}\]

# Regularized logistic-regression cost and gradient, returned as a
# two-element list (consumed via gsubfn's list[cost, grad] idiom).
# The intercept theta[1] is excluded from the L2 penalty.
#
# theta:  (n+1) x 1 parameter vector (intercept first)
# X:      m x (n+1) design matrix
# y:      m x 1 vector of 0/1 labels
# lambda: regularization strength
costFunctionReg <- function(theta, X, y, lambda) {
  m <- length(y)
  hyp <- 1./(1 + exp(-(X %*% theta)))
  # Copy of theta with the intercept zeroed, so it carries no penalty.
  penalized <- theta
  penalized[1] <- 0
  cost <- 1./m * (-(t(y) %*% log(hyp)) - t(1 - y) %*% log(1 - hyp)) +
    lambda/(2*m) * (t(penalized) %*% penalized)
  gradient <- 1./m * (t(X) %*% (hyp - y)) + lambda/m * penalized
  list(cost, gradient)
}
# Zero-initialize all 28 mapped-feature parameters; lambda = 1 sets the
# regularization strength.
initial_theta <- matrix(0, dim(X)[2], 1)
lambda <- 1
# Destructure the (cost, gradient) pair returned by costFunctionReg.
list[cost, grad] <- costFunctionReg(initial_theta, X, y, lambda)
cost
##           [,1]
## [1,] 0.6931472

Learning parameters

Fit the regularized logistic regression. Then plot the non-linear decision boundary by computing the classifier’s predictions on an evenly spaced grid and drawing a contour plot of where the predictions change from \(y = 0\) to \(y = 1\).

# Cost-only objective for optim(): cross-entropy plus an L2 penalty on every
# parameter except the intercept theta[1].
computeCostReg <- function(theta, X, y, lambda) {
  m <- length(y)
  pred <- 1./(1 + exp(-(X %*% theta)))
  # Zero out the intercept so it is excluded from the penalty.
  penalized <- theta
  penalized[1] <- 0
  data_term <- 1./m * (-(t(y) %*% log(pred)) - t(1 - y) %*% log(1 - pred))
  data_term + lambda/(2*m) * (t(penalized) %*% penalized)
}
# Gradient of the regularized cost for optim(): (1/m) X'(h - y) plus
# (lambda/m) * theta for every component except the intercept.
computeGradReg <- function(theta, X, y, lambda) {
  m <- length(y)
  pred <- 1./(1 + exp(-(X %*% theta)))
  # Zero out the intercept so it receives no regularization term.
  penalized <- theta
  penalized[1] <- 0
  1./m * (t(X) %*% (pred - y)) + lambda/m * penalized
}
# Fit the regularized model with BFGS; lambda is forwarded to both the cost
# and gradient functions along with X and y.
res <- optim(initial_theta, computeCostReg, computeGradReg, X, y, lambda, method="BFGS", control=list(maxit = 400))
theta <- res$par

# Evaluate the classifier's raw score mapFeature(u, v) %*% theta on a 50x50
# grid; the decision boundary is where the score crosses zero.
u <- seq(-1, 1.5, length=50)
v <- seq(-1, 1.5, length=50)
z <- matrix(0, length(u), length(v))
idx <- z
idy <- z
# seq_along() instead of 1:length(): safe even for zero-length grids.
for(i in seq_along(u)) {
  for(j in seq_along(v)) {
    z[i,j] <- mapFeature(u[i], v[j]) %*% theta
    idx[i, j] <- u[i]
    idy[i, j] <- v[j]
  }
}
# Flatten the grid to long form for ggplot and draw a single contour
# (bins=1) approximating the boundary, overlaid on the raw data points.
df_contour <- data.frame(x=c(idx), y=c(idy), z=c(z))
ggplot(df_contour, aes(x,y,z=z)) + geom_contour(color="cyan3",bins=1) +
  geom_point(data=data, aes(V1, V2, color=factor(V3), z=NULL), size=I(3), shape=I(3))+
  labs(x="Microchip Test 1", y="Microchip Test 2") +
  scale_color_manual(name="", labels=c("y = 0", "y = 1"), values = c("blue", "red"))

# Training-set accuracy of the regularized model on the mapped features:
# threshold the predicted probabilities at 0.5 and compare to the labels.
y_prob <- sigmoid(X %*% theta)
p <- y_prob > 0.5
mean(p==y) * 100.0
## [1] 83.05085

Comments