Author: Anil Gurbuz
Last Updated: 7 MAY 2019
I will be implementing Perceptron and a simple architecture of Neural Network from scratch and will compare their classification performance on a dataset with circular decision boundary.
To observe the best performance of Nueral Network, I will be experimenting with different number of hidden layer neurons in Neural Network and select the archticture with best classification accuracy.
Perceptron is a binary classification algorithm which corresponds to simplest neural unit (neuron) in a Neural Network with step function as an activation function. Structurely it is a generalised linear model with an activation function on top.
Objective function of perceptron is classification accuracy and It will be optimized using Stochastic Gradient Descent (SGD) algorithm below.
Perceptron creates a linear decision boundary to differentiate binary class values. We will observe the effect of linear decision boundary when the underlying relationship is more complex then linear.
Neural Network consists of several neurons that are taking the output of one another as an input. NN optimizes objective function through process of backpropagation. This is the step that NN weights are updated -- where the learning happens. Weights are updated in a way that this potentially complex architecture of network produces the optimum results for the objective function.
Compared to Perceptron, NN can be much more powerful though in general more complex it gets more data will be needed to be able to estimate increasing number of model parameters i.e weights.
# Import required libraries
library(ggplot2)
library(reshape2)
# Load data
test=read.csv("test.csv")
train=read.csv("train.csv")
# Stucture datasets
train=train[complete.cases(train),]
test=test[complete.cases(test),]
train.data = train[,-3]
train.label = train[,3]
test.data = test[,-3]
test.label = test[,3]
train.data = as.data.frame(scale(train.data))
test.data = as.data.frame(scale(test.data))
# Change 0 labels to -1
train.label[train.label==0] = -1
test.label[test.label==0] = -1
train$y[train$y==0] = -1
# Plot training set
ggplot(data=train, aes(x1,x2, color=factor(y) )) +
geom_point() + theme_minimal()
perceptron = function(train.data, train.label, eta, epsilon, tau.max, initial_weights){
# Initialisation
Phi = as.matrix(cbind(1, train.data))
T = train.label
W = matrix(,nrow=tau.max, ncol=ncol(Phi))
W[1,] = runif(ncol(Phi))
train.len=nrow(train.data)
error.trace <- matrix(0,nrow=tau.max, ncol=1)
error.trace[1] <- sum((Phi%*%W[1,])*T<0)/train.len*100
tau = 1
terminate = FALSE
while(!terminate){
# Shuffle data
train.index <- sample(1:train.len, replace = FALSE)
Phi <- Phi[train.index,]
T <- T[train.index]
for (i in 1:train.len){
if (tau==tau.max){break}
if ((W[tau,]%*%Phi[i,])*T[i]<0){
# Update counter
tau = tau+1
# Update parameters
W[tau,] <- W[tau-1,] + eta * Phi[i,] * T[i]
# Update the records
error.trace[tau] <- sum((Phi%*%W[tau,])*T<0)/train.len*100
}
}
# Decrease eta
eta = eta * 0.99
# Recalculate termination conditions
terminate <- tau >= tau.max |
abs(sum((Phi%*%W[tau,])*T<0)/train.len - sum((Phi%*%W[tau-1,])*T<0)/train.len) <= epsilon
}
# Get the final model parameters
W = W[tau,]
return(W)
}
# Set parameters
eta=0.01
epsilon=0.00001
tau.max=1000
intial.weights=runif(3)
# Run perceptron
w = perceptron(train.data, train.label, eta, epsilon, tau.max, intial.weights)
# Make predictions
predictions = as.matrix(cbind(1,test.data))%*%w
predictions[predictions>0,] = 1
predictions[predictions<0,] = -1
# Calculate test error
test.error = sum(predictions!=test.label)/nrow(test.data)
# Plot test data and perceptron predictions
df=as.data.frame(cbind(test.data,predictions))
ggplot(df, aes(x=x1, y=x2, color=factor(predictions))) +
geom_point() + theme_minimal()
# Print test error of perceptron
perceptron.error = test.error
cat(sprintf("Perceptron Test Error :%f",perceptron.error))
As seen above, perceptron can only create a linear decision boundary which is not appropriate for our data as the boundary is circular for this dataset.
We can also conclude that our perceptron model by itself is not complex enough to learn relatively more complex relationship in the dataset. Therefore, I will employ a more complex model -- Neural Network -- in the upcoming part.
# auxiliary functions
## the activation function (sigmoid here)
h <- function(z) {
return (1/(1+exp(-3*z)))
}
## the derivitive of the activation function (sigmoid here)
h.d <- function(z) {
return (h(z)*(1-h(z)))
}
## Class Probabilities
probability <- function(X, W1, W2, b1, b2){
a2 <- h(sweep(W1 %*% X, 1, b1,'+' ))
a3 <- h(sweep(W2 %*% a2, 1, b2,'+' ))
return (a3)
}
## prediction
prediction <- function(X, W1, W2, b1, b2, threshold=0.5){
return (ifelse(probability(X, W1, W2, b1, b2)>=threshold, 1, 0))
}
## Accuracy
accuracy <- function(Y, T){
return (sum(Y==T)/length(T)*100)
}
## The following structure helps us to have functions with multiple outputs
### credit: https://stat.ethz.ch/pipermail/r-help/2004-June/053343.html
list <- structure(NA,class="result")
"[<-.result" <- function(x,...,value) {
args <- as.list(match.call())
args <- args[-c(1:2,length(args))]
length(value) <- length(args)
for(i in seq(along=args)) {
a <- args[[i]]
if(!missing(a)) eval.parent(substitute(a <- v,list(a=a,v=value[[i]])))
}
x
}
feedforward <- function(Xi, Ti, W1, b1, W2, b2){
### 1st (input) layer
a1 <- Xi
y <- Ti
### 2nd (hidden) layer
z2 <- W1 %*% a1 + b1
a2 <- h(z2)
### 3rd (output) layer
z3 <- W2 %*% a2 + b2
a3 <- h(z3)
return(list(a1, a2, a3, y, z2, z3))
}
backpropagation <- function(Ti, W2, z2, z3, a3){
### 3rd (output) layer
d3 <- -(Ti-a3) * h.d(z3)
### 2nd (hidden) layer
d2 <- t(W2)%*%d3 * h.d(z2)
return(list(d2,d3))
}
NN <- function(K, X1, T1, X2, T2, eta, lambda, tracing=FALSE, epoch.max=500){
# Setting parameters
D <- 2
N = ncol(X1)
preds=matrix(nrow=epoch.max, ncol=ncol(T1))
# initialization
if (tracing==TRUE) {train.accuracy <- matrix(0,nrow=epoch.max, ncol=1); test.accuracy <- train.accuracy}
epoch <- 1 # epoch (iteration) counter
terminate <- FALSE # termination criteria
W1 <- matrix(rnorm(D*K, sd=0.5), nrow=K, ncol=D)
b1 <- matrix(rnorm(1*K), nrow=K, ncol=1)
W2 <- matrix(rnorm(K*1, sd=0.5), nrow=1, ncol=K)
b2 <- matrix(rnorm(1*1), nrow=1, ncol=1)
# main loop
while (!terminate){
## delta vectors/matrices initialization
W1.d <- W1 *0
b1.d <- b1 *0
W2.d <- W2 *0
b2.d <- b2 *0
## inner loop for each train sample
for (i in 1:N){
## Feedforward:
list[a1, a2, a3, y, z2, z3] <- feedforward(X1[,i], T1[i], W1, b1, W2, b2)
## Backpropagation:
list[d2, d3] <- backpropagation(T1[i], W2, z2, z3, a3)
## calculate the delta values
### 1st layer
W1.d <- W1.d + d2 %*% t(a1)
b1.d <- b1.d + d2
### 2nd layer
W2.d <- W2.d + d3 %*% t(a2)
b2.d <- b2.d + d3
}
## update weight vectors and matrices
W1 <- W1 - eta * (W1.d/N + lambda*W1)
b1 <- b1 - eta * (b1.d/N)
W2 <- W2 - eta * (W2.d/N + lambda*W2)
b2 <- b2 - eta * (b2.d/N)
## record the errors
if (tracing==TRUE){
train.accuracy[epoch]<- accuracy(prediction(X1, W1, W2, b1, b2), T1)
test.accuracy[epoch]<- accuracy(prediction(X2, W1, W2, b1, b2), T2)
}
## increase the iteration counter
epoch <- epoch + 1
## check the termination criteria
if (epoch > epoch.max) {terminate <- TRUE}
}
if (tracing==FALSE){
train.accuracy <- accuracy(prediction(X1, W1, W2, b1, b2), T1)
test.accuracy <- accuracy(prediction(X2, W1, W2, b1, b2), T2)
}
return(list(test.accuracy,prediction(X2,W1,W2,b1,b2)))
}
# Again Load data and turn structure into appropriate format for NN
test=read.csv("test.csv")
train=read.csv("train.csv")
train=train[complete.cases(train),]
test=test[complete.cases(test),]
train.data = train[,-3]
train.label = train[,3]
test.data = test[,-3]
test.label = test[,3]
train.data = as.data.frame(scale(train.data))
test.data = as.data.frame(scale(test.data))
X1 <- t(unname(data.matrix(train.data)))
T1 <- t(data.matrix(as.numeric(train.label)))
X2 <- t(unname(data.matrix(test.data)))
T2 <- t(data.matrix(as.numeric(test.label)))
# Matrix to store test errors of models for different K values
error= matrix(nrow = 50, ncol =2)
error=as.data.frame(error)
names(error)=c("K","Test Error")
# Matrix to store predictions of model for different K values
predictions = matrix(ncol=ncol(T2), nrow=50)
eta = 0.1
lambda = 0.0001
# For loop to run NN with different K values
for (i in 21:30){
results <- NN(i*2, X1, T1, X2, T2, eta, lambda, tracing = FALSE, epoch.max = 1000)
error[i,1]=i*2
error[i,2]= (100-results[[1]])
predictions[i,]=results[[2]]
}
# Rename error data frame
names(error)=c("K","Test_Error")
# Plot test error for different K values
ggplot(error ,aes(K, Test_Error)) +
geom_point() + theme_minimal() +
ylab("Test Error %")
# Preint best model properties
index=which.min(error[,2])
cat(sprintf(" Best Model K: %i,\n Best Model Test Error: %f",error[index,1],error[index,2]))
# Add x1 and x2 of test set to best models predictions
best_preds=predictions[index,]
best_preds=cbind(t(X2),best_preds)
# Rename best_preds data frame's columns
best_preds=as.data.frame(best_preds)
colnames(best_preds)=c("x1","x2",'Predictions')
# Plot predictions done by best model
ggplot(best_preds, aes(x1, x2, color=factor(Predictions))) +
geom_point() + theme_minimal()
As seen above, even a relatively simple Neural Network with K number of hidden neurans was complex enough to capture underlying correct decision boundary in the data.
This was expected as NN consists of several perceptron which makes it much more complex and flexiable.