Author: Anil Gurbuz
Last Updated: 12 Apr 2019
The perceptron is the building block of neural networks: every node in a NN architecture is a single perceptron. It can be used for binary classification by forwarding the output of the linear predictor $\vec{w}^\top\vec{x} = \sum_i w_i x_i$ to an activation function, which maps this value to a class probability in $[0,1]$.
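As a minimal sketch of this idea (illustrative only, not part of the pipeline below), a single perceptron with a sigmoid activation can be written as:
# Minimal sketch of a single perceptron's forward pass (illustrative only)
sigmoid <- function(z) 1 / (1 + exp(-z))
perceptron.predict <- function(w, x) {
  sigmoid(sum(w * c(1, x))) # prepend 1 for the bias term, map w'x to (0,1)
}
# e.g. perceptron.predict(c(-0.5, 1, 2), c(0.3, 0.7)) returns a value in (0,1)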
In this notebook, I will implement a multiclass perceptron, which can be built from several binary discriminant functions.
I will use two approaches to combine several binary classifiers into a multi-class decision boundary.
Using K(K-1)/2 classifiers -- one for every possible pair of classes. This means training one binary classifier for each pair of classes, then applying a majority vote of the K(K-1)/2 classifiers to each test point. We will see that this approach creates ambiguous regions in the predictor space, so some examples end up without any prediction -- essentially an equal number of votes for each class.
One way to step over the ambiguous region is to use K binary classifiers, each representing one class. This way, during training each classifier learns to assign a higher score to a single class, and the final class prediction for each training/test point is decided by selecting the classifier that assigned the highest score to that example. One advantage of this approach is that it divides the predictor space without creating any ambiguous region.
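In symbols (writing $\boldsymbol{\phi}(\mathbf{x})$ for the feature vector with the bias term prepended): the 1-vs-1 scheme takes a majority vote over the $K(K-1)/2$ pairwise classifiers, while the K-classifier scheme predicts
$$\hat{y}(\mathbf{x}) = \arg\max_{k \in \{1,\dots,K\}} \mathbf{w}_k^\top \boldsymbol{\phi}(\mathbf{x}),$$
so every point receives exactly one label and no ambiguous region arises.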
# Load libraries
library(ggplot2)
library(reshape2)
options(warn=-1)
options(repr.plot.width=6, repr.plot.height=4)
# Load datasets
train = read.csv('train.csv')
test = read.csv('test.csv')
# create training and testing datasets:
train.data = train[,-5]
train.label = train[,5]
test.data = test[,-5]
test.label = test[,5]
# Scale data
train.data=scale(train.data)
test.data=scale(test.data)
train.len = nrow(train)
test.len = nrow(test)
Phi <- as.matrix(cbind(1, train.data)) # add a column of 1s (phi_0, the bias term) to training data
eta <- 0.1 # Learning rate
epsilon <- 0.001 # Stopping criterion
tau.max <- 1000 # Maximum number of iterations
T <- train.label # rename just for convenience
W.1 <- matrix(,nrow=tau.max, ncol=ncol(Phi)) # Weight matrix for classifier 1 (one row per update)
W.2 <- matrix(,nrow=tau.max, ncol=ncol(Phi)) # Weight matrix for classifier 2 (one row per update)
W.3 <- matrix(,nrow=tau.max, ncol=ncol(Phi)) # Weight matrix for classifier 3 (one row per update)
W.1[1,] <- runif(ncol(Phi)) # Random initial values for weight vector
W.2[1,] <- runif(ncol(Phi)) # Random initial values for weight vector
W.3[1,] <- runif(ncol(Phi)) # Random initial values for weight vector
error = matrix(,nrow=tau.max, ncol=1) # To store error values
tau <- 1 # iteration counter
terminate <- FALSE # termination status
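# Training loop: each epoch shuffles the data; every misclassified point triggers
# a perceptron update (reward the true class, penalise the wrongly winning class);
# training stops after tau.max updates or when the epoch error drops below epsilon.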
while(!terminate){
# Shuffle the data
train.index <- sample(1:train.len, replace = FALSE)
Phi <- Phi[train.index,]
T <- T[train.index]
#For each point...
for (i in train.index){
# Check the termination criterion
if (tau == tau.max) {break}
# Calculate the predictions of current models for current point
val1= W.1[tau,]%*%Phi[i,]
val2= W.2[tau,]%*%Phi[i,]
val3= W.3[tau,]%*%Phi[i,]
max = max(val1,val2,val3)
# If the winning model's prediction doesn't match the label...
if ((val1 == max && T[i] != "C1") || (val2 == max && T[i] != "C2") || (val3 == max && T[i] != "C3")){
# Increase iteration counter
tau = tau +1
# Update coefficients
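# (add eta*Phi[i,] to the true class's weight vector and subtract it from
# the weight vector of the class that wrongly scored highest)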
if (T[i]=='C1' && max==val2){
W.1[tau,] = W.1[tau-1,] + eta*Phi[i,]
W.2[tau,] = W.2[tau-1,] - eta*Phi[i,]
W.3[tau,] = W.3[tau-1,]
}else if (T[i]=='C1' && max==val3){
W.1[tau,] = W.1[tau-1,] + eta*Phi[i,]
W.2[tau,] = W.2[tau-1,]
W.3[tau,] = W.3[tau-1,] - eta*Phi[i,]
}else if (T[i]=='C2' && max==val1){
W.1[tau,] = W.1[tau-1,] - eta*Phi[i,]
W.2[tau,] = W.2[tau-1,] + eta*Phi[i,]
W.3[tau,] = W.3[tau-1,]
}else if (T[i]=='C2' && max==val3){
W.1[tau,] = W.1[tau-1,]
W.2[tau,] = W.2[tau-1,] + eta*Phi[i,]
W.3[tau,] = W.3[tau-1,] - eta*Phi[i,]
}else if (T[i]=='C3' && max==val2){
W.1[tau,] = W.1[tau-1,]
W.2[tau,] = W.2[tau-1,] - eta*Phi[i,]
W.3[tau,] = W.3[tau-1,] + eta*Phi[i,]
}else if (T[i]=='C3' && max==val1){
W.1[tau,] = W.1[tau-1,] - eta*Phi[i,]
W.2[tau,] = W.2[tau-1,]
W.3[tau,] = W.3[tau-1,] + eta*Phi[i,]
}
}
}
# Calculate predictions for current models and the error value
predictions=as.data.frame(cbind(t(W.1[tau,]%*%t(Phi)),t(W.2[tau,]%*%t(Phi)),t(W.3[tau,]%*%t(Phi))))
for (row in 1:nrow(predictions)){
predictions[row,'pred']=paste('C', which.max(predictions[row,]),sep = '')
}
error.percentage=1-(sum(predictions[,'pred']==T)/nrow(predictions))
# recalculate termination conditions
terminate <- (tau >= tau.max | (error.percentage<=epsilon))
}
# Done, trim the weight matrices
W.1 <- W.1[1:tau,]
W.2 <- W.2[1:tau,]
W.3 <- W.3[1:tau,]
# Prepare test data for multiplication with the model parameters (add the bias column)
test.Phi <- as.matrix(cbind(1, test.data))
# Data frame to store error values and the corresponding batch
error.percentage=data.frame(matrix(ncol=2, nrow=0))
colnames(error.percentage)=c('Batch','Error Percentage')
# Initialise batch.no
batch.no=0
# For each batch...
for (batch in seq(5,nrow(W.1),5)){
# Calculate batch No
batch.no= batch.no + 1
# Make predictions on entire test data using each W
predictions=as.data.frame(cbind(t(W.1[batch,]%*%t(test.Phi)),t(W.2[batch,]%*%t(test.Phi)),t(W.3[batch,]%*%t(test.Phi))))
# Classify predicted points
for (row in 1:nrow(predictions)){
predictions[row,'pred']=paste('C', which.max(predictions[row,]),sep = '')
}
# Store batch no and ratio of misclassifications
error.percentage[batch.no,1]=batch.no
error.percentage[batch.no,2]=1-(sum(predictions[,'pred']==test.label)/nrow(predictions))
}
# Plot batch no vs misclassification ratio
error.m=melt(error.percentage, id='Batch')
ggplot(data=error.m, aes(Batch,value)) +
labs(y='Misclassification Ratio', x='Batch No') +
ggtitle('Misclassification Ratio vs Batch No') +
geom_line() + theme_minimal()
# Define a binary classifier to use for all combinations of classes in the next steps
binary <- function(train.data, train.label, train.len, eta, epsilon, tau.max){
Phi <- as.matrix(cbind(1, train.data)) # add a column of 1 as phi_0
T <- train.label # rename just for convenience
W <- matrix(,nrow=tau.max, ncol=ncol(Phi)) # Weight matrix (one row per update)
W[1,] <- runif(ncol(Phi)) # Random initial values for weight vector
error.trace <- matrix(0,nrow=tau.max, ncol=1) # Placeholder for errors
error.trace[1] <- sum((Phi%*%W[1,])*T<0)/train.len*100 # record error for initial weights
tau <- 1 # iteration counter
terminate <- FALSE # termination status
while(!terminate){
# reshuffle train data and associated labels:
train.index <- sample(1:train.len, replace = FALSE)
Phi <- Phi[train.index,]
T <- T[train.index]
for (i in 1:train.len){
if (tau == tau.max) {break}
# look for misclassified samples
if ((W[tau,]%*%Phi[i,])*T[i]<0){
# update tau counter
tau <- tau +1
# update the weights
W[tau,] <- W[tau-1,] + eta * Phi[i,] * T[i]
# update the records
error.trace[tau] <- sum((Phi%*%W[tau,])*T<0)/train.len*100
}
}
# decrease eta:
eta = eta * 0.99
# recalculate termination conditions
terminate <- (tau >= tau.max | sum((Phi%*%W[tau,])*T<0)/train.len <= epsilon )
}
W <- W[1:tau,,drop=FALSE] # cut the empty part of the matrix (when the loop stops before tau == tau.max)
# Return final parameters
return(W[tau,])
}
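As a quick, self-contained sanity check (synthetic data, not part of the assignment data), the function can be exercised on a linearly separable toy set:
# Toy usage example on synthetic, linearly separable data (illustrative only)
set.seed(1)
toy.data = matrix(rnorm(40), ncol=2)
toy.label = ifelse(toy.data[,1] > 0, 1, -1) # labels in {-1,+1}, as binary() expects
w.toy = binary(toy.data, toy.label, nrow(toy.data), 0.1, 0.001, 100)
mean(sign(cbind(1, toy.data) %*% w.toy) == toy.label) # training accuracy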
eta <- 0.1 # Learning rate
epsilon <- 0.001 # Stopping criterion
tau.max <- 100 # Maximum number of iterations
# Index values of each class labels
indx.c1=which(train.label=='C1')
indx.c2=which(train.label=='C2')
indx.c3=which(train.label=='C3')
# Because the labels are sorted, we can use the known row indexes of each class:
# C1 = rows 1:25, C2 = rows 26:50, C3 = rows 51:75.
# For example, rows 1:50 are enough to catch C1 and C2.
# C1 -> +1, C2 -> -1
train.data.c1.c2 = train.data[1:50,]
train.label.c1.c2 = rep(1,50)
train.label.c1.c2[26:50]=-1
train.len.c1.c2=50
# C1 -> +1, C3 -> -1
train.data.c1.c3 = train.data[c(1:25,51:75),]
train.label.c1.c3 = rep(1,50)
train.label.c1.c3[26:50]=-1
train.len.c1.c3=50
# C2 -> +1, C3 -> -1
train.data.c2.c3 = train.data[26:75,]
train.label.c2.c3 = rep(1,50)
train.label.c2.c3[26:50]=-1
train.len.c2.c3=50
# Train a binary classifier for all combinations of classes
w.c1.c2=binary(train.data.c1.c2, train.label.c1.c2, train.len.c1.c2, eta, epsilon, tau.max)
w.c1.c3=binary(train.data.c1.c3, train.label.c1.c3, train.len.c1.c3, eta, epsilon, tau.max)
w.c2.c3=binary(train.data.c2.c3, train.label.c2.c3, train.len.c2.c3, eta, epsilon, tau.max)
# Prepare test data for multiplication with the model parameters (add the bias column)
Phi.test=as.matrix(cbind(1, test.data))
# Make predictions on entire test data using all of the classifiers
pred.c1.c2 =Phi.test %*% w.c1.c2
pred.c1.c3 =Phi.test %*% w.c1.c3
pred.c2.c3 =Phi.test %*% w.c2.c3
# positive prediction -> C1 , negative prediction -> C2
pred.c1.c2 = as.data.frame(cbind(pred.c1.c2,0))
colnames(pred.c1.c2) = c("Value",'C1-C2')
pred.c1.c2[pred.c1.c2['Value']>=0,2] = 'C1'
pred.c1.c2[pred.c1.c2['Value']<0,2] = 'C2'
pred.c1.c2=pred.c1.c2['C1-C2']
# positive prediction -> C1 , negative prediction -> C3
pred.c1.c3 = as.data.frame(cbind(pred.c1.c3,0))
colnames(pred.c1.c3) = c("Value",'C1-C3')
pred.c1.c3[pred.c1.c3['Value']>=0,2] = 'C1'
pred.c1.c3[pred.c1.c3['Value']<0,2] = 'C3'
pred.c1.c3=pred.c1.c3['C1-C3']
# positive prediction -> C2 , negative prediction -> C3
pred.c2.c3 = as.data.frame(cbind(pred.c2.c3,0))
colnames(pred.c2.c3) = c("Value",'C2-C3')
pred.c2.c3[pred.c2.c3['Value']>=0,2] = 'C2'
pred.c2.c3[pred.c2.c3['Value']<0,2] = 'C3'
pred.c2.c3=pred.c2.c3['C2-C3']
# Reshape for vote counting
predictions=cbind(pred.c1.c2,pred.c1.c3,pred.c2.c3)
# Matrix to store classification of all points after vote counting
last_predictions=matrix(nrow=nrow(test.data))
# Count the votes for each point and assign a class
for (i in 1:nrow(predictions)){
if (predictions[i,1] =='C1' && predictions[i,2]== 'C1'){
last_predictions[i]='C1'
}else if (predictions[i,1] =='C2' && predictions[i,3]== 'C2'){
last_predictions[i]='C2'
}else if(predictions[i,2] =='C3' && predictions[i,3]== 'C3'){
last_predictions[i]='C3'
}else {last_predictions[i]='Confusion Event'}
}
# Print number of Confusion events
cat(sprintf('Number of Confusion Events: %i', sum(last_predictions=='Confusion Event')))
There are confusion events because of the ambiguous region created by the 1-vs-1 approach. The number of confusion events is 25 for this run (it can differ between runs, so it is printed by the previous command).
In the above data, there are 3 target classes, which requires creating 3·2/2 = 3 classifiers in total, one for each pair of classes. Partitioning this 4-D training space (x1, x2, x3, x4) with 3 separating hyperplanes creates an ambiguous part. If a point's classes are predicted as C1, C2 and C3 by the 3 classifiers, the point is located in the ambiguous part of the partitioned 4-D space and is called a confusion event.
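For K classes, the 1-vs-1 construction needs one classifier per unordered pair:
$$\binom{K}{2} = \frac{K(K-1)}{2}, \qquad \text{so for } K=3: \ \frac{3 \cdot 2}{2} = 3.$$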
An ambiguous region is unavoidable in the 1-vs-1 approach because each classifier only separates one pair of classes. This is visualised in the image below: the triangular area between the 3 classifiers' decision boundaries is the ambiguous region. Points in this area receive 1 vote for each class, which makes them confusion events.