Author: Anil Gurbuz
Last Updated: 7 Apr 2019
Non-parametric machine learning algorithms such as Decision Trees, Support Vector Machines, and k-Nearest Neighbours are quite popular in the data science world. Being non-parametric makes kNN highly flexible and capable of fitting a wide range of functional forms without making strong assumptions about the underlying function. Therefore, it can outperform parametric models in several scenarios.
In this notebook, I implement the kNN regression algorithm from scratch, together with a cross-validation loop, to analyse the relationship between the error of kNN and its hyperparameter K.
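For reference, kNN regression predicts the target of a query point by averaging the targets of its K nearest training points (Euclidean distance is used throughout this notebook):

$$\hat{y}(x) = \frac{1}{K} \sum_{i \in N_K(x)} y_i$$

where $N_K(x)$ denotes the indices of the K training points closest to $x$.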
# Load libraries and set global options
library(reshape2)
library(ggplot2)
library(corrplot)
options(warn=-1)
options(repr.plot.width=6, repr.plot.height=4)
# Loading the Simulated datasets
test = read.csv('test.csv')
train = read.csv('train.csv')
df = rbind(train,test)
knn <- function(train.input, train.target, test.input, K=3){
  # Count number of training samples
  train.len = length(train.input)
  # Count number of test samples
  test.len = length(test.input)
  # Create a vector to store the predictions
  predictions = c()
  # Pairwise Euclidean distances between test points (rows) and training points (columns)
  dist = as.matrix(dist(c(test.input,train.input), method = 'euclidean'))[1:test.len,(test.len+1):(test.len+train.len)]
  # For each test sample...
  for (i in 1:test.len){
    # ... find the positions of the K nearest training points
    # (row names carry the indices in the combined vector)
    nn = as.numeric(rownames(as.data.frame(sort(dist[i,])))[1:K])
    # ... and predict with the mean target of those neighbours
    # (subtract test.len to map back to training indices)
    predictions[i] = mean(train.target[(nn - test.len)])
  }
  # Return the predictions
  return(predictions)
}
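As a quick sanity check of the function above (using the simulated train and test data frames already loaded; the first five test points and K = 3 are arbitrary choices for illustration):
# Predict the first 5 test targets with K = 3 and compare with the actual values
knn.check = data.frame(x1 = test$x1[1:5], actual = test$y[1:5], predicted = knn(train$x1, train$y, test$x1[1:5], K = 3))
knn.check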
# Scatter plot of y against x1 with a cubic (degree-3 polynomial) trend line
ggplot(df,aes(x1,y)) + geom_point() +
stat_summary(fun.data=mean_cl_normal) + geom_smooth(method='lm', formula= y ~ poly(x, 3))
# Creating data frame to store error values
error <- data.frame('K'=1:20, 'train'=rep(0,20), 'test'=rep(0,20))
# Train and test MSE for K = 1...20
for (k in 1:20){
  error[k,'test'] = sum((knn(train$x1, train$y, test$x1, K=k) - test$y)^2)/length(test$x1)
  error[k,'train'] = sum((knn(train$x1, train$y, train$x1, K=k) - train$y)^2)/length(train$x1)
}
# Plot Error values for train and test data
error.m=melt(error, id ='K')
names(error.m) = c('K','Type','Error')
ggplot(data = error.m , aes(x= 1/K, y= Error , color = Type )) + geom_line() +
theme_minimal() + ggtitle('Error vs 1/K')
# Keep the test rows of the melted errors in a separate variable so the test data frame is not overwritten
test.error = error.m[error.m$Type=='test',]
cat('Minimum Test error occurs when K =', test.error[which.min(test.error$Error),'K'])
K is the only parameter controlling the complexity of the model built by this regressor, so it is essential to choose it carefully. An appropriate value of K can be decided by examining the graph above together with the data frame that holds the error values for each K.
Looking at the data frame, the minimum test error is observed at K = 11, so this is the optimal value with respect to test error.
The graph above shows that increasing K increases the training error, which indicates that larger K pushes the model towards underfitting.
Looking at the test error, increasing K from 1 first decreases the error and then increases it: the test error traces a "U"-shaped curve whose bottom is the optimum, with larger K tending towards underfitting and smaller K towards overfitting.
All in all, moving K from 11 down to 1 makes the model increasingly overfitted, while moving K from 11 up to 20 makes it increasingly underfitted, with K = 11 being the optimal value.
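To see this in the numbers, the endpoints and the optimum can be read straight off the error data frame computed above (a quick look before that data frame is reused for the cross-validation results below):
# Train/test MSE at K = 1 (overfitting), K = 11 (optimum) and K = 20 (underfitting)
error[error$K %in% c(1, 11, 20), ]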
cv = function(train.data, train.label, numFold=10, K=3){
  # Count number of training samples
  train.len = length(train.data)
  # A data frame to track the error of each fold
  error = data.frame("Fold"=1:numFold, 'Error'=rep(0,numFold))
  # Number of elements in every fold except for the last one
  fold.len = train.len %/% numFold
  # For each fold ...
  for (fold in 1:numFold){
    if (fold != numFold){
      # Indices of the current fold
      indx = (fold.len*(fold-1)+1):(fold.len*fold)
    } else {
      # The last fold also takes any remaining elements
      indx = (fold.len*(fold-1)+1):train.len
    }
    # Store the MSE on the held-out fold
    # (averaged over the fold size rather than the full training set)
    error[error$Fold==fold,'Error'] = sum((knn(train.data[-indx], train.label[-indx], train.data[indx], K=K) - train.label[indx])^2) / length(indx)
  }
  return(error)
}
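As a minimal illustration of the function (assuming the simulated training data loaded above; K = 5 is an arbitrary value chosen only for this example):
# Per-fold MSE of kNN with K = 5 under 10-fold cross-validation
cv(train$x1, train$y, numFold = 10, K = 5)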
# Fix the parameters
K=20 # Maximum K for KNN
numFold=10 # Number of Folds
# Matrix to store error values for each fold and K.
error = matrix(nrow = K, ncol=numFold+4)
# For each K...
for (k in 1:K){
  # Calculate and store the error of each fold
  error[k,1:numFold] = cv(train$x1, train$y, numFold = numFold, K=k)$Error
  # Mean CV error across folds
  error[k,11] = mean(error[k,1:numFold])
  # Standard deviation across folds
  error[k,12] = sd(error[k,1:numFold])
  # Mean - standard deviation
  error[k,13] = error[k,11] - error[k,12]
  # Mean + standard deviation
  error[k,14] = error[k,11] + error[k,12]
}
# Naming columns
error = as.data.frame(error)
colnames(error)[11] = "Mean Error"
colnames(error)[12] = "Std. Error"
colnames(error)[13] = "Mean - Std. Error"
colnames(error)[14] = "Mean + Std. Error"
error$K=1:K
error.m=melt(error,id='K')
# Plot mean error and +-1 standard deviation
error.m=error.m[error.m$variable %in% c('Mean Error','Mean - Std. Error','Mean + Std. Error'),]
ggplot(error.m, aes(1/K, value, color = variable, shape = variable)) + geom_point() + theme_minimal() +
scale_color_discrete(guide = guide_legend(title=NULL)) + scale_shape_discrete(guide = guide_legend(title=NULL)) +
labs(y='Value') + ggtitle('Mean Error, +- Standard Deviation vs 1/K')
mean.error=error.m[error.m$variable=='Mean Error',]
cat('Minimum Mean error occurs when K =',mean.error[which.min(mean.error$value),'K'])
cat('\nMinimum Standard deviation occurs when K =',which.min(error[,12]))
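As an optional final step (a sketch that goes beyond the original analysis): the K selected by cross-validation can be plugged back into the scratch kNN implementation to obtain a test-set MSE, reusing the test data frame loaded at the top.
# Hypothetical final evaluation: fit on the full training set with the CV-selected K
best.K = mean.error[which.min(mean.error$value), 'K']
final.mse = sum((knn(train$x1, train$y, test$x1, K = best.K) - test$y)^2) / length(test$x1)
cat('Test MSE with CV-selected K =', best.K, ':', final.mse)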