Week 6

Multiple Regression Model

Apply a multiple regression model when the data have more than one independent variable. Because the outcome (admit) is binary, the example below fits a multiple logistic regression.

mydata <- read.csv("https://stats.idre.ucla.edu/stats/data/binary.csv") # UCLA admissions data
mydata$rank <- factor(mydata$rank) # treat rank as a categorical predictor
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
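
The summary reports coefficients on the log-odds scale. As an optional follow-up (not part of the original exercise), a minimal sketch that converts them to odds ratios, which are easier to interpret:

exp(coef(mylogit)) # exponentiated coefficients are odds ratios
exp(confint.default(mylogit)) # approximate 95% confidence intervals on the same scale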

Week 7

Regression Model for Prediction

Apply regression model techniques to make predictions on the above dataset.

library(rio) # for reading the SAS data file
data <- import("https://stats.idre.ucla.edu/wp-content/uploads/2016/02/binary.sas7bdat")
str(data)
 
data$ADMIT <- as.factor(data$ADMIT) # outcome as a factor
data$RANK <- as.factor(data$RANK)   # rank as a categorical predictor
str(data)
 
summary(data)
 
plot(data$GPA, data$GRE, col="red") # scatterplot of GPA against GRE
cor(data$GRE, data$GPA)             # correlation between the two predictors
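
The steps above only import and explore the data; the prediction step itself is still missing. A minimal sketch of that step, assuming the same ADMIT, GRE, GPA and RANK columns (the new-applicant values are made up for illustration):

# Fit a logistic regression on the imported data
fit <- glm(ADMIT ~ GRE + GPA + RANK, data = data, family = "binomial")

# Predicted admission probability for a hypothetical new applicant
newdata <- data.frame(GRE = 600, GPA = 3.5,
                      RANK = factor(2, levels = levels(data$RANK)))
predict(fit, newdata, type = "response")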

Week 8

Classification Model

a) Install relevant packages for classification.

install.packages("rpart.plot")
install.packages("tree">install.packages("ISLR"))
install.packages("rattle")
library(tree)
library(ISLR)
library(rpart.plot)
library(rattle)

b) Choose a classifier for classification problems.

attach(Hitters) # the Hitters data ships with the ISLR package
View(Hitters)
 
# Remove NA data
Hitters<-na.omit(Hitters)
 
# log transform Salary to make it a bit more normally distributed
hist(Hitters$Salary)
 
Hitters$Salary <- log(Hitters$Salary)
hist(Hitters$Salary)

c) Evaluate the performance of the classifier.

tree.fit <- tree(Salary~Hits+Years, data=Hitters)
summary(tree.fit)
plot(tree.fit, type="uniform")   # uniform branch lengths
text(tree.fit, all=TRUE, cex=.8) # label all nodes
 
library(caret) # for createDataPartition
set.seed(1)    # make the train/test split reproducible
split <- createDataPartition(y=Hitters$Salary, p=0.5, list=FALSE)
train <- Hitters[split,]
test <- Hitters[-split,]
 
#Create tree model
trees <- tree(Salary~., train)
plot(trees)
text(trees, pretty=0)
 
# Cross-validate to choose a tree size
cv.trees <- cv.tree(trees)
plot(cv.trees)
# Prune to the size suggested by the CV plot (4 terminal nodes here)
prune.trees <- prune.tree(trees, best=4)
plot(prune.trees)
text(prune.trees, pretty=0)
 
yhat <- predict(prune.trees, test)
plot(yhat, test$Salary) # predicted vs. actual log salary
abline(0,1)             # points on this line are perfect predictions
 
mean((yhat - test$Salary)^2) # test-set mean squared error (log scale)
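
Because Salary was log-transformed in part (b), the error above is on the log scale. An optional back-transformation sketch to report the error in the original units (thousands of dollars in the ISLR Hitters data):

sqrt(mean((exp(yhat) - exp(test$Salary))^2)) # RMSE in original salary units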

Week 9

Clustering Model

a) Apply clustering algorithms for unsupervised classification.

library(cluster)
library(ggplot2) # for the cluster plot below
set.seed(20)
irisCluster <- kmeans(iris[, 3:4], 3, nstart = 20)
 
# nstart = 20 makes R try 20 random starting assignments and keep the one with the lowest within-cluster variation.
irisCluster
irisCluster$cluster <- as.factor(irisCluster$cluster)
ggplot(iris, aes(Petal.Length, Petal.Width, color = irisCluster$cluster)) + geom_point()
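
Since the true species labels are known for iris, an optional quick check of how well the clusters recover them:

table(irisCluster$cluster, iris$Species) # cluster assignments vs. known species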
 
d <- dist(as.matrix(mtcars)) # compute the distance matrix
hc <- hclust(d)              # apply hierarchical clustering
plot(hc)                     # plot the dendrogram
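
The dendrogram itself does not assign cluster labels. An optional follow-up sketch using cutree (the choice of k = 3 is arbitrary here):

groups <- cutree(hc, k = 3)            # cut the tree into 3 groups
table(groups)                          # cluster sizes
rect.hclust(hc, k = 3, border = "red") # outline the 3 groups on the dendrogram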

b) Plot the cluster data using R visualizations.

# Two well-separated point clouds in two dimensions
x <- rbind(cbind(rnorm(10,0,0.5), rnorm(10,0,0.5)),
           cbind(rnorm(15,5,0.5), rnorm(15,5,0.5)))
clusplot(pam(x, 2)) # cluster with PAM and plot
 
## add noise and try again:
x4 <- cbind(x, rnorm(25), rnorm(25))
clusplot(pam(x4, 2))
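
clusplot only shows the clusters visually. An optional sketch that scores the clustering numerically with silhouette widths (also from the cluster package):

sil <- silhouette(pam(x4, 2)) # silhouette width of each point
plot(sil)                     # silhouette plot
summary(sil)$avg.width        # average width; closer to 1 means better-separated clusters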