#### HW 6 solutions, Fall 2022

# Problem 1 (a):

bumpbird <- read.table("http://www.stat.sc.edu/~hitchcock/bumpusbird.txt", header=T)
names(bumpbird) <- c("ID", "tot.length", "alar.length", "beak.head.length",
                     "humerus.length", "keel.stern.length")
attach(bumpbird)

bumpbird.numeric <- bumpbird[,-1]
bumpbird.IDs <- bumpbird[,1]

# The first 21 birds in the data set survived; the remaining 28 died:
survival.indicator <- as.factor(c(rep("survived", times=21), rep("died", times=28)))

# Using the built-in lda function in the MASS package
# for linear discriminant analysis:
library(MASS)

# assuming equal prior probabilities of surviving or not:
dis <- lda(survival.indicator ~ tot.length + alar.length + beak.head.length +
             humerus.length + keel.stern.length,
           data=bumpbird, prior=c(0.5, 0.5))

# Let's predict whether a new bird with measurements
# tot.length=155, alar.length=238, beak.head.length=31.2,
# humerus.length=17.9, keel.stern.length=19.3
# will survive:

newobs <- rbind( c(155, 238, 31.2, 17.9, 19.3) )
dimnames(newobs) <- list(NULL, c("tot.length", "alar.length", "beak.head.length",
                                 "humerus.length", "keel.stern.length"))
newobs <- data.frame(newobs)

predict(dis, newdata=newobs)$class

# Posterior probabilities of this bird being in each group:
predict(dis, newdata=newobs)$posterior

## problem 1(b)

##### Misclassification rate of LDA rule:

# Simple plug-in misclassification rate:
group <- predict(dis, bumpbird, method='plug-in')$class
table(group, survival.indicator)
# The plug-in misclassification rate for LDA here is 17/49 = 0.347.

# Cross-validation (leave-one-out) rate of the LDA rule:
########
correct <- rep(0, times=nrow(bumpbird.numeric))
for (j in 1:nrow(bumpbird.numeric)) {
  # refit the rule without observation j, then classify observation j:
  mydis <- lda(grouping=survival.indicator[-j], x=bumpbird.numeric[-j, 1:5],
               prior=c(0.5, 0.5))
  mypred <- predict(mydis, newdata=bumpbird.numeric[j, 1:5])$class
  correct[j] <- (mypred == survival.indicator[j])
}
cv.misclass <- 1 - mean(correct)
cv.misclass
#########
# The CV misclassification rate is 0.551.

## problem 1(c)

# assuming the default prior probabilities (the observed group proportions):
dis2 <- lda(survival.indicator ~ tot.length + alar.length + beak.head.length +
              humerus.length + keel.stern.length, data=bumpbird)

# Again predicting whether a new bird with measurements
# tot.length=155, alar.length=238, beak.head.length=31.2,
# humerus.length=17.9, keel.stern.length=19.3
# will survive:

newobs <- rbind( c(155, 238, 31.2, 17.9, 19.3) )
dimnames(newobs) <- list(NULL, c("tot.length", "alar.length", "beak.head.length",
                                 "humerus.length", "keel.stern.length"))
newobs <- data.frame(newobs)

predict(dis2, newdata=newobs)$class

# Posterior probabilities of this bird being in each group:
predict(dis2, newdata=newobs)$posterior

## problem 1(d)

##### Misclassification rate of LDA rule:

# Simple plug-in misclassification rate:
group <- predict(dis2, bumpbird, method='plug-in')$class
table(group, survival.indicator)
# The plug-in misclassification rate for LDA here is 18/49 = 0.367.

# Cross-validation (leave-one-out) rate of the LDA rule:
########
correct <- rep(0, times=nrow(bumpbird.numeric))
for (j in 1:nrow(bumpbird.numeric)) {
  mydis <- lda(grouping=survival.indicator[-j], x=bumpbird.numeric[-j, 1:5])
  mypred <- predict(mydis, newdata=bumpbird.numeric[j, 1:5])$class
  correct[j] <- (mypred == survival.indicator[j])
}
cv.misclass <- 1 - mean(correct)
cv.misclass
#########
# The CV misclassification rate is 0.469.
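# Side note (not part of the original solution): MASS's lda can produce the
# leave-one-out classifications directly via CV=TRUE, which gives a quick check
# on the hand-rolled loops above.  A minimal sketch, using the equal-prior rule
# from part (a):
dis.cv <- lda(x=bumpbird.numeric, grouping=survival.indicator,
              prior=c(0.5, 0.5), CV=TRUE)
1 - mean(dis.cv$class == survival.indicator)  # should agree with the CV rate above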
###########################################################################################

### problem 2:

skulls <- read.table("http://people.stat.sc.edu/hitchcock/skullschap7.txt", header=T)
attach(skulls)

library(rpart)

# Classification tree with equal prior probabilities for the five epochs:
tree.sk.eq <- rpart(EPOCH ~ MB + BH + BL + NH, data=skulls, method="class",
                    parms=list(prior=c(1/5, 1/5, 1/5, 1/5, 1/5)))

# Plotting this tree:
plot(tree.sk.eq); text(tree.sk.eq)

group <- predict(tree.sk.eq, type="class")
tab.tree.eq <- table(group, EPOCH)
# The plug-in misclassification rate for the classification tree is:
1 - (sum(diag(tab.tree.eq)) / sum(tab.tree.eq))

# Let's predict the epoch of a new skull with
# MB = 133, BH = 130, BL = 95, NH = 50:
predict(tree.sk.eq, data.frame(MB=133, BH=130, BL=95, NH=50))

## Part (b)

# loading the 'randomForest' package:
library(randomForest)

## Specifying equal prior probabilities through the class weights:
skull.rf.eq <- randomForest(as.factor(EPOCH) ~ MB + BH + BL + NH, data=skulls,
                            classwt=c(0.2, 0.2, 0.2, 0.2, 0.2))
skull.rf.eq

# Let's predict the epoch of a new skull with
# MB = 133.0, BH = 130.0, BL = 95.0, NH = 50.0:

newobs <- rbind( c(133, 130, 95, 50) )
dimnames(newobs) <- list(NULL, c('MB', 'BH', 'BL', 'NH'))
newobs <- data.frame(newobs)

predict(skull.rf.eq, newdata=newobs, type='response')
predict(skull.rf.eq, newdata=newobs, type='prob')

# Seeing how important each explanatory variable was in the classification:
# (the higher the number, the more important)
skull.rf.eq$importance

########################################################################

### Problem 3

USairpol.full <- read.table("http://people.stat.sc.edu/hitchcock/usair.txt", header=T)

city.names <- as.character(USairpol.full[,1])
USairpol.data <- USairpol.full[,-1]
# Following the textbook's convention, Temp is negated (so it measures
# "negative temperature"):
USairpol.data$Temp <- (-USairpol.data$Temp)
attach(USairpol.data)

# load the 'rpart' package:
library(rpart)

# fitting a regression tree, with default settings:
usairpol.tree <- rpart(SO2 ~ Temp + Manuf + Pop + Wind + Precip + Days,
                       data=USairpol.data)

# basic summary output:
print(usairpol.tree)

# plotting the graph of the tree (spacing proportional to improvement in fit):
plot(usairpol.tree); text(usairpol.tree)

# Predicting SO2 for a new city; since Temp was negated above, a city with
# average temperature 60 enters as Temp = -60:
x.val <- c(Temp=-60, Manuf=390, Pop=500, Wind=8.5, Precip=45, Days=110)
predict(usairpol.tree, data.frame(t(x.val)))

## Random Forest:
library(randomForest)

## Regression with a random forest:
usairpol.rf <- randomForest(SO2 ~ Temp + Manuf + Pop + Wind + Precip + Days,
                            data=USairpol.data)

x.val <- c(Temp=-60, Manuf=390, Pop=500, Wind=8.5, Precip=45, Days=110)
predict(usairpol.rf, data.frame(t(x.val)))

# Seeing how important each explanatory variable was:
# (the higher the number, the more important)
usairpol.rf$importance

########################################################################

### Problem 4

sidsdata <- read.table("http://www.stat.sc.edu/~hitchcock/SIDSdata.txt", header=T)
attach(sidsdata)

# load the 'e1071' package (for svm and tune):
library(e1071)

X.train <- cbind(HR, BW, Factor68, Gesage)
y.train <- Group

# Creating the support vector classifier:
dat <- data.frame(x=X.train, y=as.factor(y.train))

# Tuning the cost and gamma parameters over a grid:
tune.out <- tune(svm, y ~ ., data=dat, kernel='radial', scale=TRUE, probability=TRUE,
                 ranges=list(cost=c(.001, .01, .1, 1, 5, 10, 100, 1000),
                             gamma=c(0.5, 1, 2, 3, 4, 5)))
summary(tune.out)
# cost=0.001 and gamma=5 work well.

svmfit <- svm(y ~ ., data=dat, kernel='radial', scale=TRUE, probability=TRUE,
              cost=.001, gamma=5)

# In the plot of the fitted classifier below, the points plotted as "x" are the
# support vectors; the other points are plotted as "o".
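# Note: with four predictors, plot.svm requires a formula choosing two plotting
# axes and a 'slice' fixing the remaining variables at constants.  A minimal
# sketch (the axis and slice choices here are illustrative, not prescribed by
# the assignment):
plot(svmfit, dat, x.HR ~ x.BW,
     slice=list(x.Factor68=0.3, x.Gesage=40))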
# Let's predict the group of a new infant with
# HR = 100, BW = 3000, Factor68 = 0.3, Gesage = 40:

newobs <- rbind( c(100, 3000, 0.3, 40) )
dimnames(newobs) <- list(NULL, c('HR', 'BW', 'Factor68', 'Gesage'))
testdat <- data.frame(x=newobs)

predict(svmfit, testdat, probability=TRUE)
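# Because the classifier was fit with probability=TRUE, the estimated class
# probabilities for the new observation are attached to the prediction object
# as an attribute.  A small sketch of extracting them (pred.new is a name
# introduced here, not part of the original solution):
pred.new <- predict(svmfit, testdat, probability=TRUE)
attr(pred.new, "probabilities")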