######################
######################
#
# Binary response:
#
######################
######################
######################

## This code is adapted from the example at
## https://brunaw.com/phd/mcmc/report.pdf

library(mvtnorm)
# May need to install the mvtnorm package first?
# If so, type at the command line: install.packages("mvtnorm", dependencies=T)
# while plugged in to the internet.

# Define Y = 0 if individual has no senility
# Define Y = 1 if individual has senility present
# x = score on subset of Wechsler Adult Intelligence Scale (WAIS)

y <- c(rep(1, times=14), rep(0, times=40))
x <- c(9,13,6,8,10,4,14,8,11,7,9,7,5,14,13,16,10,12,11,14,15,18,7,16,9,9,11,
       13,15,13,10,11,6,17,14,19,9,11,14,10,16,10,16,14,13,13,9,15,10,11,12,4,14,20)

logreg.out <- glm(y ~ x, family=binomial(logit))
summary(logreg.out)

#X <- cbind(x)
x1 <- x
X <- cbind(rep(1, times=length(x1)), x1)

# This code just below includes some subjective prior information about beta_1:
#beta.prior.mean <- c(0,-0.35)          # reflects no prior belief about beta_0, but good prior belief about beta_1
#beta.prior.cov <- diag(c(100,0.175^2))

# This code here shows a VERY weakly informative, basically noninformative, prior:
beta.prior.mean <- c(0,0)
beta.prior.cov <- diag(c(100,100))

k <- 1  # Can adjust this k up or down if the acceptance rate is too high or low...
# Proposal covariance: a rough approximation to the inverse Fisher information,
# using the fitted probabilities from the ML fit as weights.
# (The exact Fisher information for logistic regression would use weights p*(1-p).)
proposal.cov.matrix <- k*solve(t(X)%*%diag(fitted(logreg.out))%*%X)
#proposal.cov.matrix <- k*diag(p)

MCMCBetasI <- beta.prior.mean
V <- proposal.cov.matrix
mu <- beta.prior.mean
Sigma_inv <- solve(beta.prior.cov)
j <- 0
BurnIn <- 1000
TotIter <- 50000  # Make this bigger, especially if you need to do more thinning...
SaveResults <- matrix(0, nrow=TotIter, ncol=length(beta.prior.mean))

for (i in 1:TotIter){
  # Proposal distribution for beta
  MCMCBetasC <- rmvnorm(1, MCMCBetasI, V)
  MCMCBetasC <- as.vector(MCMCBetasC)
  # Metropolis ratio (log scale): Bernoulli log-likelihood plus normal log-prior
  # at the candidate, minus the same at the current value.
  # (Note: the tuning constant k scales only the proposal covariance above;
  # it does not belong in the likelihood.)
  ratio <- ((-sum(log(1 + exp(X%*%MCMCBetasC))) + sum(y * X%*%MCMCBetasC)
             - (1/2) * t(MCMCBetasC - mu)%*%Sigma_inv%*%(MCMCBetasC - mu))
            - (-sum(log(1 + exp(X%*%MCMCBetasI))) + sum(y * X%*%MCMCBetasI)
               - (1/2) * t(MCMCBetasI - mu)%*%Sigma_inv%*%(MCMCBetasI - mu)))
  # Accept/reject step
  if(runif(1) < min(1, exp(ratio))) {
    MCMCBetasI <- MCMCBetasC
    j <- j + 1
  }
  # Saving results
  SaveResults[i,] <- MCMCBetasI
}

# Acceptance rate:
acs <- j/TotIter
acs

## Thinning (every 5th value):
S <- TotIter
beta.values.thin <- SaveResults[5*(1:(S/5)),]

## Burn-in (note: removing the first 1000 THINNED draws discards the first
## 5000 iterations of the original chain):
beta.values.thin.b <- beta.values.thin[-(1:BurnIn),]

par(mfrow=c(2,2))  # 2 by 2 plotting window
acf(beta.values.thin.b[,1])
acf(beta.values.thin.b[,2])
plot(beta.values.thin.b[,1], type='l')
plot(beta.values.thin.b[,2], type='l')
par(mfrow=c(1,1))

### Posterior summary:

post.medians <- apply(beta.values.thin.b, 2, median)  # Posterior medians for each regression coefficient
post.lower <- apply(beta.values.thin.b, 2, quantile, probs=0.025)
post.upper <- apply(beta.values.thin.b, 2, quantile, probs=0.975)
# approximate 0.025 and 0.975 quantiles for each regression coefficient
# to get approximate 95% quantile-based intervals

#library(TeachingDemos)
#t(apply(beta.values.thin.b, 2, emp.hpd, conf=0.95))  # approximate 95% HPD intervals

#names.predictors <- c("x1", "x2", "x3")
names.predictors <- c("x1")  # just one predictor in this example...
beta.post.summary <- data.frame(cbind(post.lower, post.medians, post.upper),
                                row.names=c("intercept", names.predictors))
names(beta.post.summary) <- c('0.025 Quantile', '0.5 Quantile', '0.975 Quantile')
print(beta.post.summary)
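## A small add-on sketch (not in the original example): using the saved
## posterior draws to estimate P(senility) for a person with a given WAIS score.
## The score of 10 below is a hypothetical value chosen for illustration;
## plogis() is the base-R inverse logit.

wais.score <- 10
# Inverse logit of beta_0 + beta_1*score for each post-burn-in draw:
p.draws <- plogis(beta.values.thin.b[,1] + beta.values.thin.b[,2]*wais.score)
# Posterior median and approximate 95% quantile-based interval for this probability:
quantile(p.draws, probs=c(0.025, 0.5, 0.975))
hist(p.draws, main="Posterior of P(senility | WAIS = 10)", xlab="probability")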
### Logistic Regression using rstanarm

# Load packages
library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)

# Create a data frame:
wais_data <- data.frame(y, x)

# Simulate the posterior (set prior_PD = TRUE instead to simulate from the prior):
wais_model <- stan_glm(y ~ x,
                       data = wais_data, family = binomial,
                       prior_intercept = normal(-0.5, 0.45),
                       prior = normal(-0.35, 0.175),
                       chains = 4, iter = 5000*2, prior_PD = F)

# prior intercept: We believe that a "typical" subject might have probability 0.2 to 0.6 of senility.
# So the log odds of senility for such a person should be between log(0.2/0.8) = -1.4 and log(0.6/0.4) = 0.4.
# So set the prior mean for the CENTERED beta_0 (different from the beta_0 in the model)
# to be halfway between those, at -0.5.
# Set the prior sd to be half of the distance between -0.5 and 0.4, so prior sd = 0.45.

# prior slope: We believe that for a one-unit increase in WAIS score, the odds of senility
# might be anywhere from half as large to the same, i.e., an odds ratio between 0.5 and 1.
# So beta_1 might be between log(0.5) = -0.69 and log(1) = 0.
# So set the prior mean on beta_1 to be -0.35 and the prior sd around 0.175.

mcmc_trace(wais_model)
mcmc_dens_overlay(wais_model)
mcmc_acf(wais_model)
# A little autocorrelation, but otherwise OK.

## Plotting posterior plausible models:
wais_data %>%
  add_fitted_draws(wais_model, n = 100) %>%
  ggplot(aes(x = x, y = y)) +
  geom_line(aes(y = .value, group = .draw), alpha = 0.15) +
  labs(y = "probability of senility")

# Summary of point estimates and interval estimates for the betas:
tidy(wais_model, conf.int = TRUE, conf.level = 0.95)

# How do these compare to the frequentist MLEs?
logreg.out <- glm(y ~ x, family=binomial(logit))
summary(logreg.out)

# Posterior predictions of binary outcome (senility present or not?)
# for a person with WAIS score 10:
binary_prediction <- posterior_predict(wais_model, newdata = data.frame(x=10))

# Histogram to show posterior predictive distribution for this person:
mcmc_hist(binary_prediction) + labs(x = "Y")

# Summarize the posterior predictions of Y with a table:
table(binary_prediction)

# Posterior predictive mean (this is the posterior predictive probability
# that such a person would be senile):
colMeans(binary_prediction)

## Checking the posterior predictive distribution:
proportion_senile <- function(x){mean(x == 1)}
pp_check(wais_model, nreps = 100,
         plotfun = "stat", stat = "proportion_senile") +
  xlab("probability of senility")
# The observed value is near the middle of the predictive distribution -- that's good.

## Getting the confusion matrix to check predictive accuracy:
classification_summary(model = wais_model, data = wais_data, cutoff = 0.5)
# Could try with a different cutoff as well...

# Checking classification accuracy on "out-of-sample" data using cross-validation:
cv_accuracy_1 <- classification_summary_cv(
  model = wais_model, data = wais_data, cutoff = 0.5, k = 10)
cv_accuracy_1$cv
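## An optional sketch (not in the original script): an actual prior simulation,
## obtained by refitting with prior_PD = TRUE and reusing the plotting code from
## above. The model name wais_model_prior is made up for this illustration.
## This checks whether the priors generate reasonable senility-vs-WAIS curves
## before the data are brought in.

wais_model_prior <- stan_glm(y ~ x,
                             data = wais_data, family = binomial,
                             prior_intercept = normal(-0.5, 0.45),
                             prior = normal(-0.35, 0.175),
                             chains = 4, iter = 5000*2, prior_PD = TRUE)

# 100 prior plausible models:
wais_data %>%
  add_fitted_draws(wais_model_prior, n = 100) %>%
  ggplot(aes(x = x, y = y)) +
  geom_line(aes(y = .value, group = .draw), alpha = 0.15) +
  labs(y = "prior probability of senility")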
###
### Logistic Regression with Multiple Predictors
###

# Rain example from the book:

# Load and process the data
data(weather_perth)
weather <- weather_perth %>%
  select(day_of_year, raintomorrow, humidity9am, humidity3pm, raintoday)

# Prior belief:
# On a "typical" day, there is a 0.2 chance of rain.
# So the prior mean on the CENTERED beta_0 is log(0.2/(1-0.2)) = -1.4.
# The prior sd on the CENTERED beta_0 of 0.7 implies
# a 95% chance the log-odds of rain on a typical day is between -2.8 and 0.
# This converts to odds of between 0.06 and 1,
# or a probability of between 0.057 and 0.50.

rain_model_2 <- stan_glm(
  raintomorrow ~ humidity9am + humidity3pm + raintoday,
  data = weather, family = binomial,
  prior_intercept = normal(-1.4, 0.7),
  prior = normal(0, 2.5, autoscale = TRUE),
  chains = 4, iter = 5000*2)

tidy(rain_model_2, effects = "fixed", conf.int = TRUE, conf.level = 0.95)

### Model comparison:

# Let's compare this model to a model that only contains one predictor:
rain_model_1 <- stan_glm(raintomorrow ~ humidity9am,
                         data = weather, family = binomial,
                         prior_intercept = normal(-1.4, 0.7),
                         prior = normal(0.07, 0.035),
                         chains = 4, iter = 5000*2, prior_PD = FALSE)

### Comparing classification accuracy:
## The book notes that c = 0.2 is a reasonable cutoff value here,
## but you could try other cutoff values as desired...
cv_accuracy_1 <- classification_summary_cv(
  model = rain_model_1, data = weather, cutoff = 0.2, k = 10)
cv_accuracy_2 <- classification_summary_cv(
  model = rain_model_2, data = weather, cutoff = 0.2, k = 10)

# CV for the models
cv_accuracy_1$cv
cv_accuracy_2$cv

# One approach to model selection:
# Calculate ELPD for the models
loo_1 <- loo(rain_model_1)
loo_2 <- loo(rain_model_2)
loo_1$estimates
loo_2$estimates

# Could use BIC to compare models (not involving any prior information):
rain_model_1_glm <- glm(raintomorrow ~ humidity9am,
                        data = weather, family = binomial(logit))
rain_model_2_glm <- glm(raintomorrow ~ humidity9am + humidity3pm + raintoday,
                        data = weather, family = binomial(logit))
BIC(rain_model_1_glm)
BIC(rain_model_2_glm)

# Try a model with only humidity3pm & raintoday as predictors:
rain_model_simp <- stan_glm(
  raintomorrow ~ humidity3pm + raintoday,
  data = weather, family = binomial,
  prior_intercept = normal(-1.4, 0.7),
  prior = normal(0, 2.5, autoscale = TRUE),
  chains = 1, iter = 5000*2)

cv_accuracy_simp <- classification_summary_cv(
  model = rain_model_simp, data = weather, cutoff = 0.2, k = 10)
cv_accuracy_simp$cv

loo_simp <- loo(rain_model_simp)
loo_simp$estimates

rain_model_simp_glm <- glm(raintomorrow ~ humidity3pm + raintoday,
                           data = weather, family = binomial(logit))
BIC(rain_model_simp_glm)
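## A quick add-on sketch (not in the original script): loo_compare() from the
## loo package (available once rstanarm is loaded) ranks the fitted models by
## estimated ELPD, with the best model on top and standard errors for the
## pairwise differences.

loo_compare(loo_1, loo_2, loo_simp)
# An elpd difference that is small relative to its standard error suggests
# the models have similar expected out-of-sample predictive performance.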
### Coding the multiple logistic regression model using base R: ###

library(mvtnorm)
# May need to install the mvtnorm package first?
# If so, type at the command line: install.packages("mvtnorm", dependencies=T)
# while plugged in to the internet.

y  <- weather$raintomorrow
x1 <- weather$humidity9am
x2 <- weather$humidity3pm
x3 <- weather$raintoday

# make y and x3 numeric for the calculations below:
y <- as.numeric(y)        # converts the factor to 1's and 2's
y <- y - 1                # converts to 0's and 1's
x3 <- as.numeric(x3) - 1  # same 0/1 conversion (matching glm's dummy coding)

X <- cbind(rep(1, times=length(x1)), x1, x2, x3)

beta.prior.mean <- c(0, -0.35, -0.2, 0.5)
# reflects no prior belief about beta_0, but some prior belief about beta_1, beta_2, beta_3
beta.prior.cov <- diag(c(100, 40, 40, 40))

k <- 1  # Can adjust this k up or down if the acceptance rate is too high or low...

logreg.out <- glm(y ~ x1+x2+x3, family=binomial(logit))
summary(logreg.out)

# Proposal covariance: same rough approximation as in the single-predictor example.
proposal.cov.matrix <- k*solve(t(X)%*%diag(fitted(logreg.out))%*%X)
#proposal.cov.matrix <- k*diag(p)

MCMCBetasI <- beta.prior.mean
V <- proposal.cov.matrix
mu <- beta.prior.mean
Sigma_inv <- solve(beta.prior.cov)
j <- 0
BurnIn <- 1000
TotIter <- 50000  # Make this bigger, especially if you need to do more thinning...
SaveResults <- matrix(0, nrow=TotIter, ncol=length(beta.prior.mean))

for (i in 1:TotIter){
  # Proposal distribution for beta
  MCMCBetasC <- rmvnorm(1, MCMCBetasI, V)
  MCMCBetasC <- as.vector(MCMCBetasC)
  # Metropolis ratio (log scale), as in the single-predictor example:
  ratio <- ((-sum(log(1 + exp(X%*%MCMCBetasC))) + sum(y * X%*%MCMCBetasC)
             - (1/2) * t(MCMCBetasC - mu)%*%Sigma_inv%*%(MCMCBetasC - mu))
            - (-sum(log(1 + exp(X%*%MCMCBetasI))) + sum(y * X%*%MCMCBetasI)
               - (1/2) * t(MCMCBetasI - mu)%*%Sigma_inv%*%(MCMCBetasI - mu)))
  # Accept/reject step
  if(runif(1) < min(1, exp(ratio))) {
    MCMCBetasI <- MCMCBetasC
    j <- j + 1
  }
  # Saving results
  SaveResults[i,] <- MCMCBetasI
}

# Acceptance rate:
acs <- j/TotIter
acs

## Thinning (every 5th value):
S <- TotIter
beta.values.thin <- SaveResults[5*(1:(S/5)),]

## Burn-in (again, 1000 thinned draws = the first 5000 original iterations):
beta.values.thin.b <- beta.values.thin[-(1:BurnIn),]

par(mfrow=c(2,2))
acf(beta.values.thin.b[,1])
acf(beta.values.thin.b[,2])
acf(beta.values.thin.b[,3])
acf(beta.values.thin.b[,4])
par(mfrow=c(1,1))

dev.new()
par(mfrow=c(2,2))
plot(beta.values.thin.b[,1], type='l')
plot(beta.values.thin.b[,2], type='l')
plot(beta.values.thin.b[,3], type='l')
plot(beta.values.thin.b[,4], type='l')
par(mfrow=c(1,1))

### Posterior summary:

post.medians <- apply(beta.values.thin.b, 2, median)  # Posterior medians for each regression coefficient
post.lower <- apply(beta.values.thin.b, 2, quantile, probs=0.025)
post.upper <- apply(beta.values.thin.b, 2, quantile, probs=0.975)
# approximate 0.025 and 0.975 quantiles for each regression coefficient
# to get approximate 95% quantile-based intervals

#library(TeachingDemos)
#t(apply(beta.values.thin.b, 2, emp.hpd, conf=0.95))  # approximate 95% HPD intervals

names.predictors <- c("x1", "x2", "x3")
beta.post.summary <- data.frame(cbind(post.lower, post.medians, post.upper),
                                row.names=c("intercept", names.predictors))
names(beta.post.summary) <- c('0.025 Quantile', '0.5 Quantile', '0.975 Quantile')
print(beta.post.summary)
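## An optional diagnostic sketch (not in the original script): effective sample
## sizes for the thinned, post-burn-in draws, using the coda package (assumed
## installed: install.packages("coda")).

library(coda)
effectiveSize(as.mcmc(beta.values.thin.b))
# Effective sizes much smaller than the number of saved draws would suggest
# more thinning or a longer run (increase TotIter).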