###############################
# Bayesian Linear Regression  #
###############################

###################################################
### Example 1, Chapter 4 notes (Oxygen uptake data)
###################################################

# data from Hoff (2009)

y <- c(-0.87,-10.74,-3.27,-1.97,7.50,-7.25,17.05,4.96,10.40,11.05,0.26,2.51)
# y = change in maximal oxygen uptake (positive number = improvement)
x1 <- c(0,0,0,0,0,0,1,1,1,1,1,1)              # 0 = running group, 1 = aerobics group
x2 <- c(23,22,22,25,27,20,31,23,27,28,22,24)  # ages of subjects
x3 <- x1*x2                                   # cross-product term (interaction b/w group and age)

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )


# Gill's function to produce posterior medians
# and quantile based credible intervals for the
# regression parameters (based on multivariate-t posterior)

t.ci.table <- function(coefs, cov.mat, level=0.95, degrees=Inf, quantiles=c(0.025, 0.50, 0.975) ) {
 quantile.mat <- cbind(coefs, sqrt(diag(cov.mat)), t(qt(quantiles, degrees) %o% sqrt(diag(cov.mat))) + 
                 matrix(rep(coefs,length(quantiles)), ncol=length(quantiles)) )
 quantile.names <- c("mean", "Std. Error") 
 for (i in 1:length(quantiles)) {
  quantile.names <- c(quantile.names, paste(quantiles[i], "Quantile"))
 }
 dimnames(quantile.mat)[2] <- list(quantile.names)
 return(round(quantile.mat,4))
}


### Noninformative Prior Analysis:

bhat <- solve(t(X) %*% X) %*% t(X) %*% y
sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X))
my.cov.mat <- solve(t(X) %*% X) * ((nrow(X)-ncol(X))*sig2hat / (nrow(X) - ncol(X) - 2))[1,1]

# Getting posterior information about the betas:
table.beta.info <- t.ci.table(bhat, my.cov.mat, degrees = nrow(X) - ncol(X))
table.beta.info

# Getting posterior information about sigma^2:

library(TeachingDemos) # loading TeachingDemos package, to use hpd function
library(pscl)  # loading pscl package, to use inverse gamma distribution

my.alpha <- (nrow(X) - ncol(X) - 1)/2
my.beta <- 0.5 * sig2hat * (nrow(X) - ncol(X))

# Point estimate (posterior median here) for sigma^2:

sig.sq.poster.med <- qigamma(0.50, alpha=my.alpha, beta=my.beta)
round(sig.sq.poster.med, 3)

hpd.sig.sq <- hpd(qigamma, alpha=my.alpha, beta=my.beta)
round(hpd.sig.sq, 3)


###################################################
### Example 2, Chapter 4 notes (Automobile data)
###################################################

autodata <- read.table("http://www.stat.sc.edu/~hitchcock/autoregresslarge.txt", header=T)

y <- autodata$mpg

x1 <- autodata$displacement 
x2 <- autodata$horsepower 
x3 <- autodata$weight

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )

###### Setting up prior specification :

# 3 predictor variables, so we need exactly 3+1=4 hypothetical "prior observations"

# Suppose that based on "expert opinion" we have the following guesses:

xtilde.obs.1 <- 
c(150, 100, 2000)
ytilde.obs.1 <- 25  
# Prior belief:  a car with displacement=150, horsepower=100, weight=2000 should have mpg around 25

xtilde.obs.2 <- 
c(200, 160, 4500)
ytilde.obs.2 <- 10  
# Prior belief:  a car with displacement=200, horsepower=160, weight=4500 should have mpg around 10

xtilde.obs.3 <- 
c(250, 140, 3000)
ytilde.obs.3 <- 20
# Prior belief:  a car with displacement=250, horsepower=140, weight=3000 should have mpg around 20

xtilde.obs.4 <- 
c(100, 80, 1800)
ytilde.obs.4 <- 35
# Prior belief:  a car with displacement=100, horsepower=80, weight=1800 should have mpg around 35

# Making the matrix X~ :
prior.obs.stacked <- rbind(xtilde.obs.1, xtilde.obs.2, xtilde.obs.3, xtilde.obs.4)
xtilde <- cbind(rep(1, times=nrow(prior.obs.stacked) ), prior.obs.stacked)

# Making the vector Y~ :
ytilde <- c(ytilde.obs.1, ytilde.obs.2, ytilde.obs.3, ytilde.obs.4)

# Diagonal matrix D contains weights that indicate how much worth we place on 
# our hypothetical prior observations (note the weights could vary if we are more
# confident about some hypothetical prior observations than about others):

D <- diag(c(4, 4, 4, 4))

# This D yields a D^{-1} that is diag(c(1/4, 1/4, 1/4, 1/4)), which
# assumes that our four hypothetical prior observations TOGETHER are
# "worth" about one actual sample observation.

# The prior mean on the beta vector that we are inducing is:

prior.mean.beta <- as.vector(solve(xtilde) %*% ytilde)
print("Prior mean for beta vector was:"); print(round(prior.mean.beta,3))


### Choosing prior parameters a and b for gamma prior on tau:

# The parameter "a" reflects our confidence in our prior:
# Let's choose "a" to be 0.5, which implies that our 
# prior knowledge about tau is "worth" about 1 sample observation.

a <- 0.5

# Consider prior observation 1:
# Prior belief:  a car with displacement=150, horsepower=100, weight=2000 should have mpg around 25
# Suppose the expert believes the largest realistic mpg for this type of car is 35

# Then set the prior guess for sigma to be (35 - 25)/1.645 = 6.08
# So the prior guess for sigma^2 is (6.08^2) = 36.97
# So the prior guess for tau is 1/36.97 = 0.027

# Since the gamma prior mean is a/b, let
# b = a / tau.guess:

tau.guess <- 0.027
b <- a / tau.guess 

# Here b is around 18.5 ...

##### Posterior information for tau and beta:

# Calculate beta-hat :

betahat <- as.vector(solve(t(X) %*% X + t(xtilde)%*%solve(D)%*%xtilde) %*% (t(X)%*%y + t(xtilde)%*%solve(D)%*%ytilde))


# Calculate s* :

n <- length(y)

sstar <- as.numeric( (t(y-X%*%betahat)%*%(y-X%*%betahat) + t(ytilde-xtilde%*%betahat)%*%solve(D)%*%(ytilde-xtilde%*%betahat) + 2*b)/(n+2*a) )


### Point estimates for tau (and thus for sigma^2):

p.mean.tau <- 1 / sstar
p.mean.sig.sq <- 1 / p.mean.tau

p.median.tau <- qgamma(0.50, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar)
p.median.sig.sq <- 1 / p.median.tau

print(paste("posterior.mean for sigma^2=", round(p.mean.sig.sq,3), 
      "posterior.median for sigma^2=", round(p.median.sig.sq,3) ))

### Marginal Interval estimate for tau (and sigma^2):

library(TeachingDemos) # loading TeachingDemos package, to use hpd function

hpd.95.tau <- hpd(qgamma, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar )

hpd.95.sig.sq <- 1 / hpd.95.tau

print("95% posterior interval for sigma^2:")

round(sort(hpd.95.sig.sq), 3)

### NOT QUITE THE BEST APPROACH:

### Point estimates for beta, given the pt. estimate for tau:

# Posterior mean, median and 95% intervals for the betas:

table.beta.info <- t.ci.table(betahat, (1/p.median.tau)*solve(t(X)%*%X + t(xtilde)%*%solve(D)%*%xtilde) )
table.beta.info

### A BETTER APPROACH TO GET POINT ESTIMATES FOR BETA:

# Randomly generate many tau values from its posterior distribution:

how.many <-30000

tau.values <- rgamma(n=how.many, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar )

library(mvtnorm)

beta.values<- matrix(0,nr = how.many, nc=length(betahat))
for (j in 1:how.many){
beta.values[j,] <- rmvnorm(n=1, mean=betahat, sigma= (1/tau.values[j])*solve(t(X)%*%X + t(xtilde)%*%solve(D)%*%xtilde) )
}

# Posterior median for each regression coefficient:

post.medians<-apply(beta.values,2,median)

# 95% posterior interval for each regression coefficient:

post.lower <- apply(beta.values,2,quantile, probs=0.025)
post.upper <- apply(beta.values,2,quantile, probs=0.975)

## Summarizing:

names.predictors <- c("x1", "x2", "x3")
beta.post.summary <- data.frame(cbind(post.lower, post.medians, post.upper), row.names=c("intercept", names.predictors))
names(beta.post.summary) <- c('0.025 Quantile', '0.5 Quantile', '0.975 Quantile')
print(beta.post.summary)

# On a similar but smaller (and older) data set that is built into R:
y <- mtcars$mpg
x1 <- mtcars$disp
x2 <- mtcars$hp
x3 <- mtcars$wt

# Redo analysis with both types of prior...

###################################################
### Bayesian Regression with rstanarm
###################################################

library(rstanarm)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(broom)
library(broom.mixed)
library(tidybayes)

## Note the first part in the prior_intercept argument (22 here)
## is the prior expected response (mpg here) for a "typical" observation.

car_model <- stan_glm(mpg ~ displacement+horsepower+weight, data = autodata,
                       family = gaussian,
                       prior_intercept = normal(22, 20),
                       prior = normal(c(0.325, -1.563, 0.025), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

# MCMC diagnostics
mcmc_trace(car_model, size = 0.1)
mcmc_dens_overlay(car_model)
mcmc_acf(car_model)
neff_ratio(car_model)
rhat(car_model)

#Try using the 'tidy' function in the 'broom.mixed' package:
library(broom.mixed)
broom.mixed::tidy(car_model, effects = c("fixed", "aux"),
     conf.int = TRUE, conf.level = 0.90)

# Alternatively, could use 'summary' function (change amount of rounding with the 'digits' argument):

summary(car_model,digits=4,probs = c(0.025, 0.5, 0.975))

## Compare to the classical regression approach:

summary(lm(mpg ~ displacement+horsepower+weight,data=autodata))

# Posterior prediction:

Xstar <- cbind(1, 150, 100, 2.7)

# Simulate a set of predictions
#set.seed(84735)
shortcut_prediction <- 
  posterior_predict(car_model, newdata = data.frame(displacement=150,horsepower=100,weight=2.7))

# Point prediction (median of predictive distribution)
median(shortcut_prediction)

# Construct a 95% posterior credible interval
posterior_interval(shortcut_prediction, prob = 0.95)


# Plot the approximate predictive model
mcmc_dens(shortcut_prediction) + 
  xlab("predicted MPG for a car with disp=150,HP=100,wt=2.7")

###################################################
### A Bayesian Approach to Model Selection
###################################################

# Function (from Hoff 2009) to calculate a marginal probability for y:

lpy.X <- function(y,X,g=length(y),nu0=1,s20=try(summary(lm(y~-1+X))$sigma^2,silent=TRUE)) {
 n<-dim(X)[1]; p<-dim(X)[2]
 if(p==0) {Hg<-0; s20<-mean(y^2)}
 if(p>0) {Hg<-(g/(g+1))*X%*%solve(t(X)%*%X)%*%t(X)}
 SSRg<- t(y)%*%( diag(1,nrow=n) - Hg)%*%y

 -.5*(n*log(pi)+p*log(1+g)+(nu0+n)*log(nu0*s20+SSRg)-nu0*log(nu0*s20))+lgamma((nu0+n)/2)-lgamma(nu0/2)
}

## The oxygen uptake data:

y <- c(-0.87,-10.74,-3.27,-1.97,7.50,-7.25,17.05,4.96,10.40,11.05,0.26,2.51)
# y = change in maximal oxygen uptake (positive number = improvement)
x1 <- c(0,0,0,0,0,0,1,1,1,1,1,1)              # 0 = running group, 1 = aerobics group
x2 <- c(23,22,22,25,27,20,31,23,27,28,22,24)  # ages of subjects
x3 <- x1*x2                                   # cross-product term (interaction b/w group and age)
X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )

### Starting values for Gibbs Sampler:
z<-rep(1,dim(X)[2])  # starting with z = all 1's (all terms in model)
lpy.c<-lpy.X(y,X[,z==1,drop=FALSE])
S <- 10000  # number of Monte Carlo iterations
Z<-matrix(NA,S,dim(X)[2])

### The Gibbs Sampler:
for(s in 1:S)
{
 for(j in sample(1:dim(X)[2]))
 {
 zp<-z; zp[j] <- 1-zp[j]
 lpy.p<-lpy.X(y,X[,zp==1,drop=FALSE])
 r<- (lpy.p - lpy.c)*(-1)^(zp[j]==0)
 z[j]<-rbinom(1,1,1/(1+exp(-r)))
 if(z[j]==zp[j]) {lpy.c<-lpy.p}
 }
Z[s,]<-z
}
#########

# Considering all possible subsets:

poss.z.vectors <-  unique(Z,MARGIN=1)
z.probs <- rep(0, times= nrow(poss.z.vectors))

for(i in 1:nrow(poss.z.vectors)) {
z.probs[i] <- sum(apply(Z,1,identical, y=poss.z.vectors[i,]))
}
z.probs <- z.probs/sum(z.probs)

cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),]

# Considering only certain z vectors (interaction only appearing when both first-order terms appear):
#
poss.z.vectors <- matrix( c(
1,0,0,0,
1,1,0,0,
1,0,1,0,
1,1,1,0,
1,1,1,1
), ncol=4, byrow=T)

z.probs <- rep(0, times= nrow(poss.z.vectors))

for(i in 1:nrow(poss.z.vectors)) {
z.probs[i] <- sum(apply(Z,1,identical, y=poss.z.vectors[i,]))
}
z.probs <- z.probs/sum(z.probs)

cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),]
#

## The small automobile data set:

y <- mtcars$mpg
x1 <- mtcars$disp
x2 <- mtcars$hp
x3 <- mtcars$wt

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )

# Rerun analysis, considering all possible subsets ...


###################################################
### Posterior Predictive Distribution with Regression
###################################################

##### Regression example:

# On the small automobile data set that is built into R:

y <- mtcars$mpg
x1 <- mtcars$disp
x2 <- mtcars$hp
x3 <- mtcars$wt

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )

library(mvtnorm)  # to get "rmvt" function to sample from multivariate t distribution

bhat <- solve(t(X) %*% X) %*% t(X) %*% y
sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X))

# Letting X* just equal the observed X:

Xstar <- X

my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / ((nrow(X) - ncol(X) - 2)) )* 
(diag(rep(1,times=nrow(Xstar))) + Xstar %*% solve( (t(X) %*% X) ) %*% t(Xstar) )

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values:
pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), 
                         ncol=ncol(pred.post.err), byrow=T) + pred.post.err

matrix.of.ys <- matrix(y, nrow=nrow(pred.post.samp), ncol=ncol(pred.post.samp), byrow=T)

# Plotting original Y-values against samples from posterior predictive distribution of response values:
matplot(matrix.of.ys, pred.post.samp, pch='o', cex=0.3)
abline(0,1)

# Examining model lack-of-fit:

cbind(mtcars[,1:2], (y - Xstar %*% bhat) )[order(y),]

### Predicting the mileage for another car having dispersion 150, horsepower 100, and weight 2.7:

Xstar <- cbind(1, 150, 100, 2.7)

my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / ((nrow(X) - ncol(X) - 2)) )* 
(diag(rep(1,times=nrow(Xstar))) + Xstar %*% solve( (t(X) %*% X) ) %*% t(Xstar) )

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values:
pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), 
                         ncol=ncol(pred.post.err), byrow=T) + pred.post.err

# Getting 95% prediction interval, and point prediction (via median of posterior predictive distribution):

quantile(pred.post.samp, probs=c(0.025, 0.5, 0.975))

### Using the functions in 'bayesrules' package to examine posterior predictive distribution:
### This is on the smaller 'mtcars' data set:

library(bayesrules)
data(mtcars)

car_model_mtcars <- stan_glm(mpg ~ disp+hp+wt, data = mtcars,
                       family = gaussian,
                       prior_intercept = normal(22, 20),
                       prior = normal(c(0.325, -1.563, 0.025), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

# Simulate a set of predictions
#set.seed(84735)
predictions <- 
  posterior_predict(car_model_mtcars, newdata = mtcars)

dim(predictions)

ppc_intervals(mtcars$mpg, yrep = predictions, x = mtcars$weight, 
              prob = 0.5, prob_outer = 0.95)

# Do the observed y-values in the data set (dark blue circles) 
# mostly fall within the prediction intervals?

prediction_summary(car_model_mtcars, data = mtcars)

# median absolute error (MAE):
# measures the typical difference between the observed responses and their posterior predictive means

# scaled median absolute error:
# measures the typical number of std deviations that the observed responses fall from their posterior predictive mean

# within_50 statistic
# measures the proportion of observed response values that fall within their 50% posterior prediction interval.

# within_95 statistic
# measures the proportion of observed response values that fall within their 95% posterior prediction interval.


# Cross-validation to measure predictive accuracy:
# This splits the data set into 10 parts.
# Uses a model built with nine-tenths of the data to predict the response for the other tenth of the individuals.
# Does this for all 10 subsets being left out in turn, predicting each left-out tenth, averaging the results.

#set.seed(84735)
cv_procedure <- prediction_summary_cv(
  model = car_model_mtcars, data = mtcars, k = 10)

# Results for each subset left out:
cv_procedure$folds

# Results, averaging across the 10 subsets:
cv_procedure$cv

model_elpd <- loo(car_model_mtcars)
model_elpd$estimates

# ELPD as a way to compare different possible models:

car_model_mtcars_1 <- stan_glm(mpg ~ disp+hp, data = mtcars,
                       family = gaussian,
                       prior_intercept = normal(82.5, 20),
                       prior = normal(c(0.325, -1.563), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

model_elpd_1 <- loo(car_model_mtcars_1)
model_elpd_1$estimates

car_model_mtcars_2 <- stan_glm(mpg ~ disp+wt, data = mtcars,
                       family = gaussian,
                       prior_intercept = normal(82.5, 20),
                       prior = normal(c(0.325, 0.025), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

model_elpd_2 <- loo(car_model_mtcars_2)
model_elpd_2$estimates

## Which of the 3 models is better based on ELPD?

## Also see BIC values to compare regression models...






####
####  The following is exactly all the same analysis as above, but on the LARGER 'autodata' data set.
####  I'm showing the posterior predictive analyses, model fit, and model selection things in class
####  using the smaller 'mtcars' data set simply because it doesn't take as much time to run.
####





### Using the functions in 'bayesrules' package to examine posterior predictive distribution:
### This is on the larger 'autodata' data set:

library(bayesrules)

# Simulate a set of predictions
#set.seed(84735)
predictions <- 
  posterior_predict(car_model, newdata = autodata)

dim(predictions)

ppc_intervals(autodata$mpg, yrep = predictions, x = autodata$weight, 
              prob = 0.5, prob_outer = 0.95)

# Do the observed y-values in the data set (dark blue circles) 
# mostly fall within the prediction intervals?

prediction_summary(car_model, data = autodata)

# median absolute error (MAE):
# measures the typical difference between the observed responses and their posterior predictive means

# scaled median absolute error:
# measures the typical number of std deviations that the observed responses fall from their posterior predictive mean

# within_50 statistic
# measures the proportion of observed response values that fall within their 50% posterior prediction interval.

# within_95 statistic
# measures the proportion of observed response values that fall within their 95% posterior prediction interval.


# Cross-validation to measure predictive accuracy:
# This splits the data set into 10 parts.
# Uses a model built with nine-tenths of the data to predict the response for the other tenth of the individuals.
# Does this for all 10 subsets being left out in turn, predicting each left-out tenth, averaging the results.

#set.seed(84735)
cv_procedure <- prediction_summary_cv(
  model = car_model, data = autodata, k = 10)

# Results for each subset left out:
cv_procedure$folds

# Results, averaging across the 10 subsets:
cv_procedure$cv

model_elpd <- loo(car_model)
model_elpd$estimates

# ELPD as a way to compare different possible models:

car_model_1 <- stan_glm(mpg ~ displacement+horsepower, data = autodata,
                       family = gaussian,
                       prior_intercept = normal(82.5, 20),
                       prior = normal(c(0.325, -1.563), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

model_elpd_1 <- loo(car_model_1)
model_elpd_1$estimates

car_model_2 <- stan_glm(mpg ~ displacement+weight, data = autodata,
                       family = gaussian,
                       prior_intercept = normal(82.5, 20),
                       prior = normal(c(0.325, 0.025), 40, autoscale=TRUE), 
                       prior_aux = exponential(1, autoscale=TRUE),
                       chains = 4, iter = 5000*2)

model_elpd_2 <- loo(car_model_2)
model_elpd_2$estimates

## Which of the 3 models is better based on ELPD?

## Also see BIC values to compare regression models...