# Problem 3 (c)

x1 <- c(28.5, 30, 31.5, 32, 33.5, 35.9, 39, 40.5, 42.5, 45, 54.6, 62.3, 70, 80)
y <- c(94, 93.5, 99.5, 105, 110, 116, 125, 130.6, 129.9, 140, 170, 171, 185, 177)

X <- as.matrix( cbind(rep(1, length(x1)), x1) )

###### Setting up prior specification:

# 1 predictor variable, so we need exactly 1+1=2 hypothetical "prior observations".
# Suppose that based on "expert opinion" we have the following guesses:

xtilde.obs.1 <- c(40)
ytilde.obs.1 <- 115

xtilde.obs.2 <- c(60)
ytilde.obs.2 <- 150

# Making the matrix X~ :
prior.obs.stacked <- rbind(xtilde.obs.1, xtilde.obs.2)
xtilde <- cbind(rep(1, times=nrow(prior.obs.stacked)), prior.obs.stacked)

# Making the vector Y~ :
ytilde <- c(ytilde.obs.1, ytilde.obs.2)

# Diagonal matrix D contains weights that indicate how much worth we place on
# our hypothetical prior observations (note the weights could vary if we are more
# confident about some hypothetical prior observations than about others):

D <- diag(c(2, 2))
# This D yields a D^{-1} that is diag(c(1/2, 1/2)), which
# assumes that our two hypothetical prior observations TOGETHER are
# "worth" about one actual sample observation.

# The prior mean on the beta vector that we are inducing is:
prior.mean.beta <- as.vector(solve(xtilde) %*% ytilde)
print("Prior mean for beta vector was:"); print(round(prior.mean.beta, 3))

### Choosing prior parameters a and b for gamma prior on tau:

# The parameter "a" reflects our confidence in our prior:
# Let's choose "a" to be 0.1, which implies that our
# prior knowledge about tau is "worth" about 0.2 sample observations.

a <- 0.1

# Set the prior guess for sigma to be (195 - 115)/1.645 = 48.63222,
# so the prior guess for sigma^2 is (48.63222^2) = 2365.093,
# and the prior guess for tau is 1/2365.093 = 0.0004228164.
# Since the gamma prior mean is a/b, let
# b = a / tau.guess:

tau.guess <- 0.0004228164
b <- a / tau.guess   # Here b is around 236.5
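# (Optional sanity check, not part of the original solution: the induced prior
# mean for beta should correspond to the line through the two hypothetical
# points (40, 115) and (60, 150), i.e. slope (150-115)/(60-40) = 1.75 and
# intercept 115 - 1.75*40 = 45, and the gamma prior mean a/b should
# reproduce tau.guess.)
slope.check <- (ytilde.obs.2 - ytilde.obs.1) / (xtilde.obs.2 - xtilde.obs.1)
intercept.check <- ytilde.obs.1 - slope.check * xtilde.obs.1
print(c(intercept.check, slope.check))   # should match prior.mean.beta
print(a / b)                             # should match tau.guess = 0.0004228164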
##### Posterior information for tau and beta:

# Calculate beta-hat:
betahat <- as.vector(solve(t(X) %*% X + t(xtilde) %*% solve(D) %*% xtilde) %*%
                     (t(X) %*% y + t(xtilde) %*% solve(D) %*% ytilde))

# Calculate s*:
n <- length(y)
sstar <- as.numeric( (t(y - X %*% betahat) %*% (y - X %*% betahat) +
                      t(ytilde - xtilde %*% betahat) %*% solve(D) %*% (ytilde - xtilde %*% betahat) +
                      2*b) / (n + 2*a) )

### Point estimates for tau (and thus for sigma^2):

p.mean.tau <- 1 / sstar
p.mean.sig.sq <- 1 / p.mean.tau

p.median.tau <- qgamma(0.50, shape=(n + 2*a)/2, rate=((n + 2*a)/2)*sstar)
p.median.sig.sq <- 1 / p.median.tau

print(paste("posterior.mean for sigma^2=", round(p.mean.sig.sq, 3),
            "posterior.median for sigma^2=", round(p.median.sig.sq, 3)))

# 3(d)

### Point estimates for beta, given the conditional draws for tau:
### "Honest" approach:

# Randomly generate many tau values from its posterior distribution:
how.many <- 30000
tau.values <- rgamma(n=how.many, shape=(n + 2*a)/2, rate=((n + 2*a)/2)*sstar)

library(mvtnorm)

beta.values <- matrix(0, nr=how.many, nc=length(betahat))
for (j in 1:how.many){
  beta.values[j,] <- rmvnorm(n=1, mean=betahat,
                             sigma=(1/tau.values[j]) * solve(t(X) %*% X + t(xtilde) %*% solve(D) %*% xtilde))
}

# Posterior median for each regression coefficient:
post.medians <- apply(beta.values, 2, median)

# 95% posterior interval for each regression coefficient:
post.lower <- apply(beta.values, 2, quantile, probs=0.025)
post.upper <- apply(beta.values, 2, quantile, probs=0.975)

## Summarizing:
names.predictors <- c("x1")
beta.post.summary <- data.frame(cbind(post.lower, post.medians, post.upper),
                                row.names=c("intercept", names.predictors))
names(beta.post.summary) <- c('0.025 Quantile', '0.5 Quantile', '0.975 Quantile')
print(beta.post.summary)
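# (Optional cross-check, not part of the original solution: with the fairly
# weak prior above, the posterior medians should land close to the ordinary
# least-squares estimates from lm().)
print(coef(lm(y ~ x1)))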
########## Problem 4 ##############

#### Gill's function:

t.ci.table <- function(coefs, cov.mat, level=0.95, degrees=Inf,
                       quantiles=c(0.025, 0.50, 0.975)) {
  quantile.mat <- cbind(coefs, sqrt(diag(cov.mat)),
                        t(qt(quantiles, degrees) %o% sqrt(diag(cov.mat))) +
                          matrix(rep(coefs, length(quantiles)), ncol=length(quantiles)))
  quantile.names <- c("mean", "Std. Error")
  for (i in 1:length(quantiles)) {
    quantile.names <- c(quantile.names, paste(quantiles[i], "Quantile"))
  }
  dimnames(quantile.mat)[2] <- list(quantile.names)
  return(round(quantile.mat, 4))
}

ener.data <- read.table("http://people.stat.sc.edu/hitchcock/energybardata.txt",
                        header=F, col.names=c("price", "calories", "protein", "fat"))
attach(ener.data)

y <- price; x1 <- calories; x2 <- protein; x3 <- fat

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) )

### Noninformative Prior Analysis:

bhat <- solve(t(X) %*% X) %*% t(X) %*% y
sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X))

my.cov.mat <- solve(t(X) %*% X) * ((nrow(X) - ncol(X))*sig2hat / (nrow(X) - ncol(X) - 2))[1,1]

# Getting posterior information about the betas:
table.beta.info <- t.ci.table(bhat, my.cov.mat, degrees = nrow(X) - ncol(X))
table.beta.info

my.alpha <- (nrow(X) - ncol(X) - 1)/2
my.beta <- 0.5 * sig2hat * (nrow(X) - ncol(X))

library(TeachingDemos)
library(pscl)

hpd.sig.sq <- hpd(qigamma, alpha=my.alpha, beta=my.beta)
round(hpd.sig.sq, 3)

## Problem 4(c)

# Function to calculate a marginal probability for y:
lpy.X <- function(y, X, g=length(y), nu0=1,
                  s20=try(summary(lm(y ~ -1 + X))$sigma^2, silent=TRUE)) {
  n <- dim(X)[1]; p <- dim(X)[2]
  if(p==0) {Hg <- 0; s20 <- mean(y^2)}
  if(p>0)  {Hg <- (g/(g+1)) * X %*% solve(t(X) %*% X) %*% t(X)}
  SSRg <- t(y) %*% (diag(1, nrow=n) - Hg) %*% y
  -.5*(n*log(pi) + p*log(1+g) + (nu0+n)*log(nu0*s20 + SSRg) - nu0*log(nu0*s20)) +
    lgamma((nu0+n)/2) - lgamma(nu0/2)
}

### Starting values for Gibbs Sampler:

z <- rep(1, dim(X)[2])   # starting with z = all 1's (all terms in model)
lpy.c <- lpy.X(y, X[, z==1, drop=FALSE])
S <- 10000   # number of Monte Carlo iterations
Z <- matrix(NA, S, dim(X)[2])

### The Gibbs Sampler:

for(s in 1:S) {
  for(j in sample(1:dim(X)[2])) {
    zp <- z; zp[j] <- 1 - zp[j]
    lpy.p <- lpy.X(y, X[, zp==1, drop=FALSE])
    r <- (lpy.p - lpy.c) * (-1)^(zp[j]==0)
    z[j] <- rbinom(1, 1, 1/(1+exp(-r)))
    if(z[j]==zp[j]) {lpy.c <- lpy.p}
  }
  Z[s,] <- z
}

#########
# Considering all possible subsets:

poss.z.vectors <- unique(Z, MARGIN=1)
z.probs <- rep(0, times=nrow(poss.z.vectors))
for(i in 1:nrow(poss.z.vectors)) {
  z.probs[i] <- sum(apply(Z, 1, identical, y=poss.z.vectors[i,]))
}
z.probs <- z.probs/sum(z.probs)
cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),]

# 4(d)

x4 <- x1*x2; x5 <- x1*x3; x6 <- x2*x3
X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3, x4, x5, x6) )

### Starting values for Gibbs Sampler:

z <- rep(1, dim(X)[2])   # starting with z = all 1's (all terms in model)
lpy.c <- lpy.X(y, X[, z==1, drop=FALSE])
S <- 10000   # number of Monte Carlo iterations
Z <- matrix(NA, S, dim(X)[2])

### The Gibbs Sampler:

for(s in 1:S) {
  for(j in sample(1:dim(X)[2])) {
    zp <- z; zp[j] <- 1 - zp[j]
    lpy.p <- lpy.X(y, X[, zp==1, drop=FALSE])
    r <- (lpy.p - lpy.c) * (-1)^(zp[j]==0)
    z[j] <- rbinom(1, 1, 1/(1+exp(-r)))
    if(z[j]==zp[j]) {lpy.c <- lpy.p}
  }
  Z[s,] <- z
}

#########
# Considering only certain z vectors (interaction only appearing when both first-order terms appear):

poss.z.vectors <- matrix( c(
  1,0,0,0,0,0,0,
  1,1,0,0,0,0,0,
  1,0,1,0,0,0,0,
  1,0,0,1,0,0,0,
  1,1,1,0,0,0,0,
  1,1,0,1,0,0,0,
  1,0,1,1,0,0,0,
  1,1,1,0,1,0,0,
  1,1,0,1,0,1,0,
  1,0,1,1,0,0,1,
  1,1,1,1,0,0,0,
  1,1,1,1,1,0,0,
  1,1,1,1,0,1,0,
  1,1,1,1,0,0,1,
  1,1,1,1,1,1,0,
  1,1,1,1,0,1,1,
  1,1,1,1,1,0,1,
  1,1,1,1,1,1,1 ), ncol=7, byrow=T)

z.probs <- rep(0, times=nrow(poss.z.vectors))
for(i in 1:nrow(poss.z.vectors)) {
  z.probs[i] <- sum(apply(Z, 1, identical, y=poss.z.vectors[i,]))
}
z.probs <- z.probs/sum(z.probs)
cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),]
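# (Optional summary, not in the original solution: the column means of Z give
# the estimated marginal posterior inclusion probability for each term, a quick
# way to see which predictors and interactions the sampler favors. Here
# x1 = calories, x2 = protein, x3 = fat.)
inclusion.probs <- apply(Z, 2, mean)
names(inclusion.probs) <- c("intercept", "x1", "x2", "x3", "x1*x2", "x1*x3", "x2*x3")
print(round(inclusion.probs, 3))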
##### Problem 5, noninformative prior analysis:

# 5(a)

cer.data <- read.table("http://people.stat.sc.edu/hitchcock/cerealdatabayes.txt", header=T)
# This creates a data frame cer.data with columns named
# Sugar, Sodium, Fiber, Carbohydrates, Potassium

# Alternatively you could create generically named response and predictor variables:
y <- cer.data$Sugar
x1 <- cer.data$Sodium
x2 <- cer.data$Fiber
x3 <- cer.data$Carbohydrates
x4 <- cer.data$Potassium

cer.data.generic <- data.frame(y, x1, x2, x3, x4)  # creating a data frame with these variables

X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3, x4) )

library(mvtnorm)  # to get "rmvt" function to sample from multivariate t distribution

bhat <- solve(t(X) %*% X) %*% t(X) %*% y
sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X))

# Letting X* just equal the observed X:
Xstar <- X

my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / (nrow(X) - ncol(X) - 2) ) *
  (diag(rep(1, times=nrow(Xstar))) + Xstar %*% solve(t(X) %*% X) %*% t(Xstar))

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values:
pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), ncol=ncol(pred.post.err), byrow=T) + pred.post.err

matrix.of.ys <- matrix(y, nrow=nrow(pred.post.samp), ncol=ncol(pred.post.samp), byrow=T)

# Plotting original Y-values against samples from posterior predictive distribution of response values:
matplot(matrix.of.ys, pred.post.samp, pch='o', cex=0.3)
abline(0,1)

# 5(b)

# Examining model lack-of-fit:
cbind(cer.data[,c(1,5)], (y - Xstar %*% bhat))[order(y),]

# 5(c)

### Predictions for a cereal having sodium=140, fiber=3.5, carbohydrates=14, and potassium=90.

Xstar <- cbind(1, 140, 3.5, 14, 90)

#my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / ((nrow(X) - ncol(X) - 2)) ) *
#  solve(diag(rep(1, times=nrow(X))) - Xstar %*% solve( (t(X) %*% X) + (t(Xstar) %*% Xstar) ) %*% t(Xstar))

my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / (nrow(X) - ncol(X) - 2) ) *
  (diag(rep(1, times=nrow(Xstar))) + Xstar %*% solve(t(X) %*% X) %*% t(Xstar))

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values:
pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), ncol=ncol(pred.post.err), byrow=T) + pred.post.err

# Getting 90% prediction interval, and point prediction (via median of posterior predictive distribution):
quantile(pred.post.samp, probs=c(0.05, 0.5, 0.95))
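# (Optional cross-check, not part of the original solution: under a
# noninformative prior, the 90% posterior predictive interval above should be
# similar to the classical 90% frequentist prediction interval from lm().
# 'class.fit' is just a name used here for the least-squares fit.)
class.fit <- lm(y ~ x1 + x2 + x3 + x4, data = cer.data.generic)
predict(class.fit, newdata = data.frame(x1=140, x2=3.5, x3=14, x4=90),
        interval = "prediction", level = 0.90)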
##### Problem 5, subjective prior analysis using 'rstanarm' package:

library(rstanarm)
library(bayesrules)
library(tidyverse)
library(bayesplot)
library(broom)
library(broom.mixed)
library(tidybayes)

## Note the first part in the prior_intercept argument (15 here)
## is the prior expected response (Sugar here) for a "typical" observation.

cer_model <- stan_glm(Sugar ~ Sodium + Fiber + Carbohydrates + Potassium,
                      data = cer.data, family = gaussian,
                      prior_intercept = normal(15, 20),
                      prior = normal(c(-1, -1, 1, 1), 40, autoscale=TRUE),
                      prior_aux = exponential(1, autoscale=TRUE),
                      chains = 4, iter = 5000*2)

# MCMC diagnostics
mcmc_trace(cer_model, size = 0.1)
mcmc_acf(cer_model)

# Try using the 'tidy' function in the 'broom.mixed' package:
library(broom.mixed)
broom.mixed::tidy(cer_model, effects = c("fixed", "aux"),
                  conf.int = TRUE, conf.level = 0.90)

# Alternatively, could use 'summary' function (change amount of rounding with the 'digits' argument):
#summary(cer_model, digits=4, probs = c(0.05, 0.5, 0.95))

# Posterior prediction:

# Simulate a set of predictions
#set.seed(84735)
shortcut_prediction <- posterior_predict(cer_model,
                                         newdata = data.frame(Sodium=140, Fiber=3.5, Carbohydrates=14, Potassium=90))

# Point prediction (median of predictive distribution)
median(shortcut_prediction)

# Construct a 90% posterior credible interval
posterior_interval(shortcut_prediction, prob = 0.90)

# Plot the approximate predictive model
mcmc_dens(shortcut_prediction) +
  xlab("predicted Sugar for a cereal with sodium=140, fiber=3.5, carb=14, potassium=90")

# Checking overall model fit:
predictions <- posterior_predict(cer_model, newdata = cer.data)

ppc_intervals(cer.data$Sugar, yrep = predictions, x = cer.data$Sugar,
              prob = 0.5, prob_outer = 0.95)
# Do the observed y-values in the data set (dark blue circles)
# mostly fall within the prediction intervals?

prediction_summary(cer_model, data = cer.data)
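# (Optional additional fit check, not in the original solution: 'pp_check' from
# 'rstanarm' overlays densities of replicated data sets on the observed Sugar
# distribution; systematic mismatch would suggest lack of fit.)
pp_check(cer_model)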
## #6 (model selection in the cereal data)

# Using the ELPD approach, just first-order terms:
# Just running 1 chain of each of these to save time:

cer_model_1234 <- stan_glm(Sugar ~ Sodium+Fiber+Carbohydrates+Potassium, data = cer.data, family = gaussian,
                           prior_intercept = normal(15, 20),
                           prior = normal(c(-1, -1, 1, 1), 40, autoscale=TRUE),
                           prior_aux = exponential(1, autoscale=TRUE),
                           chains = 1, iter = 5000*2)

cer_model_1 <- stan_glm(Sugar ~ Sodium, data = cer.data, family = gaussian,
                        prior_intercept = normal(15, 20),
                        prior = normal(c(-1), 40, autoscale=TRUE),
                        prior_aux = exponential(1, autoscale=TRUE),
                        chains = 1, iter = 5000*2)

cer_model_2 <- stan_glm(Sugar ~ Fiber, data = cer.data, family = gaussian,
                        prior_intercept = normal(15, 20),
                        prior = normal(c(-1), 40, autoscale=TRUE),
                        prior_aux = exponential(1, autoscale=TRUE),
                        chains = 1, iter = 5000*2)

cer_model_3 <- stan_glm(Sugar ~ Carbohydrates, data = cer.data, family = gaussian,
                        prior_intercept = normal(15, 20),
                        prior = normal(c(1), 40, autoscale=TRUE),
                        prior_aux = exponential(1, autoscale=TRUE),
                        chains = 1, iter = 5000*2)

cer_model_4 <- stan_glm(Sugar ~ Potassium, data = cer.data, family = gaussian,
                        prior_intercept = normal(15, 20),
                        prior = normal(c(1), 40, autoscale=TRUE),
                        prior_aux = exponential(1, autoscale=TRUE),
                        chains = 1, iter = 5000*2)

cer_model_12 <- stan_glm(Sugar ~ Sodium+Fiber, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(-1, -1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_13 <- stan_glm(Sugar ~ Sodium+Carbohydrates, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(-1, 1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_14 <- stan_glm(Sugar ~ Sodium+Potassium, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(-1, 1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_23 <- stan_glm(Sugar ~ Fiber+Carbohydrates, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(-1, 1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_24 <- stan_glm(Sugar ~ Fiber+Potassium, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(-1, 1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_34 <- stan_glm(Sugar ~ Carbohydrates+Potassium, data = cer.data, family = gaussian,
                         prior_intercept = normal(15, 20),
                         prior = normal(c(1, 1), 40, autoscale=TRUE),
                         prior_aux = exponential(1, autoscale=TRUE),
                         chains = 1, iter = 5000*2)

cer_model_123 <- stan_glm(Sugar ~ Sodium+Fiber+Carbohydrates, data = cer.data, family = gaussian,
                          prior_intercept = normal(15, 20),
                          prior = normal(c(-1, -1, 1), 40, autoscale=TRUE),
                          prior_aux = exponential(1, autoscale=TRUE),
                          chains = 1, iter = 5000*2)

cer_model_124 <- stan_glm(Sugar ~ Sodium+Fiber+Potassium, data = cer.data, family = gaussian,
                          prior_intercept = normal(15, 20),
                          prior = normal(c(-1, -1, 1), 40, autoscale=TRUE),
                          prior_aux = exponential(1, autoscale=TRUE),
                          chains = 1, iter = 5000*2)

cer_model_134 <- stan_glm(Sugar ~ Sodium+Carbohydrates+Potassium, data = cer.data, family = gaussian,
                          prior_intercept = normal(15, 20),
                          prior = normal(c(-1, 1, 1), 40, autoscale=TRUE),
                          prior_aux = exponential(1, autoscale=TRUE),
                          chains = 1, iter = 5000*2)

cer_model_234 <- stan_glm(Sugar ~ Fiber+Carbohydrates+Potassium, data = cer.data, family = gaussian,
                          prior_intercept = normal(15, 20),
                          prior = normal(c(-1, 1, 1), 40, autoscale=TRUE),
                          prior_aux = exponential(1, autoscale=TRUE),
                          chains = 1, iter = 5000*2)

loo(cer_model_1)$estimates
loo(cer_model_2)$estimates
loo(cer_model_3)$estimates
loo(cer_model_4)$estimates
loo(cer_model_12)$estimates
loo(cer_model_13)$estimates
loo(cer_model_14)$estimates
loo(cer_model_23)$estimates
loo(cer_model_24)$estimates
loo(cer_model_34)$estimates
loo(cer_model_123)$estimates
loo(cer_model_124)$estimates
loo(cer_model_134)$estimates
loo(cer_model_234)$estimates
loo(cer_model_1234)$estimates
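# (Optional summary, not in the original solution: 'loo_compare' from the 'loo'
# package, loaded with rstanarm, ranks the fitted models by estimated ELPD,
# which is easier to scan than the fifteen separate tables above.)
loo_compare(loo(cer_model_1), loo(cer_model_2), loo(cer_model_3), loo(cer_model_4),
            loo(cer_model_12), loo(cer_model_13), loo(cer_model_14),
            loo(cer_model_23), loo(cer_model_24), loo(cer_model_34),
            loo(cer_model_123), loo(cer_model_124), loo(cer_model_134),
            loo(cer_model_234), loo(cer_model_1234))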