############################### # Bayesian Linear Regression # ############################### ################################################### ### Example 1, Chapter 4 notes (Oxygen uptake data) ################################################### # data from Hoff (2009) y <- c(-0.87,-10.74,-3.27,-1.97,7.50,-7.25,17.05,4.96,10.40,11.05,0.26,2.51) # y = change in maximal oxygen uptake (positive number = improvement) x1 <- c(0,0,0,0,0,0,1,1,1,1,1,1) # 0 = running group, 1 = aerobics group x2 <- c(23,22,22,25,27,20,31,23,27,28,22,24) # ages of subjects x3 <- x1*x2 # cross-product term (interaction b/w group and age) X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) ) # Gill's function to produce posterior medians # and quantile based credible intervals for the # regression parameters (based on multivariate-t posterior) t.ci.table <- function(coefs, cov.mat, level=0.95, degrees=Inf, quantiles=c(0.025, 0.50, 0.975) ) { quantile.mat <- cbind(coefs, sqrt(diag(cov.mat)), t(qt(quantiles, degrees) %o% sqrt(diag(cov.mat))) + matrix(rep(coefs,length(quantiles)), ncol=length(quantiles)) ) quantile.names <- c("mean", "Std. Error") for (i in 1:length(quantiles)) { quantile.names <- c(quantile.names, paste(quantiles[i], "Quantile")) } dimnames(quantile.mat)[2] <- list(quantile.names) return(round(quantile.mat,4)) } ### Noninformative Prior Analysis: bhat <- solve(t(X) %*% X) %*% t(X) %*% y sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X)) my.cov.mat <- solve(t(X) %*% X) * ((nrow(X)-ncol(X))*sig2hat / (nrow(X) - ncol(X) - 2))[1,1] # Getting posterior information about the betas: table.beta.info <- t.ci.table(bhat, my.cov.mat, degrees = nrow(X) - ncol(X)) table.beta.info # Getting posterior information about sigma^2: library(TeachingDemos) # loading TeachingDemos package, to use hpd function library(pscl) # loading pscl package, to use inverse gamma distribution my.alpha <- (nrow(X) - ncol(X) - 1)/2 my.beta <- 0.5 * sig2hat * (nrow(X) - ncol(X)) # Point estimate (posterior median here) for sigma^2: sig.sq.poster.med <- qigamma(0.50, alpha=my.alpha, beta=my.beta) round(sig.sq.poster.med, 3) hpd.sig.sq <- hpd(qigamma, alpha=my.alpha, beta=my.beta) round(hpd.sig.sq, 3) ################################################### ### Example 2, Chapter 4 notes (Automobile data) ################################################### autodata <- read.table("http://www.stat.sc.edu/~hitchcock/autoregresslarge.txt", header=T) y <- autodata$mpg x1 <- autodata$displacement x2 <- autodata$horsepower x3 <- autodata$weight X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) ) ###### Setting up prior specification : # 3 predictor variables, so we need exactly 3+1=4 hypothetical "prior observations" # Suppose that based on "expert opinion" we have the following guesses: xtilde.obs.1 <- c(150, 100, 2000) ytilde.obs.1 <- 25 # Prior belief: a car with displacement=150, horsepower=100, weight=2000 should have mpg around 25 xtilde.obs.2 <- c(200, 160, 4500) ytilde.obs.2 <- 10 # Prior belief: a car with displacement=200, horsepower=160, weight=4500 should have mpg around 10 xtilde.obs.3 <- c(250, 140, 3000) ytilde.obs.3 <- 20 # Prior belief: a car with displacement=250, horsepower=140, weight=3000 should have mpg around 20 xtilde.obs.4 <- c(100, 80, 1800) ytilde.obs.4 <- 35 # Prior belief: a car with displacement=100, horsepower=80, weight=1800 should have mpg around 35 # Making the matrix X~ : prior.obs.stacked <- rbind(xtilde.obs.1, xtilde.obs.2, xtilde.obs.3, xtilde.obs.4) xtilde <- cbind(rep(1, times=nrow(prior.obs.stacked) ), prior.obs.stacked) # Making the vector Y~ : ytilde <- c(ytilde.obs.1, ytilde.obs.2, ytilde.obs.3, ytilde.obs.4) # Diagonal matrix D contains weights that indicate how much worth we place on # our hypothetical prior observations (note the weights could vary if we are more # confident about some hypothetical prior observations than about others): D <- diag(c(4, 4, 4, 4)) # This D yields a D^{-1} that is diag(c(1/4, 1/4, 1/4, 1/4)), which # assumes that our four hypothetical prior observations TOGETHER are # "worth" about one actual sample observation. # The prior mean on the beta vector that we are inducing is: prior.mean.beta <- as.vector(solve(xtilde) %*% ytilde) print("Prior mean for beta vector was:"); print(round(prior.mean.beta,3)) ### Choosing prior parameters a and b for gamma prior on tau: # The parameter "a" reflects our confidence in our prior: # Let's choose "a" to be 0.5, which implies that our # prior knowledge about tau is "worth" about 1 sample observation. a <- 0.5 # Consider prior observation 1: # Prior belief: a car with displacement=150, horsepower=100, weight=2000 should have mpg around 25 # Suppose the expert believes the largest realistic mpg for this type of car is 35 # Then set the prior guess for sigma to be (35 - 25)/1.645 = 6.08 # So the prior guess for sigma^2 is (6.08^2) = 36.97 # So the prior guess for tau is 1/36.97 = 0.027 # Since the gamma prior mean is a/b, let # b = a / tau.guess: tau.guess <- 0.027 b <- a / tau.guess # Here b is around 18.5 ... ##### Posterior information for tau and beta: # Calculate beta-hat : betahat <- as.vector(solve(t(X) %*% X + t(xtilde)%*%solve(D)%*%xtilde) %*% (t(X)%*%y + t(xtilde)%*%solve(D)%*%ytilde)) # Calculate s* : n <- length(y) sstar <- as.numeric( (t(y-X%*%betahat)%*%(y-X%*%betahat) + t(ytilde-xtilde%*%betahat)%*%solve(D)%*%(ytilde-xtilde%*%betahat) + 2*b)/(n+2*a) ) ### Point estimates for tau (and thus for sigma^2): p.mean.tau <- 1 / sstar p.mean.sig.sq <- 1 / p.mean.tau p.median.tau <- qgamma(0.50, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar) p.median.sig.sq <- 1 / p.median.tau print(paste("posterior.mean for sigma^2=", round(p.mean.sig.sq,3), "posterior.median for sigma^2=", round(p.median.sig.sq,3) )) ### Marginal Interval estimate for tau (and sigma^2): library(TeachingDemos) # loading TeachingDemos package, to use hpd function hpd.95.tau <- hpd(qgamma, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar ) hpd.95.sig.sq <- 1 / hpd.95.tau print("95% posterior interval for sigma^2:") round(sort(hpd.95.sig.sq), 3) ### NOT QUITE THE BEST APPROACH: ### Point estimates for beta, given the pt. estimate for tau: # Posterior mean, median and 95% intervals for the betas: table.beta.info <- t.ci.table(betahat, (1/p.median.tau)*solve(t(X)%*%X + t(xtilde)%*%solve(D)%*%xtilde) ) table.beta.info ### A BETTER APPROACH TO GET POINT ESTIMATES FOR BETA: # Randomly generate many tau values from its posterior distribution: how.many <-30000 tau.values <- rgamma(n=how.many, shape=(n+2*a)/2, rate=((n+2*a)/2)*sstar ) library(mvtnorm) beta.values<- matrix(0,nr = how.many, nc=length(betahat)) for (j in 1:how.many){ beta.values[j,] <- rmvnorm(n=1, mean=betahat, sigma= (1/tau.values[j])*solve(t(X)%*%X + t(xtilde)%*%solve(D)%*%xtilde) ) } # Posterior median for each regression coefficient: post.medians<-apply(beta.values,2,median) # 95% posterior interval for each regression coefficient: post.lower <- apply(beta.values,2,quantile, probs=0.025) post.upper <- apply(beta.values,2,quantile, probs=0.975) ## Summarizing: names.predictors <- c("x1", "x2", "x3") beta.post.summary <- data.frame(cbind(post.lower, post.medians, post.upper), row.names=c("intercept", names.predictors)) names(beta.post.summary) <- c('0.025 Quantile', '0.5 Quantile', '0.975 Quantile') print(beta.post.summary) # On a similar but smaller (and older) data set that is built into R: y <- mtcars$mpg x1 <- mtcars$disp x2 <- mtcars$hp x3 <- mtcars$wt # Redo analysis with both types of prior... ################################################### ### Bayesian Regression with rstanarm ################################################### library(rstanarm) library(bayesrules) library(tidyverse) library(bayesplot) library(broom) library(broom.mixed) library(tidybayes) ## Note the first part in the prior_intercept argument (22 here) ## is the prior expected response (mpg here) for a "typical" observation. car_model <- stan_glm(mpg ~ displacement+horsepower+weight, data = autodata, family = gaussian, prior_intercept = normal(22, 20), prior = normal(c(0.325, -1.563, 0.025), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) # MCMC diagnostics mcmc_trace(car_model, size = 0.1) mcmc_dens_overlay(car_model) mcmc_acf(car_model) neff_ratio(car_model) rhat(car_model) #Try using the 'tidy' function in the 'broom.mixed' package: library(broom.mixed) broom.mixed::tidy(car_model, effects = c("fixed", "aux"), conf.int = TRUE, conf.level = 0.90) # Alternatively, could use 'summary' function (change amount of rounding with the 'digits' argument): summary(car_model,digits=4,probs = c(0.025, 0.5, 0.975)) ## Compare to the classical regression approach: summary(lm(mpg ~ displacement+horsepower+weight,data=autodata)) # Posterior prediction: Xstar <- cbind(1, 150, 100, 2.7) # Simulate a set of predictions #set.seed(84735) shortcut_prediction <- posterior_predict(car_model, newdata = data.frame(displacement=150,horsepower=100,weight=2.7)) # Point prediction (median of predictive distribution) median(shortcut_prediction) # Construct a 95% posterior credible interval posterior_interval(shortcut_prediction, prob = 0.95) # Plot the approximate predictive model mcmc_dens(shortcut_prediction) + xlab("predicted MPG for a car with disp=150,HP=100,wt=2.7") ################################################### ### A Bayesian Approach to Model Selection ################################################### # Function (from Hoff 2009) to calculate a marginal probability for y: lpy.X <- function(y,X,g=length(y),nu0=1,s20=try(summary(lm(y~-1+X))$sigma^2,silent=TRUE)) { n<-dim(X)[1]; p<-dim(X)[2] if(p==0) {Hg<-0; s20<-mean(y^2)} if(p>0) {Hg<-(g/(g+1))*X%*%solve(t(X)%*%X)%*%t(X)} SSRg<- t(y)%*%( diag(1,nrow=n) - Hg)%*%y -.5*(n*log(pi)+p*log(1+g)+(nu0+n)*log(nu0*s20+SSRg)-nu0*log(nu0*s20))+lgamma((nu0+n)/2)-lgamma(nu0/2) } ## The oxygen uptake data: y <- c(-0.87,-10.74,-3.27,-1.97,7.50,-7.25,17.05,4.96,10.40,11.05,0.26,2.51) # y = change in maximal oxygen uptake (positive number = improvement) x1 <- c(0,0,0,0,0,0,1,1,1,1,1,1) # 0 = running group, 1 = aerobics group x2 <- c(23,22,22,25,27,20,31,23,27,28,22,24) # ages of subjects x3 <- x1*x2 # cross-product term (interaction b/w group and age) X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) ) ### Starting values for Gibbs Sampler: z<-rep(1,dim(X)[2]) # starting with z = all 1's (all terms in model) lpy.c<-lpy.X(y,X[,z==1,drop=FALSE]) S <- 10000 # number of Monte Carlo iterations Z<-matrix(NA,S,dim(X)[2]) ### The Gibbs Sampler: for(s in 1:S) { for(j in sample(1:dim(X)[2])) { zp<-z; zp[j] <- 1-zp[j] lpy.p<-lpy.X(y,X[,zp==1,drop=FALSE]) r<- (lpy.p - lpy.c)*(-1)^(zp[j]==0) z[j]<-rbinom(1,1,1/(1+exp(-r))) if(z[j]==zp[j]) {lpy.c<-lpy.p} } Z[s,]<-z } ######### # Considering all possible subsets: poss.z.vectors <- unique(Z,MARGIN=1) z.probs <- rep(0, times= nrow(poss.z.vectors)) for(i in 1:nrow(poss.z.vectors)) { z.probs[i] <- sum(apply(Z,1,identical, y=poss.z.vectors[i,])) } z.probs <- z.probs/sum(z.probs) cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),] # Considering only certain z vectors (interaction only appearing when both first-order terms appear): # poss.z.vectors <- matrix( c( 1,0,0,0, 1,1,0,0, 1,0,1,0, 1,1,1,0, 1,1,1,1 ), ncol=4, byrow=T) z.probs <- rep(0, times= nrow(poss.z.vectors)) for(i in 1:nrow(poss.z.vectors)) { z.probs[i] <- sum(apply(Z,1,identical, y=poss.z.vectors[i,])) } z.probs <- z.probs/sum(z.probs) cbind(poss.z.vectors, z.probs)[rev(order(z.probs)),] # ## The small automobile data set: y <- mtcars$mpg x1 <- mtcars$disp x2 <- mtcars$hp x3 <- mtcars$wt X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) ) # Rerun analysis, considering all possible subsets ... ################################################### ### Posterior Predictive Distribution with Regression ################################################### ##### Regression example: # On the small automobile data set that is built into R: y <- mtcars$mpg x1 <- mtcars$disp x2 <- mtcars$hp x3 <- mtcars$wt X <- as.matrix( cbind(rep(1, length(x1)), x1, x2, x3) ) library(mvtnorm) # to get "rmvt" function to sample from multivariate t distribution bhat <- solve(t(X) %*% X) %*% t(X) %*% y sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X)) # Letting X* just equal the observed X: Xstar <- X my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / ((nrow(X) - ncol(X) - 2)) )* (diag(rep(1,times=nrow(Xstar))) + Xstar %*% solve( (t(X) %*% X) ) %*% t(Xstar) ) # Sampling from multivariate t distribution: pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X))) # Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values: pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), ncol=ncol(pred.post.err), byrow=T) + pred.post.err matrix.of.ys <- matrix(y, nrow=nrow(pred.post.samp), ncol=ncol(pred.post.samp), byrow=T) # Plotting original Y-values against samples from posterior predictive distribution of response values: matplot(matrix.of.ys, pred.post.samp, pch='o', cex=0.3) abline(0,1) # Examining model lack-of-fit: cbind(mtcars[,1:2], (y - Xstar %*% bhat) )[order(y),] ### Predicting the mileage for another car having dispersion 150, horsepower 100, and weight 2.7: Xstar <- cbind(1, 150, 100, 2.7) my.Sigma <- as.numeric( sig2hat*(nrow(X) - ncol(X)) / ((nrow(X) - ncol(X) - 2)) )* (diag(rep(1,times=nrow(Xstar))) + Xstar %*% solve( (t(X) %*% X) ) %*% t(Xstar) ) # Sampling from multivariate t distribution: pred.post.err <- rmvt(n=500, sigma=my.Sigma, df=(nrow(X) - ncol(X))) # Adding multivariate t samples to fitted Y-values to get posterior predictive distribution of response values: pred.post.samp <- matrix(Xstar %*% bhat, nrow=nrow(pred.post.err), ncol=ncol(pred.post.err), byrow=T) + pred.post.err # Getting 95% prediction interval, and point prediction (via median of posterior predictive distribution): quantile(pred.post.samp, probs=c(0.025, 0.5, 0.975)) ### Using the functions in 'bayesrules' package to examine posterior predictive distribution: ### This is on the smaller 'mtcars' data set: library(bayesrules) data(mtcars) car_model_mtcars <- stan_glm(mpg ~ disp+hp+wt, data = mtcars, family = gaussian, prior_intercept = normal(22, 20), prior = normal(c(0.325, -1.563, 0.025), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) # Simulate a set of predictions #set.seed(84735) predictions <- posterior_predict(car_model_mtcars, newdata = mtcars) dim(predictions) ppc_intervals(mtcars$mpg, yrep = predictions, x = mtcars$weight, prob = 0.5, prob_outer = 0.95) # Do the observed y-values in the data set (dark blue circles) # mostly fall within the prediction intervals? prediction_summary(car_model_mtcars, data = mtcars) # median absolute error (MAE): # measures the typical difference between the observed responses and their posterior predictive means # scaled median absolute error: # measures the typical number of std deviations that the observed responses fall from their posterior predictive mean # within_50 statistic # measures the proportion of observed response values that fall within their 50% posterior prediction interval. # within_95 statistic # measures the proportion of observed response values that fall within their 95% posterior prediction interval. # Cross-validation to measure predictive accuracy: # This splits the data set into 10 parts. # Uses a model built with nine-tenths of the data to predict the response for the other tenth of the individuals. # Does this for all 10 subsets being left out in turn, predicting each left-out tenth, averaging the results. #set.seed(84735) cv_procedure <- prediction_summary_cv( model = car_model_mtcars, data = mtcars, k = 10) # Results for each subset left out: cv_procedure$folds # Results, averaging across the 10 subsets: cv_procedure$cv model_elpd <- loo(car_model_mtcars) model_elpd$estimates # ELPD as a way to compare different possible models: car_model_mtcars_1 <- stan_glm(mpg ~ disp+hp, data = mtcars, family = gaussian, prior_intercept = normal(82.5, 20), prior = normal(c(0.325, -1.563), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) model_elpd_1 <- loo(car_model_mtcars_1) model_elpd_1$estimates car_model_mtcars_2 <- stan_glm(mpg ~ disp+wt, data = mtcars, family = gaussian, prior_intercept = normal(82.5, 20), prior = normal(c(0.325, 0.025), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) model_elpd_2 <- loo(car_model_mtcars_2) model_elpd_2$estimates ## Which of the 3 models is better based on ELPD? ## Also see BIC values to compare regression models... #### #### The following is exactly all the same analysis as above, but on the LARGER 'autodata' data set. #### I'm showing the posterior predictive analyses, model fit, and model selection things in class #### using the smaller 'mtcars' data set simply because it doesn't take as much time to run. #### ### Using the functions in 'bayesrules' package to examine posterior predictive distribution: ### This is on the larger 'autodata' data set: library(bayesrules) # Simulate a set of predictions #set.seed(84735) predictions <- posterior_predict(car_model, newdata = autodata) dim(predictions) ppc_intervals(autodata$mpg, yrep = predictions, x = autodata$weight, prob = 0.5, prob_outer = 0.95) # Do the observed y-values in the data set (dark blue circles) # mostly fall within the prediction intervals? prediction_summary(car_model, data = autodata) # median absolute error (MAE): # measures the typical difference between the observed responses and their posterior predictive means # scaled median absolute error: # measures the typical number of std deviations that the observed responses fall from their posterior predictive mean # within_50 statistic # measures the proportion of observed response values that fall within their 50% posterior prediction interval. # within_95 statistic # measures the proportion of observed response values that fall within their 95% posterior prediction interval. # Cross-validation to measure predictive accuracy: # This splits the data set into 10 parts. # Uses a model built with nine-tenths of the data to predict the response for the other tenth of the individuals. # Does this for all 10 subsets being left out in turn, predicting each left-out tenth, averaging the results. #set.seed(84735) cv_procedure <- prediction_summary_cv( model = car_model, data = autodata, k = 10) # Results for each subset left out: cv_procedure$folds # Results, averaging across the 10 subsets: cv_procedure$cv model_elpd <- loo(car_model) model_elpd$estimates # ELPD as a way to compare different possible models: car_model_1 <- stan_glm(mpg ~ displacement+horsepower, data = autodata, family = gaussian, prior_intercept = normal(82.5, 20), prior = normal(c(0.325, -1.563), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) model_elpd_1 <- loo(car_model_1) model_elpd_1$estimates car_model_2 <- stan_glm(mpg ~ displacement+weight, data = autodata, family = gaussian, prior_intercept = normal(82.5, 20), prior = normal(c(0.325, 0.025), 40, autoscale=TRUE), prior_aux = exponential(1, autoscale=TRUE), chains = 4, iter = 5000*2) model_elpd_2 <- loo(car_model_2) model_elpd_2$estimates ## Which of the 3 models is better based on ELPD? ## Also see BIC values to compare regression models...