############################
# Assessing Model Quality  #
############################

### Example 1b, Chapter 6b notes:

lead <- c(48, 53, 44, 55, 52, 39, 62, 38, 23, 27, 41, 37, 41, 46, 32, 17,
          32, 41, 23, 12, 3, 13, 10, 11, 5, 30, 11, 9, 7, 11, 77, 210, 38,
          112, 52, 10, 6)
x <- lead
xbar <- mean(x)
n <- length(x)
sum.x <- sum(x)

my.seq <- seq(25, 40, length = 200)

sigma <- 40   # assumed known
delta <- 30   # prior mean for mu
tau <- 4      # prior sd for mu

# Conjugate normal model with known sigma:
#   mu ~ N(delta0, tau2)  =>  mu | x ~ N(norm.post.mean, norm.post.sd^2)
# (precision-weighted average of prior mean and data).
# Both helpers read the CURRENT values of sum.x, n, and sigma from the
# global environment at call time, so they track the reassignments below.
norm.post.mean <- function(delta0, tau2) {
  ((delta0 / tau2) + (sum.x / sigma^2)) / ((1 / tau2) + (n / sigma^2))
}
norm.post.sd <- function(tau2) {
  sqrt((tau2 * sigma^2) / (sigma^2 + n * tau2))
}

# Different posteriors (varying the prior mean and the prior variance):
post.1 <- dnorm(my.seq, mean = norm.post.mean(delta, tau^2),
                sd = norm.post.sd(tau^2))
post.2 <- dnorm(my.seq, mean = norm.post.mean(delta - tau, tau^2),
                sd = norm.post.sd(tau^2))
post.3 <- dnorm(my.seq, mean = norm.post.mean(delta + tau, tau^2),
                sd = norm.post.sd(tau^2))
post.4 <- dnorm(my.seq, mean = norm.post.mean(delta, 0.5 * tau^2),
                sd = norm.post.sd(0.5 * tau^2))
post.5 <- dnorm(my.seq, mean = norm.post.mean(delta, 2 * tau^2),
                sd = norm.post.sd(2 * tau^2))

## All these posteriors plotted on one plot.
## ylim spans all five curves so that none is clipped (post.4 has a
## higher peak than post.1, which previously set the y-axis range):
plot(my.seq, post.1, type = 'l', col = 'red',
     ylim = range(post.1, post.2, post.3, post.4, post.5))
lines(my.seq, post.2, col = 'orange')
lines(my.seq, post.3, col = 'purple')
lines(my.seq, post.4, col = 'blue')
lines(my.seq, post.5, col = 'green')

## Posterior quantiles for all these posteriors (one row per prior):
probs <- c(.025, .25, .5, .75, .975)
rbind(
  qnorm(probs, mean = norm.post.mean(delta, tau^2),
        sd = norm.post.sd(tau^2)),
  qnorm(probs, mean = norm.post.mean(delta - tau, tau^2),
        sd = norm.post.sd(tau^2)),
  qnorm(probs, mean = norm.post.mean(delta + tau, tau^2),
        sd = norm.post.sd(tau^2)),
  qnorm(probs, mean = norm.post.mean(delta, 0.5 * tau^2),
        sd = norm.post.sd(0.5 * tau^2)),
  qnorm(probs, mean = norm.post.mean(delta, 2 * tau^2),
        sd = norm.post.sd(2 * tau^2))
)

#################################################################################

### Example 2, Chapter 6b notes:

# Prussian cavalry data:
x <- c(rep(0, times = 109), rep(1, times = 65), rep(2, times = 22),
       rep(3, times = 3), rep(4, times = 1))

# With less data:
# x <- x[10*(1:20)]   # picking out every 10th value

sum.x <- sum(x)
n <- length(x)

my.seq <- seq(.01, 1.5, length = 200)

# Poisson likelihood with a gamma(a, b) prior gives a
# gamma(a + sum(x), b + n) posterior for the rate.

# Posterior based on a gamma(2,4) prior:
post.2and4 <- dgamma(my.seq, shape = sum.x + 2, rate = n + 4)
# Posterior based on a gamma(4,8) prior:
post.4and8 <- dgamma(my.seq, shape = sum.x + 4, rate = n + 8)
# Posterior based on a gamma(1,2) prior:
post.1and2 <- dgamma(my.seq, shape = sum.x + 1, rate = n + 2)
# Posterior based on a gamma(0.1*2, sqrt(0.1)*4) prior:
post.0.1.2andsqrt0.1.4 <- dgamma(my.seq, shape = sum.x + (0.1 * 2),
                                 rate = n + (sqrt(0.1) * 4))
# Posterior based on a gamma(3*2, sqrt(3)*4) prior:
post.6andsqrt3.4 <- dgamma(my.seq, shape = sum.x + (3 * 2),
                           rate = n + (sqrt(3) * 4))

## All these posteriors plotted on one plot (ylim again spans every
## curve so no posterior is clipped):
plot(my.seq, post.2and4, type = 'l', col = 'red',
     ylim = range(post.2and4, post.4and8, post.1and2,
                  post.0.1.2andsqrt0.1.4, post.6andsqrt3.4))
lines(my.seq, post.4and8, col = 'orange')
lines(my.seq, post.1and2, col = 'purple')
lines(my.seq, post.0.1.2andsqrt0.1.4, col = 'blue')
lines(my.seq, post.6andsqrt3.4, col = 'green')

## Posterior quantiles for all these posteriors (one row per prior):
rbind(
  qgamma(probs, shape = sum.x + 2, rate = n + 4),
  qgamma(probs, shape = sum.x + 4, rate = n + 8),
  qgamma(probs, shape = sum.x + 1, rate = n + 2),
  qgamma(probs, shape = sum.x + (0.1 * 2), rate = n + (sqrt(0.1) * 4)),
  qgamma(probs, shape = sum.x + (3 * 2), rate = n + (sqrt(3) * 4))
)

#####################################################
#
# Posterior Predictive Distributions
#
#####################################################

# Prussian cavalry data:
x <- c(rep(0, times = 109), rep(1, times = 65), rep(2, times = 22),
       rep(3, times = 3), rep(4, times = 1))
sum.x <- sum(x)
n <- length(x)

my.alpha <- 2
my.beta <- 4

my.seq <- seq(0, 4, by = 1)

# Posterior predictive distribution is negative binomial:
nb.pred.post <- dnbinom(my.seq, size = sum.x + my.alpha,
                        prob = (n + my.beta) / (n + my.beta + 1))

# Comparing posterior predictive data density (red) to original data
# density estimate (black); red bars offset by 0.04 so both are visible:
plot(table(x) / n, type = 'h', lwd = 4, ylab = 'probability')
lines(my.seq + 0.04, nb.pred.post, col = 'red', type = 'h', lwd = 4)

### Chapter 6b notes:

lead <- c(48, 53, 44, 55, 52, 39, 62, 38, 23, 27, 41, 37, 41, 46, 32, 17,
          32, 41, 23, 12, 3, 13, 10, 11, 5, 30, 11, 9, 7, 11, 77, 210, 38,
          112, 52, 10, 6)
x <- lead
xbar <- mean(x)
n <- length(x)
sum.x <- sum(x)

my.seq <- seq(25, 40, length = 200)

sigma <- 40   # assumed known
delta <- 30
tau <- 4

# Sampling 5000 values of mu from the posterior distribution:
sample.post <- rnorm(n = 5000, mean = norm.post.mean(delta, tau^2),
                     sd = norm.post.sd(tau^2))

# Sampling 5000 values of x_new from N(mu[j], 40^2):
sample.post.pred <- rnorm(n = 5000, mean = sample.post, sd = 40)

# Comparing posterior predictive data density (red) to original data
# density estimate (black):
plot(density(x))
lines(density(sample.post.pred), col = 'red', lty = 'dashed')

# Comparing quantiles:
quantile(x, probs = c(.025, .25, .5, .75, .975))
quantile(sample.post.pred, probs = c(.025, .25, .5, .75, .975))
# Not a great fit around the tails ...
##### Regression example:

# On the small automobile data set that is built into R:
y <- mtcars$mpg
x1 <- mtcars$disp
x2 <- mtcars$hp
x3 <- mtcars$wt
X <- as.matrix(cbind(rep(1, length(x1)), x1, x2, x3))

library(mvtnorm)  # to get "rmvt" function to sample from multivariate t distribution

# OLS estimates of the coefficients and error variance; under the
# standard noninformative prior these also drive the posterior
# predictive distribution used below.
bhat <- solve(t(X) %*% X) %*% t(X) %*% y
sig2hat <- t(y - X %*% bhat) %*% (y - X %*% bhat) / (nrow(X) - ncol(X))

# Letting X* just equal the observed X:
Xstar <- X

# Scale matrix for the multivariate-t posterior predictive distribution
# of the responses at the rows of Xstar:
my.Sigma <- as.numeric(sig2hat * (nrow(X) - ncol(X)) / (nrow(X) - ncol(X) - 2)) *
  (diag(rep(1, times = nrow(Xstar))) + Xstar %*% solve(t(X) %*% X) %*% t(Xstar))

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n = 500, sigma = my.Sigma, df = (nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior
# predictive distribution of response values (byrow = TRUE repeats the
# fitted-value row for each of the 500 draws):
pred.post.samp <- matrix(Xstar %*% bhat, nrow = nrow(pred.post.err),
                         ncol = ncol(pred.post.err), byrow = TRUE) + pred.post.err

matrix.of.ys <- matrix(y, nrow = nrow(pred.post.samp),
                       ncol = ncol(pred.post.samp), byrow = TRUE)

# Plotting original Y-values against samples from posterior predictive
# distribution of response values (points should straddle the 45-degree line):
matplot(matrix.of.ys, pred.post.samp, pch = 'o', cex = 0.3)
abline(0, 1)

# Examining model lack-of-fit (residuals alongside mpg/cyl, ordered by observed mpg):
cbind(mtcars[, 1:2], (y - Xstar %*% bhat))[order(y), ]

### Predicting the mileage for another car having dispersion 150,
### horsepower 100, and weight 2.7:
Xstar <- cbind(1, 150, 100, 2.7)

my.Sigma <- as.numeric(sig2hat * (nrow(X) - ncol(X)) / (nrow(X) - ncol(X) - 2)) *
  (diag(rep(1, times = nrow(Xstar))) + Xstar %*% solve(t(X) %*% X) %*% t(Xstar))

# Sampling from multivariate t distribution:
pred.post.err <- rmvt(n = 500, sigma = my.Sigma, df = (nrow(X) - ncol(X)))

# Adding multivariate t samples to fitted Y-values to get posterior
# predictive distribution of response values:
pred.post.samp <- matrix(Xstar %*% bhat, nrow = nrow(pred.post.err),
                         ncol = ncol(pred.post.err), byrow = TRUE) + pred.post.err

# Getting 95% prediction interval, and point prediction
# (via median of posterior predictive distribution):
quantile(pred.post.samp, probs = c(0.025, 0.5, 0.975))