# High School students Award data (simulated data, from https://stats.idre.ucla.edu/stat/data/poisson_sim.csv)

award.data <- read.csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
award.data

####
#### Using Metropolis-Hastings in base R to estimate the Poisson regression model:

library(mvtnorm)
# May need to install the mvtnorm package first?
# If so, type at the command line:  install.packages("mvtnorm", dependencies=T)
# while plugged in to the internet.

y <- award.data[,2]
x1 <- award.data[,4]                    # math score
x2 <- ifelse(award.data[,3]==2, 1, 0)   # x2=1 for "academic" track students
x3 <- ifelse(award.data[,3]==3, 1, 0)   # x3=1 for "vocational" track students
x1x2 <- x1*x2  # in case we want to include an interaction term
x1x3 <- x1*x3  # in case we want to include an interaction term

X <- cbind(rep(1, times=length(x1)), x1, x2, x3)
p <- dim(X)[2]  # number of columns of matrix X

beta.prior.mean <- c(0, 3, 2, -1)
# prior belief that the number of awards will be higher for students with higher math scores,
# higher for "academic" track students, and lower for "vocational" track students
beta.prior.sd <- rep(10, times=p)

k <- 1  # Can adjust this k up or down if the acceptance rate is too high or low...
proposal.cov.matrix <- k*var(log(y+1/2))*solve(t(X)%*%X)
# should be reasonable in this case: should be close to (sigma^2)*(X'X)^{-1}

S <- 100000  # Make this large (especially if you need to thin the chain), as long as it doesn't take too much time

beta.current <- beta.prior.mean  # initial value for the M-H algorithm; reasonable to use the prior mean vector
acs <- 0  # will be used to track the acceptance rate
beta.values <- matrix(0, nrow=S, ncol=p)  # will store the sampled values of the beta vector

for (s in 1:S) {
  beta.proposed <- t(rmvnorm(1, beta.current, proposal.cov.matrix))
  log.accept.ratio <- sum(dpois(y, exp(X %*% beta.proposed), log=T)) -
                      sum(dpois(y, exp(X %*% beta.current), log=T)) +
                      sum(dnorm(beta.proposed, beta.prior.mean, beta.prior.sd, log=T)) -
                      sum(dnorm(beta.current, beta.prior.mean, beta.prior.sd, log=T))
  if (log.accept.ratio > log(runif(1))) {
    beta.current <- beta.proposed
    acs <- acs + 1
  }
  beta.values[s,] <- beta.current
}

acs/S  # gives the acceptance rate

acf(beta.values[,1])  # plot autocorrelation values for beta_0
acf(beta.values[,2])  # plot autocorrelation values for beta_1
acf(beta.values[,3])  # plot autocorrelation values for beta_2
acf(beta.values[,4])  # plot autocorrelation values for beta_3
# Seems to be an issue with serial dependence.

# Thinning out the sampled values by taking every 10th row:
beta.values.thin <- beta.values[10*(1:(S/10)),]
# Looks much better...

# Trace plots for the sampled beta_0, beta_1, beta_2, beta_3 values:
par(mfrow=c(4,1))
plot(beta.values.thin[,1], type='l')
plot(beta.values.thin[,2], type='l')
plot(beta.values.thin[,3], type='l')
plot(beta.values.thin[,4], type='l')
par(mfrow=c(1,1))
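# (Optional extra diagnostic, a sketch using the 'coda' package, also loaded below:
# the effective sample size estimates how many independent draws the autocorrelated
# chain is worth. Values far below the number of rows suggest thinning more
# aggressively or re-tuning the proposal via k.)
library(coda)
effectiveSize(beta.values)       # ESS for each column of the raw chain
effectiveSize(beta.values.thin)  # should be much closer to the number of retained draws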
# Geweke diagnostic for checking convergence of an MCMC chain:
# This gives a z-statistic that compares the mean values of
# (by default) the first 10% of the chain vs. the last 50% of the chain:
library(coda)
geweke.diag(beta.values.thin[,2])

# P-value associated with the Geweke diagnostic
# (a small p-value indicates a LACK of convergence):
2*pnorm(abs(geweke.diag(beta.values.thin[,2])[[1]]), lower.tail=F)

# Further removing the first 10% of the MCMC sample as burn-in:
beta.values.thin.b <- beta.values.thin[-(1:1000),]

# Now try the trace plots again, minus the burn-in:
par(mfrow=c(4,1))
plot(beta.values.thin.b[,1], type='l')
plot(beta.values.thin.b[,2], type='l')
plot(beta.values.thin.b[,3], type='l')
plot(beta.values.thin.b[,4], type='l')
par(mfrow=c(1,1))
# Nice hairy caterpillars now ...

### Posterior summary:

apply(beta.values.thin.b, 2, median)  # posterior medians for each regression coefficient

cbind(apply(beta.values.thin.b, 2, quantile, probs=0.025),
      apply(beta.values.thin.b, 2, quantile, probs=0.975))
# approximate 0.025 and 0.975 quantiles for each regression coefficient,
# which give approximate 95% quantile-based intervals

library(TeachingDemos)
t(apply(beta.values.thin.b, 2, emp.hpd, conf=0.95))  # approximate 95% HPD intervals

## Posterior prediction for model fit:

num.draws <- nrow(beta.values.thin.b)
y.hats <- matrix(0, nrow=num.draws, ncol=nrow(X))
for (i in 1:nrow(beta.values.thin.b)) {
  y.hats[i,] <- exp(X %*% as.matrix(beta.values.thin.b[i,]))
}
matrix.of.ys <- matrix(y, nrow=nrow(y.hats), ncol=ncol(y.hats), byrow=T)

# In-sample MAE:
mae <- mean(abs(matrix.of.ys - y.hats)); print(mae)

# Could fit a different model (like without x1? or with an interaction term?)
# and compare MAEs across models ... smaller would be better.

### Poisson regression using the 'rstanarm' package:

# Load packages
library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)

prog <- as.factor(award.data$prog)
awards_data <- data.frame(y, x1, x2, x3, prog)
# stan_glm wants to start out with the data in a data frame...

awards_model <- stan_glm(y ~ x1 + x2 + x3,
                         data = awards_data,
                         family = poisson,
                         prior_intercept = normal(0, 0.5),
                         prior = normal(c(3,2,-1), 2.5, autoscale = TRUE),
                         chains = 4, iter = 5000*2, prior_PD = FALSE)
# Note this asks for 4 different MCMC chains.

# Some built-in MCMC checks:
mcmc_trace(awards_model)
mcmc_dens_overlay(awards_model)
mcmc_acf(awards_model)
# A little autocorrelation, but nothing bad.

# Plotting 5 posterior predictive histograms/bar charts
# (not densities, since Y is a discrete random variable here):
pp_check(awards_model, plotfun = "hist", nreps = 5) + xlab("awards")
# The predictive data sets match the observed data set pretty well ... the Poisson model looks good.

# Plotting 50 posterior plausible models:
awards_data %>%
  add_fitted_draws(awards_model, n = 50) %>%
  ggplot(aes(x = x1, y = y, color = prog)) +
  geom_line(aes(y = .value, group = paste(prog, .draw)), alpha = .1) +
  geom_point(data = awards_data, size = 0.1)
# Visually, we can see that academic-track students tend to be predicted to get the most awards.
# And the predicted number of awards increases with math test score.
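# (Interpretation sketch: because of the log link, exponentiated coefficients act
# multiplicatively on the expected award count. For example, exp(coefficient on x1)
# is the rate ratio per one-point increase in math score, holding program track fixed.)
exp(coef(awards_model))  # posterior median rate ratios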
# Summary of point estimates and interval estimates for the betas:
tidy(awards_model, conf.int = TRUE, conf.level = 0.95)

# Another type of summary of the posterior estimates for the parameters:
# summary(awards_model, digits=4)

# Posterior prediction of award counts for a student on the "general" track with a math score of 40:
example_prediction <- posterior_predict(
  awards_model, newdata = data.frame(x1 = 40, x2 = 0, x3 = 0))
table(example_prediction)

# A posterior predictive distribution for this individual:
table(example_prediction)/length(example_prediction)

# Posterior predictive mean for this individual:
post.pred.mean <- as.numeric(
  as.numeric(row.names(table(example_prediction))) %*%
  as.numeric(table(example_prediction)/length(example_prediction)))
print(post.pred.mean)

# Histogram of the posterior predictive distribution for this individual:
mcmc_hist(example_prediction, binwidth = 1) +
  xlab("Predicted number of awards, general student with math score 40")

# Evaluating model fit:
poisson_predictions <- posterior_predict(awards_model, newdata = awards_data)

# Plot the posterior predictive models for each student:
ppc_intervals_grouped(awards_data$y, yrep = poisson_predictions,
                      x = awards_data$x1, group = awards_data$prog,
                      prob = 0.5, prob_outer = 0.95,
                      facet_args = list(scales = "fixed"))
# Do the dark blue circles (observed counts) fall near the light blue circles (predicted counts)?
# Do most of the dark blue circles (observed counts) fall within the 95% prediction intervals?

# A numerical summary of in-sample model fit:
prediction_summary(model = awards_model, data = awards_data)

# A (better?) numerical summary of out-of-sample model fit:
poisson_cv <- prediction_summary_cv(model = awards_model, data = awards_data, k = 10)
poisson_cv$cv

# Could fit a different model (like without x1? or with an interaction term?)
# and compare MAEs across models ... smaller would be better.

################################################
##
## Negative Binomial regression example
##
################################################

library(bayesrules)

# Load data
data(pulse_of_the_nation)

# Eliminate the outliers who reported reading 100 or more books in a year...
pulse <- pulse_of_the_nation %>% filter(books < 100)
# To see the whole data set, you can type:
# print.data.frame(pulse)

# Looking at some summary plots:
ggplot(pulse, aes(x = books)) + geom_histogram(color = "white")
ggplot(pulse, aes(y = books, x = age)) + geom_point()
ggplot(pulse, aes(y = books, x = wise_unwise)) + geom_boxplot()

# Try a Poisson model first:
books_poisson_sim <- stan_glm(
  books ~ age + wise_unwise,
  data = pulse, family = poisson,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 5000*2, seed = 84735)

## Coding comment: Note there are two predictors in the model, so in addition to the
## intercept beta_0, there is both a beta_1 and a beta_2.
## The line:          prior = normal(0, 2.5, autoscale = TRUE),
## is equivalent to:  prior = normal(c(0,0), 2.5, autoscale = TRUE),
## That is, when only one number is given as a prior mean, that number is repeated
## as the prior mean of ALL the coefficients.
## (Also note that prior_aux is only relevant for families with an auxiliary
## parameter; the Poisson has none, so it plays no role in this particular fit.)

# The posterior predictive distribution indicates a poor fit (this is based on a density plot):
pp_check(books_poisson_sim) + xlab("books")
# Not a close fit at all!
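# (Another view of the misfit, a sketch using bayesplot's ppc_stat with a custom
# test statistic: compare the observed proportion of zero counts to that statistic
# across the posterior predictive data sets. Overdispersed count data typically
# have more zeros than a Poisson model predicts.)
prop_zero <- function(y) mean(y == 0)
ppc_stat(pulse$books, posterior_predict(books_poisson_sim), stat = "prop_zero") +
  xlab("proportion of zeros")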
# Since Y is discrete, maybe you'd prefer to show the predictive distribution using bar charts:
pp_check(books_poisson_sim, plotfun = "hist", nreps = 5) + xlab("books")
# Either way, the predictive distributions don't look like the observed data.

## Why? The variance is much greater than the mean for this data set:
## (The Poisson distribution has its mean and variance equal.)
mean(pulse$books)
var(pulse$books)
# or in tidyverse style:
pulse %>% summarize(mean = mean(books), var = var(books))

# More relevant for the regression model, which models the response conditional on predictor values:
# mean and variability in readership among subjects of similar age and wise_unwise response:
pulse %>%
  group_by(cut(age,3), wise_unwise) %>%
  summarize(mean = mean(books), var = var(books))
## The variance is way bigger than the mean, even within subsets of observations
## with similar predictor values.

## Try a negative binomial regression model:
library(rstanarm)
books_negbin_sim <- stan_glm(
  books ~ age + wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 5000*2)

## Coding comment: as with the Poisson model above, the single prior mean in
## prior = normal(0, 2.5, autoscale = TRUE) is recycled as the prior mean of
## BOTH coefficients (beta_1 and beta_2).

pp_check(books_negbin_sim) + xlim(0, 75) + xlab("books")

# Some built-in MCMC checks:
mcmc_trace(books_negbin_sim)
mcmc_dens_overlay(books_negbin_sim)
mcmc_acf(books_negbin_sim)
# Looks perfect.

# Numerical summaries:
tidy(books_negbin_sim, conf.int = TRUE, conf.level = 0.95)

# Evaluating model fit:
nb_predictions <- posterior_predict(books_negbin_sim, newdata = pulse)

# Posterior prediction of the number of books read, for a person of age 30
# who would rather be wise but unhappy:
example_prediction_nb <- posterior_predict(
  books_negbin_sim, newdata = data.frame(age = 30, wise_unwise = 'Wise but Unhappy'))
table(example_prediction_nb)

# A posterior predictive distribution for this individual:
table(example_prediction_nb)/length(example_prediction_nb)

# Posterior predictive mean for this individual:
post.pred.mean <- as.numeric(
  as.numeric(row.names(table(example_prediction_nb))) %*%
  as.numeric(table(example_prediction_nb)/length(example_prediction_nb)))
print(post.pred.mean)

### Easiest to see with a plot here:
# Histogram of the posterior predictive distribution for this individual:
mcmc_hist(example_prediction_nb, binwidth = 1) +
  xlab("Predicted number of books read, person age 30, rather be wise but unhappy")

# Plot the posterior predictive models for each observation:
# Big data set, so this takes a while...
ppc_intervals_grouped(pulse$books, yrep = nb_predictions,
                      x = pulse$age, group = pulse$wise_unwise,
                      prob = 0.5, prob_outer = 0.95,
                      facet_args = list(scales = "fixed"))
# Do the dark blue circles (observed counts) fall near the light blue circles (predicted counts)?
# Do most of the dark blue circles (observed counts) fall within the 95% prediction intervals?

# A numerical summary of in-sample model fit:
prediction_summary(model = books_negbin_sim, data = pulse)
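# (Background sketch: under the neg_binomial_2 parameterization, Var(Y|x) = mu + mu^2/phi,
# where phi is the auxiliary parameter that rstanarm reports as 'reciprocal_dispersion';
# the larger phi is, the closer the model is to the Poisson. To inspect its posterior:)
summary(books_negbin_sim, pars = "reciprocal_dispersion", digits = 3)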
# A (better?) numerical summary of out-of-sample model fit:
# This takes a long time to run since it's a large data set.
nb_cv <- prediction_summary_cv(model = books_negbin_sim, data = pulse, k = 10)
nb_cv$cv

# Comparing a couple of other models:
# Setting chains=1 for speed reasons...
books_negbin_sim_only_wise <- stan_glm(
  books ~ wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 5000*2)

books_negbin_sim_interact <- stan_glm(
  books ~ age + wise_unwise + age:wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 5000*2)

# This takes a while ....
loo(books_negbin_sim)
loo(books_negbin_sim_only_wise)
loo(books_negbin_sim_interact)

## Fitting the model using regular R coding can be done with the negative binomial model,
## but the code is not as easy, since there is an extra parameter (mean plus dispersion)
## rather than the single parameter of the Poisson.
## I won't provide the base R code for it here...
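## One more convenience for the model comparison above (a sketch): loo_compare()
## from the 'loo' package (re-exported by rstanarm) puts the three loo() results
## side by side, ranked by expected log predictive density; the top row fits best:
loo_compare(loo(books_negbin_sim),
            loo(books_negbin_sim_only_wise),
            loo(books_negbin_sim_interact))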