# Chapter 17 R examples from Bayes Rules! textbook: # Load packages library(bayesrules) library(tidyverse) library(rstanarm) library(bayesplot) library(tidybayes) library(broom.mixed) # Load data data(cherry_blossom_sample) running <- cherry_blossom_sample # Remove NAs running <- running %>% select(runner, age, net) %>% na.omit() # Fitting the hierarchical varying-intercepts model: # This is a little slow ... running_model_1 <- stan_glmer( net ~ age + (1 | runner), data = running, family = gaussian, prior_intercept = normal(100, 10), prior = normal(2.5, 1), prior_aux = exponential(1, autoscale = TRUE), prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1), chains = 4, iter = 5000*2) # Recall the scatterplots of race time vs. age for each runner: ggplot(running, aes(x = age, y = net)) + geom_point() + facet_wrap(~ runner) # Lots of parameters, may be best to examine these individually as in Chapter 16... #mcmc_trace(running_model_1) #mcmc_dens_overlay(running_model_1) #mcmc_acf(running_model_1) # Posterior summaries of the global parameters: tidy_summary_1 <- tidy(running_model_1, effects = "fixed", conf.int = TRUE, conf.level = 0.80) tidy_summary_1 # Picture of the globally estimated regression line, with posterior uncertainty shown: B0 <- tidy_summary_1$estimate[1] B1 <- tidy_summary_1$estimate[2] running %>% add_fitted_draws(running_model_1, n = 200, re_formula = NA) %>% ggplot(aes(x = age, y = net)) + geom_line(aes(y = .value, group = .draw), alpha = 0.1) + geom_abline(intercept = B0, slope = B1, color = "blue") + lims(y = c(75, 110)) # Posterior summaries of runner-specific intercepts runner_summaries_1 <- running_model_1 %>% spread_draws(`(Intercept)`, b[,runner]) %>% mutate(runner_intercept = `(Intercept)` + b) %>% select(-`(Intercept)`, -b) %>% median_qi(.width = 0.80) %>% select(runner, runner_intercept, .lower, .upper) runner_summaries_1 %>% filter(runner %in% c("runner:4", "runner:5")) # 100 posterior plausible models for runners 4 & 5 running %>% filter(runner %in% c("4", "5")) %>% add_fitted_draws(running_model_1, n = 100) %>% ggplot(aes(x = age, y = net)) + geom_line( aes(y = .value, group = paste(runner, .draw), color = runner), alpha = 0.1) + geom_point(aes(color = runner)) # Plot runner-specific models with the global model ggplot(running, aes(y = net, x = age, group = runner)) + geom_abline(data = runner_summaries_1, color = "gray", aes(intercept = runner_intercept, slope = B1)) + geom_abline(intercept = B0, slope = B1, color = "blue") + lims(x = c(50, 61), y = c(50, 135)) tidy_sigma <- tidy(running_model_1, effects = "ran_pars") tidy_sigma sigma_0 <- tidy_sigma[1,3] sigma_y <- tidy_sigma[2,3] sigma_0^2 / (sigma_0^2 + sigma_y^2) sigma_y^2 / (sigma_0^2 + sigma_y^2) ## Varying intercepts and slopes model: # Plot runner-specific models in the data # for 4 selected runners: running %>% filter(runner %in% c("4", "5", "20", "29")) %>% ggplot(., aes(x = age, y = net)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + facet_grid(~ runner) # For all 36 runners: ggplot(running, aes(x = age, y = net, group = runner)) + geom_smooth(method = "lm", se = FALSE, size = 0.5) # Posterior simulation (SLOW! I use only 1 chain and fewer iterations here): running_model_2 <- stan_glmer( net ~ age + (age | runner), data = running, family = gaussian, prior_intercept = normal(100, 10), prior = normal(2.5, 1), prior_aux = exponential(1, autoscale = TRUE), prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1), chains = 1, iter = 1200*2, adapt_delta = 0.99999 ) # Quick summary of global regression parameters tidy(running_model_2, effects = "fixed", conf.int = TRUE, conf.level = 0.80) # Group-specific parameters: # Get MCMC chains for the runner-specific intercepts & slopes runner_chains_2 <- running_model_2 %>% spread_draws(`(Intercept)`, b[term, runner], `age`) %>% pivot_wider(names_from = term, names_glue = "b_{term}", values_from = b) %>% mutate(runner_intercept = `(Intercept)` + `b_(Intercept)`, runner_age = age + b_age) # Posterior medians of runner-specific models runner_summaries_2 <- runner_chains_2 %>% group_by(runner) %>% summarize(runner_intercept = median(runner_intercept), runner_age = median(runner_age)) # Check it out head(runner_summaries_2, 3) # Plots of runner-specific regression lines: ggplot(running, aes(y = net, x = age, group = runner)) + geom_abline(data = runner_summaries_2, color = "gray", aes(intercept = runner_intercept, slope = runner_age)) + lims(x = c(50, 61), y = c(50, 135)) # Two runners (note the shrinkage) runner_summaries_2 %>% filter(runner %in% c("runner:1", "runner:10")) ## Model selection # Which model to choose? Use pp_check to help judge model fit: complete_pooled_model <- stan_glm( net ~ age, data = running, family = gaussian, prior_intercept = normal(0, 2.5, autoscale = TRUE), prior = normal(0, 2.5, autoscale = TRUE), prior_aux = exponential(1, autoscale = TRUE), chains = 4, iter = 5000*2) pp_check(complete_pooled_model) + labs(x = "net", title = "complete pooled model") dev.new() pp_check(running_model_1) + labs(x = "net", title = "running model 1") dev.new() pp_check(running_model_2) + labs(x = "net", title = "running model 2") # Checking predictive accuracy: prediction_summary(model = running_model_1, data = running) prediction_summary(model = running_model_2, data = running) # Doing the cross-validation version would take a LOOOONG time. # Calculate ELPD for the 2 models elpd_hierarchical_1 <- loo(running_model_1) elpd_hierarchical_2 <- loo(running_model_2) elpd_hierarchical_1$estimates elpd_hierarchical_2$estimates ## Posterior prediction of race time at age 61 for two runners in the sample, and a new individual: ## posterior_predict function may not work right: ## Using the varying-intercepts model: # Simulate posterior predictive models for the 3 runners predict_next_race <- posterior_predict( running_model_1, newdata = data.frame(runner = c("1", "Miles", "10"), age = c(61, 61, 61))) colMeans(predict_next_race) # Posterior predictive model plots mcmc_areas(predict_next_race, prob = 0.8) + ggplot2::scale_y_discrete(labels = c("runner 1", "Miles", "runner 10")) # Store the simulation in a data frame running_model_1_df <- as.data.frame(running_model_1) ## Posterior prediction of race time at age 61 for two runners in the sample, and a new individual (using base R rather than posterior_predict function): Miles_chains <- running_model_1_df %>% mutate(sigma_mu = sqrt(`Sigma[runner:(Intercept),(Intercept)]`), mu_miles = rnorm(5000, `(Intercept)`+`age`*61, sigma_mu), y_miles = rnorm(5000, mu_miles, sigma)) # Posterior predictive summaries Miles_chains %>% mean_qi(y_miles, .width = 0.80) # Similar to above: quantile(Miles_chains$y_miles, c(0.1,0.5,0.9)) runner_1.draws<-running_model_1_df[,'(Intercept)']+running_model_1_df[,'b[(Intercept) runner:1]']+ running_model_1_df[,'age']*61+running_model_1_df[,'sigma'] runner_10.draws<-running_model_1_df[,'(Intercept)']+running_model_1_df[,'b[(Intercept) runner:10]']+ running_model_1_df[,'age']*61+running_model_1_df[,'sigma'] quantile(runner_1.draws, c(0.1,0.5,0.9)) quantile(runner_10.draws, c(0.1,0.5,0.9))