# Chapter 17 R examples from Bayes Rules! textbook:

# Load packages
library(bayesrules)
library(tidyverse)
library(rstanarm)
library(bayesplot)
library(tidybayes)
library(broom.mixed)

# Load data
data(cherry_blossom_sample)
running <- cherry_blossom_sample

# Remove NAs
running <- running %>% 
  select(runner, age, net) %>% 
  na.omit()

# Fitting the hierarchical varying-intercepts model:

# This is a little slow ...

running_model_1 <- stan_glmer(
  net ~ age + (1 | runner), 
  data = running, family = gaussian,
  prior_intercept = normal(100, 10),
  prior = normal(2.5, 1), 
  prior_aux = exponential(1, autoscale = TRUE),
  prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1),
  chains = 4, iter = 5000*2)

# Recall the scatterplots of race time vs. age for each runner:

ggplot(running, aes(x = age, y = net)) + 
  geom_point() + 
  facet_wrap(~ runner)

# Lots of parameters, may be best to examine these individually as in Chapter 16...
#mcmc_trace(running_model_1)
#mcmc_dens_overlay(running_model_1)
#mcmc_acf(running_model_1)

# Posterior summaries of the global parameters:

tidy_summary_1 <- tidy(running_model_1, effects = "fixed",
                       conf.int = TRUE, conf.level = 0.80)
tidy_summary_1

# Picture of the globally estimated regression line, with posterior uncertainty shown:

B0 <- tidy_summary_1$estimate[1]
B1 <- tidy_summary_1$estimate[2]
running %>%
  add_fitted_draws(running_model_1, n = 200, re_formula = NA) %>%
  ggplot(aes(x = age, y = net)) +
    geom_line(aes(y = .value, group = .draw), alpha = 0.1) +
    geom_abline(intercept = B0, slope = B1, color = "blue") +
    lims(y = c(75, 110))

# Posterior summaries of runner-specific intercepts
runner_summaries_1 <- running_model_1 %>%
  spread_draws(`(Intercept)`, b[,runner]) %>% 
  mutate(runner_intercept = `(Intercept)` + b) %>% 
  select(-`(Intercept)`, -b) %>% 
  median_qi(.width = 0.80) %>% 
  select(runner, runner_intercept, .lower, .upper)

runner_summaries_1 %>% 
  filter(runner %in% c("runner:4", "runner:5"))


# 100 posterior plausible models for runners 4 & 5
running %>%
  filter(runner %in% c("4", "5")) %>% 
  add_fitted_draws(running_model_1, n = 100) %>%
  ggplot(aes(x = age, y = net)) +
    geom_line(
      aes(y = .value, group = paste(runner, .draw), color = runner),
      alpha = 0.1) +
    geom_point(aes(color = runner))


# Plot runner-specific models with the global model
ggplot(running, aes(y = net, x = age, group = runner)) + 
  geom_abline(data = runner_summaries_1, color = "gray",
              aes(intercept = runner_intercept, slope = B1)) + 
  geom_abline(intercept = B0, slope = B1, color = "blue") + 
  lims(x = c(50, 61), y = c(50, 135))


tidy_sigma <- tidy(running_model_1, effects = "ran_pars")
tidy_sigma
sigma_0 <- tidy_sigma[1,3]
sigma_y <- tidy_sigma[2,3]
sigma_0^2 / (sigma_0^2 + sigma_y^2)
sigma_y^2 / (sigma_0^2 + sigma_y^2)


## Varying intercepts and slopes model:

# Plot runner-specific models in the data
# for 4 selected runners:
running %>% 
  filter(runner %in% c("4", "5", "20", "29")) %>% 
  ggplot(., aes(x = age, y = net)) + 
    geom_point() + 
    geom_smooth(method = "lm", se = FALSE) + 
    facet_grid(~ runner)

# For all 36 runners:

ggplot(running, aes(x = age, y = net, group = runner)) + 
  geom_smooth(method = "lm", se = FALSE, size = 0.5)

# Posterior simulation (SLOW! I use only 1 chain and fewer iterations here):

running_model_2 <- stan_glmer(
  net ~ age + (age | runner),
  data = running, family = gaussian,
  prior_intercept = normal(100, 10),
  prior = normal(2.5, 1), 
  prior_aux = exponential(1, autoscale = TRUE),
  prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1),
  chains = 1, iter = 1200*2, adapt_delta = 0.99999
)


# Quick summary of global regression parameters
tidy(running_model_2, effects = "fixed", conf.int = TRUE, conf.level = 0.80)

# Group-specific parameters:

# Get MCMC chains for the runner-specific intercepts & slopes
runner_chains_2 <- running_model_2 %>%
  spread_draws(`(Intercept)`, b[term, runner], `age`) %>% 
  pivot_wider(names_from = term, names_glue = "b_{term}",
              values_from = b) %>% 
  mutate(runner_intercept = `(Intercept)` + `b_(Intercept)`,
         runner_age = age + b_age)

# Posterior medians of runner-specific models
runner_summaries_2 <- runner_chains_2 %>% 
  group_by(runner) %>% 
  summarize(runner_intercept = median(runner_intercept),
            runner_age = median(runner_age))

# Check it out
head(runner_summaries_2, 3)

# Plots of runner-specific regression lines:

ggplot(running, aes(y = net, x = age, group = runner)) + 
  geom_abline(data = runner_summaries_2, color = "gray",
              aes(intercept = runner_intercept, slope = runner_age)) + 
  lims(x = c(50, 61), y = c(50, 135))

# Two runners (note the shrinkage)

runner_summaries_2 %>% 
  filter(runner %in% c("runner:1", "runner:10"))

## Model selection

# Which model to choose?  Use pp_check to help judge model fit:

complete_pooled_model <- stan_glm(
  net ~ age, 
  data = running, family = gaussian, 
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE), 
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 5000*2)

pp_check(complete_pooled_model) + 
  labs(x = "net", title = "complete pooled model")
dev.new()
pp_check(running_model_1) + 
  labs(x = "net", title = "running model 1")
dev.new()
pp_check(running_model_2) + 
  labs(x = "net", title = "running model 2")


# Checking predictive accuracy:

prediction_summary(model = running_model_1, data = running)

prediction_summary(model = running_model_2, data = running)

# Doing the cross-validation version would take a LOOOONG time.

# Calculate ELPD for the 2 models
elpd_hierarchical_1 <- loo(running_model_1)
elpd_hierarchical_2 <- loo(running_model_2)

elpd_hierarchical_1$estimates 
elpd_hierarchical_2$estimates  

## Posterior prediction of race time at age 61 for two runners in the sample, and a new individual:
## posterior_predict function may not work right:

## Using the varying-intercepts model:

# Simulate posterior predictive models for the 3 runners
predict_next_race <- posterior_predict(
  running_model_1, 
  newdata = data.frame(runner = c("1", "Miles", "10"),
                       age = c(61, 61, 61)))

colMeans(predict_next_race)

# Posterior predictive model plots
mcmc_areas(predict_next_race, prob = 0.8) +
 ggplot2::scale_y_discrete(labels = c("runner 1", "Miles", "runner 10"))


# Store the simulation in a data frame
running_model_1_df <- as.data.frame(running_model_1)

## Posterior prediction of race time at age 61 for two runners in the sample, and a new individual (using base R rather than posterior_predict function):

Miles_chains <- running_model_1_df %>%
  mutate(sigma_mu = sqrt(`Sigma[runner:(Intercept),(Intercept)]`),
         mu_miles = rnorm(5000, `(Intercept)`+`age`*61, sigma_mu),
         y_miles = rnorm(5000, mu_miles, sigma))

# Posterior predictive summaries
Miles_chains %>% 
  mean_qi(y_miles, .width = 0.80)

# Similar to above:
quantile(Miles_chains$y_miles, c(0.1,0.5,0.9))

runner_1.draws<-running_model_1_df[,'(Intercept)']+running_model_1_df[,'b[(Intercept) runner:1]']+ running_model_1_df[,'age']*61+running_model_1_df[,'sigma']

runner_10.draws<-running_model_1_df[,'(Intercept)']+running_model_1_df[,'b[(Intercept) runner:10]']+ running_model_1_df[,'age']*61+running_model_1_df[,'sigma']


quantile(runner_1.draws, c(0.1,0.5,0.9))

quantile(runner_10.draws, c(0.1,0.5,0.9))