# High School students Award data (simulated data, from https://stats.idre.ucla.edu/stat/data/poisson_sim.csv)

award.data <- read.csv("https://stats.idre.ucla.edu/stat/data/poisson_sim.csv")
award.data

####
#### Using Metropolis-Hastings in base R to estimate the Poisson regression model:

library(mvtnorm)
# May need to install the mvtnorm package first?
# If so, type at the command line:  install.packages("mvtnorm", dependencies=T)
# while plugged in to the internet.

y <- award.data[,2]
x1 <- award.data[,4]                    # math score
x2 <- ifelse(award.data[,3]==2, 1, 0)   # x2=1 for "academic" track students
x3 <- ifelse(award.data[,3]==3, 1, 0)   # x3=1 for "vocational" track students
x1x2 <- x1*x2  # in case we want to include an interaction term
x1x3 <- x1*x3  # in case we want to include an interaction term

X <- cbind(rep(1, times=length(x1)), x1, x2, x3)
p <- dim(X)[2]  # number of columns of matrix X

beta.prior.mean <- c(0, 3, 2, -1)
# prior belief that the number of awards will be higher for students with higher math scores,
# higher for "academic" track students, and lower for "vocational" track students
beta.prior.sd <- rep(10, times=p)

k <- 1  # Can adjust this k up or down if the acceptance rate is too high or low...
proposal.cov.matrix <- k*var(log(y+1/2))*solve(t(X)%*%X)
# should be reasonable in this case: should be close to (sigma^2)*(X'X)^{-1}

S <- 100000  # Make this large (especially if you need to thin the chain), as long as it doesn't take too much time

beta.current <- beta.prior.mean  # initial value for the M-H algorithm; reasonable to use the prior mean vector
acs <- 0  # will be used to track the acceptance rate
beta.values <- matrix(0, nrow=S, ncol=p)  # will store the sampled values of the beta vector

for (s in 1:S) {
  beta.proposed <- t(rmvnorm(1, beta.current, proposal.cov.matrix))
  log.accept.ratio <- sum(dpois(y, exp(X %*% beta.proposed), log=T)) -
                      sum(dpois(y, exp(X %*% beta.current), log=T)) +
                      sum(dnorm(beta.proposed, beta.prior.mean, beta.prior.sd, log=T)) -
                      sum(dnorm(beta.current, beta.prior.mean, beta.prior.sd, log=T))
  if (log.accept.ratio > log(runif(1))) {
    beta.current <- beta.proposed
    acs <- acs + 1
  }
  beta.values[s,] <- beta.current
}

acs/S  # gives the acceptance rate

acf(beta.values[,1])  # plot autocorrelation values for beta_0
acf(beta.values[,2])  # plot autocorrelation values for beta_1
acf(beta.values[,3])  # plot autocorrelation values for beta_2
acf(beta.values[,4])  # plot autocorrelation values for beta_3
# Seems to be an issue with serial dependence.

# Thinning out the sampled values by taking every 10th row:
beta.values.thin <- beta.values[10*(1:(S/10)),]
# Looks much better...

# Trace plots for the sampled beta_0, beta_1, beta_2, beta_3 values:
par(mfrow=c(4,1))
plot(beta.values.thin[,1], type='l')
plot(beta.values.thin[,2], type='l')
plot(beta.values.thin[,3], type='l')
plot(beta.values.thin[,4], type='l')
par(mfrow=c(1,1))
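# (Optional extra diagnostic, a sketch using the 'coda' package, also loaded below:
# the effective sample size estimates how many independent draws the autocorrelated
# chain is worth. Values far below the number of rows suggest thinning more
# aggressively or re-tuning the proposal via k.)
library(coda)
effectiveSize(beta.values)       # ESS for each column of the raw chain
effectiveSize(beta.values.thin)  # should be much closer to the number of retained draws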
# Geweke diagnostic for checking convergence of an MCMC chain:
# This gives a z-statistic that compares the mean values of
# (by default) the first 10% of the chain vs. the last 50% of the chain:
library(coda)
geweke.diag(beta.values.thin[,2])

# P-value associated with the Geweke diagnostic
# (a small p-value indicates a LACK of convergence):
2*pnorm(abs(geweke.diag(beta.values.thin[,2])[[1]]), lower.tail=F)

# Further removing the first 10% of the MCMC sample as burn-in:
beta.values.thin.b <- beta.values.thin[-(1:1000),]

# Now try the trace plots again, minus the burn-in:
par(mfrow=c(4,1))
plot(beta.values.thin.b[,1], type='l')
plot(beta.values.thin.b[,2], type='l')
plot(beta.values.thin.b[,3], type='l')
plot(beta.values.thin.b[,4], type='l')
par(mfrow=c(1,1))
# Nice hairy caterpillars now ...

### Posterior summary:

apply(beta.values.thin.b, 2, median)  # posterior medians for each regression coefficient

cbind(apply(beta.values.thin.b, 2, quantile, probs=0.025),
      apply(beta.values.thin.b, 2, quantile, probs=0.975))
# approximate 0.025 and 0.975 quantiles for each regression coefficient,
# which give approximate 95% quantile-based intervals

library(TeachingDemos)
t(apply(beta.values.thin.b, 2, emp.hpd, conf=0.95))  # approximate 95% HPD intervals

## Posterior prediction for model fit:

num.draws <- nrow(beta.values.thin.b)
y.hats <- matrix(0, nrow=num.draws, ncol=nrow(X))
for (i in 1:nrow(beta.values.thin.b)) {
  y.hats[i,] <- exp(X %*% as.matrix(beta.values.thin.b[i,]))
}
matrix.of.ys <- matrix(y, nrow=nrow(y.hats), ncol=ncol(y.hats), byrow=T)

# In-sample MAE:
mae <- mean(abs(matrix.of.ys - y.hats)); print(mae)

# Could fit a different model (like without x1? or with an interaction term?)
# and compare MAEs across models ... smaller would be better.

### Poisson regression using the 'rstanarm' package:

# Load packages
library(bayesrules)
library(rstanarm)
library(bayesplot)
library(tidyverse)
library(tidybayes)
library(broom.mixed)

prog <- as.factor(award.data$prog)
awards_data <- data.frame(y, x1, x2, x3, prog)
# stan_glm wants to start out with the data in a data frame...

awards_model <- stan_glm(y ~ x1 + x2 + x3,
                         data = awards_data,
                         family = poisson,
                         prior_intercept = normal(0, 0.5),
                         prior = normal(c(3,2,-1), 2.5, autoscale = TRUE),
                         chains = 4, iter = 5000*2, prior_PD = FALSE)
# Note this asks for 4 different MCMC chains.

# Some built-in MCMC checks:
mcmc_trace(awards_model)
mcmc_dens_overlay(awards_model)
mcmc_acf(awards_model)
# A little autocorrelation, but nothing bad.

# Plotting 5 posterior predictive histograms/bar charts
# (not densities, since Y is a discrete random variable here):
pp_check(awards_model, plotfun = "hist", nreps = 5) + xlab("awards")
# The predictive data sets match the observed data set pretty well ... the Poisson model looks good.

# Plotting 50 posterior plausible models:
awards_data %>%
  add_fitted_draws(awards_model, n = 50) %>%
  ggplot(aes(x = x1, y = y, color = prog)) +
  geom_line(aes(y = .value, group = paste(prog, .draw)), alpha = .1) +
  geom_point(data = awards_data, size = 0.1)
# Visually, we can see that academic-track students tend to be predicted to get the most awards.
# And the predicted number of awards increases with math test score.
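# (Interpretation sketch: because of the log link, exponentiated coefficients act
# multiplicatively on the expected award count. For example, exp(coefficient on x1)
# is the rate ratio per one-point increase in math score, holding program track fixed.)
exp(coef(awards_model))  # posterior median rate ratios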
# Summary of point estimates and interval estimates for the betas:
tidy(awards_model, conf.int = TRUE, conf.level = 0.95)

# Another type of summary of the posterior estimates for the parameters:
# summary(awards_model, digits=4)

# Posterior prediction of award counts for a student on the "general" track with a math score of 40:
example_prediction <- posterior_predict(
  awards_model, newdata = data.frame(x1 = 40, x2 = 0, x3 = 0))
table(example_prediction)

# A posterior predictive distribution for this individual:
table(example_prediction)/length(example_prediction)

# Posterior predictive mean for this individual:
post.pred.mean <- as.numeric(
  as.numeric(row.names(table(example_prediction))) %*%
  as.numeric(table(example_prediction)/length(example_prediction)))
print(post.pred.mean)

# Histogram of the posterior predictive distribution for this individual:
mcmc_hist(example_prediction, binwidth = 1) +
  xlab("Predicted number of awards, general student with math score 40")

# Evaluating model fit:
poisson_predictions <- posterior_predict(awards_model, newdata = awards_data)

# Plot the posterior predictive models for each student:
ppc_intervals_grouped(awards_data$y, yrep = poisson_predictions,
                      x = awards_data$x1, group = awards_data$prog,
                      prob = 0.5, prob_outer = 0.95,
                      facet_args = list(scales = "fixed"))
# Do the dark blue circles (observed counts) fall near the light blue circles (predicted counts)?
# Do most of the dark blue circles (observed counts) fall within the 95% prediction intervals?

# A numerical summary of in-sample model fit:
prediction_summary(model = awards_model, data = awards_data)

# A (better?) numerical summary of out-of-sample model fit:
poisson_cv <- prediction_summary_cv(model = awards_model, data = awards_data, k = 10)
poisson_cv$cv

# Could fit a different model (like without x1? or with an interaction term?)
# and compare MAEs across models ... smaller would be better.

################################################
##
## Negative Binomial regression example
##
################################################

library(bayesrules)

# Load data
data(pulse_of_the_nation)

# Eliminate the outliers who reported reading 100 or more books in a year...
pulse <- pulse_of_the_nation %>% filter(books < 100)
# To see the whole data set, you can type:
# print.data.frame(pulse)

# Looking at some summary plots:
ggplot(pulse, aes(x = books)) + geom_histogram(color = "white")
ggplot(pulse, aes(y = books, x = age)) + geom_point()
ggplot(pulse, aes(y = books, x = wise_unwise)) + geom_boxplot()

# Try a Poisson model first:
books_poisson_sim <- stan_glm(
  books ~ age + wise_unwise,
  data = pulse, family = poisson,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 5000*2, seed = 84735)

## Coding comment: Note there are two predictors in the model, so in addition to the
## intercept beta_0, there is both a beta_1 and a beta_2.
## The line:          prior = normal(0, 2.5, autoscale = TRUE),
## is equivalent to:  prior = normal(c(0,0), 2.5, autoscale = TRUE),
## That is, when only one number is given as a prior mean, that number is repeated
## as the prior mean of ALL the coefficients.
## (Also note that prior_aux is only relevant for families with an auxiliary
## parameter; the Poisson has none, so it plays no role in this particular fit.)

# The posterior predictive distribution indicates a poor fit (this is based on a density plot):
pp_check(books_poisson_sim) + xlab("books")
# Not a close fit at all!
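# (Another view of the misfit, a sketch using bayesplot's ppc_stat with a custom
# test statistic: compare the observed proportion of zero counts to that statistic
# across the posterior predictive data sets. Overdispersed count data typically
# have more zeros than a Poisson model predicts.)
prop_zero <- function(y) mean(y == 0)
ppc_stat(pulse$books, posterior_predict(books_poisson_sim), stat = "prop_zero") +
  xlab("proportion of zeros")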
# Since Y is discrete, maybe you'd prefer to show the predictive distribution using bar charts:
pp_check(books_poisson_sim, plotfun = "hist", nreps = 5) + xlab("books")
# Either way, the predictive distributions don't look like the observed data.

## Why? The variance is much greater than the mean for this data set:
## (The Poisson distribution has its mean and variance equal.)
mean(pulse$books)
var(pulse$books)
# or in tidyverse style:
pulse %>% summarize(mean = mean(books), var = var(books))

# More relevant for the regression model, which models the response conditional on predictor values:
# mean and variability in readership among subjects of similar age and wise_unwise response:
pulse %>%
  group_by(cut(age,3), wise_unwise) %>%
  summarize(mean = mean(books), var = var(books))
## The variance is way bigger than the mean, even within subsets of observations
## with similar predictor values.

## Try a negative binomial regression model:
library(rstanarm)
books_negbin_sim <- stan_glm(
  books ~ age + wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 4, iter = 5000*2)

## Coding comment: as with the Poisson model above, the single prior mean in
## prior = normal(0, 2.5, autoscale = TRUE) is recycled as the prior mean of
## BOTH coefficients (beta_1 and beta_2).

pp_check(books_negbin_sim) + xlim(0, 75) + xlab("books")

# Some built-in MCMC checks:
mcmc_trace(books_negbin_sim)
mcmc_dens_overlay(books_negbin_sim)
mcmc_acf(books_negbin_sim)
# Looks perfect.

# Numerical summaries:
tidy(books_negbin_sim, conf.int = TRUE, conf.level = 0.95)

# Evaluating model fit:
nb_predictions <- posterior_predict(books_negbin_sim, newdata = pulse)

# Posterior prediction of the number of books read, for a person of age 30
# who would rather be wise but unhappy:
example_prediction_nb <- posterior_predict(
  books_negbin_sim, newdata = data.frame(age = 30, wise_unwise = 'Wise but Unhappy'))
table(example_prediction_nb)

# A posterior predictive distribution for this individual:
table(example_prediction_nb)/length(example_prediction_nb)

# Posterior predictive mean for this individual:
post.pred.mean <- as.numeric(
  as.numeric(row.names(table(example_prediction_nb))) %*%
  as.numeric(table(example_prediction_nb)/length(example_prediction_nb)))
print(post.pred.mean)

### Easiest to see with a plot here:
# Histogram of the posterior predictive distribution for this individual:
mcmc_hist(example_prediction_nb, binwidth = 1) +
  xlab("Predicted number of books read, person age 30, rather be wise but unhappy")

# Plot the posterior predictive models for each observation:
# Big data set, so this takes a while...
ppc_intervals_grouped(pulse$books, yrep = nb_predictions,
                      x = pulse$age, group = pulse$wise_unwise,
                      prob = 0.5, prob_outer = 0.95,
                      facet_args = list(scales = "fixed"))
# Do the dark blue circles (observed counts) fall near the light blue circles (predicted counts)?
# Do most of the dark blue circles (observed counts) fall within the 95% prediction intervals?

# A numerical summary of in-sample model fit:
prediction_summary(model = books_negbin_sim, data = pulse)
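# (Background sketch: under the neg_binomial_2 parameterization, Var(Y|x) = mu + mu^2/phi,
# where phi is the auxiliary parameter that rstanarm reports as 'reciprocal_dispersion';
# the larger phi is, the closer the model is to the Poisson. To inspect its posterior:)
summary(books_negbin_sim, pars = "reciprocal_dispersion", digits = 3)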
# A (better?) numerical summary of out-of-sample model fit:
# This takes a long time to run since it's a large data set.
nb_cv <- prediction_summary_cv(model = books_negbin_sim, data = pulse, k = 10)
nb_cv$cv

# Comparing a couple of other models:
# Setting chains=1 for speed reasons...
books_negbin_sim_only_wise <- stan_glm(
  books ~ wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 5000*2)

books_negbin_sim_interact <- stan_glm(
  books ~ age + wise_unwise + age:wise_unwise,
  data = pulse, family = neg_binomial_2,
  prior_intercept = normal(0, 2.5, autoscale = TRUE),
  prior = normal(0, 2.5, autoscale = TRUE),
  prior_aux = exponential(1, autoscale = TRUE),
  chains = 1, iter = 5000*2)

# This takes a while ....
loo(books_negbin_sim)
loo(books_negbin_sim_only_wise)
loo(books_negbin_sim_interact)

## Fitting the model using regular R coding can be done with the negative binomial model,
## but the code is not as easy, since there is an extra parameter (mean plus dispersion)
## rather than the single parameter of the Poisson.
## I won't provide the base R code for it here...
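## One more convenience for the model comparison above (a sketch): loo_compare()
## from the 'loo' package (re-exported by rstanarm) puts the three loo() results
## side by side, ranked by expected log predictive density; the top row fits best:
loo_compare(loo(books_negbin_sim),
            loo(books_negbin_sim_only_wise),
            loo(books_negbin_sim_interact))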