# The needed packages and # The Spotify data set: # Load packages library(bayesrules) library(tidyverse) library(rstanarm) library(bayesplot) library(tidybayes) library(broom.mixed) library(forcats) # Load data data(spotify) # printing the first 30 rows: print(as.data.frame(spotify)[1:30,]) # For this chapter we just use a subset of the variables: spotify <- spotify %>% select(artist, title, popularity) %>% mutate(artist = fct_reorder(artist, popularity, .fun = 'mean')) # First few rows head(spotify, 3) # Number of songs nrow(spotify) # Number of artists nlevels(spotify$artist) # The 44 artists in this 'spotify' data set: levels(spotify$artist) # Which of these artists is Dr. Hitchcock's favorite / 2nd favorite in this set? artist_means <- spotify %>% group_by(artist) %>% summarize(count = n(), popularity = mean(popularity)) artist_means %>% slice(1:2, 43:44) # the number of songs per artist runs as low as 2 and as high as 40: artist_means %>% summarize(min(count), max(count)) # density plot of popularity as a whole: ggplot(spotify, aes(x = popularity)) + geom_density() # Fitting an intercept-only regression model: spotify_complete_pooled <- stan_glm( popularity ~ 1, data = spotify, family = gaussian, prior_intercept = normal(50, 2.5, autoscale = TRUE), prior_aux = exponential(1, autoscale = TRUE), chains = 4, iter = 5000*2) complete_summary <- tidy(spotify_complete_pooled, effects = c("fixed", "aux"), conf.int = TRUE, conf.level = 0.90) complete_summary # Posterior predictive means by artist: predictions_complete <- posterior_predict(spotify_complete_pooled, newdata = artist_means) ppc_intervals(artist_means$popularity, yrep = predictions_complete, prob_outer = 0.80) + ggplot2::scale_x_continuous(labels = artist_means$artist, breaks = 1:nrow(artist_means)) + xaxis_text(angle = 90, hjust = 1) # No pooling: Separate popularity densities for each artist: ggplot(spotify, aes(x = popularity, group = artist)) + geom_density() # No pooling model estimation: spotify_no_pooled <- stan_glm( popularity ~ artist - 1, data = spotify, family = gaussian, prior = normal(50, 2.5, autoscale = TRUE), prior_aux = exponential(1, autoscale = TRUE), chains = 4, iter = 5000*2) # Posterior predictive distributions: # Simulate the posterior predictive models predictions_no <- posterior_predict( spotify_no_pooled, newdata = artist_means) # Plot the posterior predictive intervals ppc_intervals(artist_means$popularity, yrep = predictions_no, prob_outer = 0.80) + ggplot2::scale_x_continuous(labels = artist_means$artist, breaks = 1:nrow(artist_means)) + xaxis_text(angle = 90, hjust = 1) ## Hierarchical Modeling: # Checking the distribution of artist sample means: ggplot(artist_means, aes(x = popularity)) + geom_density() # Fitting the model: spotify_hierarchical <- stan_glmer( popularity ~ (1 | artist), data = spotify, family = gaussian, prior_intercept = normal(50, 2.5, autoscale = TRUE), prior_aux = exponential(1, autoscale = TRUE), prior_covariance = decov(reg = 1, conc = 1, shape = 1, scale = 1), chains = 4, iter = 5000*2) # MCMC diagnostics: # There are lots of parameters here, so there are a bunch of plots here: #mcmc_trace(spotify_hierarchical) # don't run, takes a lot of time to show 45+ trace plots ... mcmc_dens_overlay(spotify_hierarchical) # You can do these plots for one specific parameter at a time: mcmc_trace(spotify_hierarchical, pars='(Intercept)') # trace plot for overall mu mcmc_dens_overlay(spotify_hierarchical, pars='sigma') # posterior density for sigma_y mcmc_dens_overlay(spotify_hierarchical, pars='Sigma[artist:(Intercept),(Intercept)]') # posterior density for sigma_mu^2 (NOTE this is for the SQUARE of sigma_mu) mcmc_dens_overlay(spotify_hierarchical, pars='(Intercept)') # posterior density for overall mu mcmc_dens_overlay(spotify_hierarchical, regex_pars = "Vampire_Weekend") # posterior density for VW's (mu_j - mu) # The ACF plots are kind of unreadable with so many parameters to estimate... #mcmc_acf(spotify_hierarchical) # But you can look at them individually using the 'pars' or 'regex_pars' arguments as above: mcmc_acf(spotify_hierarchical, pars='(Intercept)') # ACF plot for overall mu # Posterior summary about mu: tidy(spotify_hierarchical, effects = "fixed", conf.int = TRUE, conf.level = 0.80) # Posterior summary about sigma_y and sigma_mu: tidy(spotify_hierarchical, effects = "ran_pars") # Posterior summaries about the mu_j's: # Get posterior summaries for mu_j # Get MCMC chains for each mu_j artist_chains <- spotify_hierarchical %>% spread_draws(`(Intercept)`, b[,artist]) %>% mutate(mu_j = `(Intercept)` + b) artist_summary_scaled <- artist_chains %>% select(-`(Intercept)`, -b) %>% mean_qi(.width = 0.80) %>% mutate(artist = fct_reorder(artist, mu_j)) # Check out the results artist_summary_scaled %>% select(artist, mu_j, .lower, .upper) %>% head(4) # Plotting the estimates and 80% credible intervals for the mu_j's: ggplot(artist_summary_scaled, aes(x = artist, y = mu_j, ymin = .lower, ymax = .upper)) + geom_pointrange() + xaxis_text(angle = 90, hjust = 1) # The width of the intervals depends on the sample sizes for the artist: artist_means %>% filter(artist %in% c("Frank Ocean", "Lil Skies")) # Frank Ocean's mu_j has a narrow interval; Lil Skies's mu_j has a wide interval. ### Posterior prediction: # Store the simulation in a data frame spotify_hierarchical_df <- as.data.frame(spotify_hierarchical) # # Simulate Vampire Weekend's posterior predictive model VW_chains <- spotify_hierarchical_df %>% rename(b = `b[(Intercept) artist:Vampire_Weekend]`) %>% select(`(Intercept)`, b, sigma) %>% mutate(mu_VW = `(Intercept)` + b, y_VW = rnorm(20000, mean = mu_VW, sd = sigma)) # First few simulated predicted song popularities head(VW_chains, 5) # Summary of posterior predictive distribution for Vampire Weekend: # Predicting popularity of a (new) individual Vampire Weekend song: # Posterior summary of Y_new,j: VW_chains %>% mean_qi(y_VW, .width = 0.80) # Posterior summary of mu_j artist_summary_scaled %>% filter(artist == "artist:Vampire_Weekend") # We can predict Vampire Weekend's mean popularity with much more precision # than the popularity of an individual Vampire Weekend song. # Posterior prediction for new artist: taytay_chains <- spotify_hierarchical_df %>% mutate(sigma_mu = sqrt(`Sigma[artist:(Intercept),(Intercept)]`), mu_taytay = rnorm(20000, `(Intercept)`, sigma_mu), y_taytay = rnorm(20000, mu_taytay, sigma)) # Posterior predictive summaries taytay_chains %>% mean_qi(y_taytay, .width = 0.80) # Summarizing the posterior predictions for these two artists: ### posterior_predict function may be wonky in R: prediction_shortcut <- posterior_predict( spotify_hierarchical, newdata = data.frame(artist = c("Vampire Weekend", "Taylor Swift"))) # Posterior predictive model plots mcmc_areas(prediction_shortcut, prob = 0.8) + ggplot2::scale_y_discrete(labels = c("Vampire Weekend", "Taylor Swift")) ## This code in base R should do something similar: plot(density(VW_chains$y_VW),main="Song Popularity",sub="posterior predictive medians marked on axis",xlab="Song Popularity") lines(density(taytay_chains$y_taytay),col='red') rug(median(VW_chains$y_VW), col="black", ticksize=0.2, lwd=2); rug(median(taytay_chains$y_taytay), col="red", ticksize=0.2,lwd=2) leg.txt <- c("Vampire Weekend", "Taylor Swift") legend("topright", legend = leg.txt, col = c("black","red"), pch=15:15) # If you want 90% prediction intervals for either artist's "new" song's popularity: #rug(quantile(VW_chains$y_VW,.05), col="black", ticksize=0.1, lwd=2); rug(quantile(VW_chains$y_VW,.95), col="black", ticksize=0.1, lwd=2); #rug(quantile(taytay_chains$y_taytay,.05), col="red", ticksize=0.1,lwd=2); rug(quantile(taytay_chains$y_taytay,.95), col="red", ticksize=0.1,lwd=2); ## Posterior predictive plots to show shrinkage: ### posterior_predict function may be wonky in R: predictions_hierarchical <- posterior_predict(spotify_hierarchical, newdata = artist_means) ppc_intervals(artist_means$popularity, yrep = predictions_hierarchical, prob_outer = 0.80) + ggplot2::scale_x_continuous(labels = artist_means$artist, breaks = 1:nrow(artist_means)) + xaxis_text(angle = 90, hjust = 1) + geom_hline(yintercept = 58.4, linetype = "dashed") ## This code in base R should do something similar: artist.mean.draws<-spotify_hierarchical_df[,2:45]+spotify_hierarchical_df[,1] artist.popularity.draws<-matrix(NA,nrow=20000,ncol=44) for (i in 1:44){ artist.popularity.draws[,i]<-rnorm(20000, mean = artist.mean.draws[,i], sd = spotify_hierarchical_df[,46]) } ppc_intervals(artist_means$popularity, yrep = artist.popularity.draws, prob_outer = 0.80) + ggplot2::scale_x_continuous(labels = artist_means$artist, breaks = 1:nrow(artist_means)) + xaxis_text(angle = 90, hjust = 1) + geom_hline(yintercept = 58.4, linetype = "dashed")