# Chapter 14 R examples: naive Bayes classification
#
# Builds and compares three naive Bayes classifiers for penguin species,
# first "by hand" via likelihood calculations, then with e1071::naiveBayes(),
# and finally evaluates them with confusion matrices and cross-validation.

# Load packages ----
library(bayesrules)   # penguins_bayes data + naive_classification_summary_cv()
library(tidyverse)    # dplyr/ggplot2 pipelines
library(e1071)        # naiveBayes()
library(janitor)      # tabyl() / adorn_*() tables

# Load data ----
data(penguins_bayes)
penguins <- penguins_bayes

# Table of category counts for the sample:
penguins %>%
  tabyl(species)

# Or in base R:
table(penguins$species)

## Classifying based only on one categorical predictor ----

# A stacked bar plot for the three species:
ggplot(penguins %>% drop_na(above_average_weight),
       aes(fill = above_average_weight, x = species)) +
  geom_bar(position = "fill")

# Species by weight tables:
penguins %>%
  select(species, above_average_weight) %>%
  na.omit() %>%
  tabyl(species, above_average_weight) %>%
  adorn_totals(c("row", "col"))

# Same thing in base R:
addmargins(table(penguins$species, penguins$above_average_weight))

# Conditional normal distributions of bill length for the three species:
ggplot(penguins, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  geom_vline(xintercept = 50, linetype = "dashed")

# Sample means and sd's for each species:
# Calculate sample mean and sd bill length for each Y group
penguins %>%
  group_by(species) %>%
  summarize(mean = mean(bill_length_mm, na.rm = TRUE),
            sd = sd(bill_length_mm, na.rm = TRUE))

## Some likelihood calculations ----
# (means/sds below are the group summaries computed above, rounded)

# L(y = A | x_2 = 50)
dnorm(50, mean = 38.8, sd = 2.66)

# L(y = C | x_2 = 50)
dnorm(50, mean = 48.8, sd = 3.34)

# L(y = G | x_2 = 50)
dnorm(50, mean = 47.5, sd = 3.08)

# Conditional normal distributions of bill length AND flipper length,
# for the three species:
ggplot(penguins, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.6)

ggplot(penguins, aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.6)

# Symbolic scatterplot:
ggplot(penguins,
       aes(x = flipper_length_mm, y = bill_length_mm, color = species)) +
  geom_point()

# Calculate sample mean and sd flipper length for each Y group
penguins %>%
  group_by(species) %>%
  summarize(mean = mean(flipper_length_mm, na.rm = TRUE),
            sd = sd(flipper_length_mm, na.rm = TRUE))

## Some likelihood calculations ----

# L(y = A | x_3 = 195)
dnorm(195, mean = 190, sd = 6.54)

# L(y = C | x_3 = 195)
dnorm(195, mean = 196, sd = 7.13)

# L(y = G | x_3 = 195)
dnorm(195, mean = 217, sd = 6.48)

#### Now, here's the shortcut way. ----
#### Using the naiveBayes function (e1071, loaded above):
naive_model_1 <- naiveBayes(species ~ bill_length_mm,
                            data = penguins)
naive_model_2 <- naiveBayes(species ~ bill_length_mm + flipper_length_mm,
                            data = penguins)
naive_model_3 <- naiveBayes(
  species ~ above_average_weight + bill_length_mm + flipper_length_mm,
  data = penguins)

# Predictions for one specific penguin:
our_penguin <- data.frame(bill_length_mm = 50, flipper_length_mm = 195)

# Getting the posterior probabilities:
predict(naive_model_1, newdata = our_penguin, type = "raw")
# The predicted class:
predict(naive_model_1, newdata = our_penguin)

# Getting the posterior probabilities:
predict(naive_model_2, newdata = our_penguin, type = "raw")
# The predicted class:
predict(naive_model_2, newdata = our_penguin)

# Adding in the weight information for our specific penguin:
# ('0' as a character to match the factor coding of above_average_weight)
our_penguin_plus <- data.frame(above_average_weight = '0',
                               bill_length_mm = 50,
                               flipper_length_mm = 195)

# Getting the posterior probabilities:
predict(naive_model_3, newdata = our_penguin_plus, type = "raw")
# The predicted class:
predict(naive_model_3, newdata = our_penguin_plus)

# In-sample classifications for ALL the penguins in the sample:
# We do this with EACH of the three models
# (the magrittr '.' is the piped penguins data frame):
penguins <- penguins %>%
  mutate(class_1 = predict(naive_model_1, newdata = .),
         class_2 = predict(naive_model_2, newdata = .),
         class_3 = predict(naive_model_3, newdata = .))

## Comparing classification performance based on confusion matrices ----

# Confusion matrix for naive_model_1:
penguins %>%
  tabyl(species, class_1) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

# Confusion matrix for naive_model_2:
penguins %>%
  tabyl(species, class_2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

# Confusion matrix for naive_model_3:
penguins %>%
  tabyl(species, class_3) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## Comparing classification performance based on
## cross-validation classification accuracy ----
# Using k = 5 for speed here...
cv_model_1 <- naive_classification_summary_cv(
  model = naive_model_1, data = penguins, y = "species", k = 5)
cv_model_2 <- naive_classification_summary_cv(
  model = naive_model_2, data = penguins, y = "species", k = 5)
cv_model_3 <- naive_classification_summary_cv(
  model = naive_model_3, data = penguins, y = "species", k = 5)

# Per-fold / overall cross-validated accuracy for each model:
cv_model_1$cv
cv_model_2$cv
cv_model_3$cv