# Chapter 14 R examples: naive Bayes classification
#
# Builds and compares three naive Bayes classifiers for penguin species,
# first "by hand" via likelihood calculations, then with e1071::naiveBayes(),
# and finally evaluates them with confusion matrices and cross-validation.

# Load packages ----
library(bayesrules)   # penguins_bayes data + naive_classification_summary_cv()
library(tidyverse)    # dplyr/ggplot2 pipelines
library(e1071)        # naiveBayes()
library(janitor)      # tabyl() / adorn_*() tables

# Load data ----
data(penguins_bayes)
penguins <- penguins_bayes

# Table of category counts for the sample:
penguins %>%
  tabyl(species)

# Or in base R:
table(penguins$species)

## Classifying based only on one categorical predictor ----

# A stacked bar plot for the three species:
ggplot(penguins %>% drop_na(above_average_weight),
       aes(fill = above_average_weight, x = species)) +
  geom_bar(position = "fill")

# Species by weight tables:
penguins %>%
  select(species, above_average_weight) %>%
  na.omit() %>%
  tabyl(species, above_average_weight) %>%
  adorn_totals(c("row", "col"))

# Same thing in base R:
addmargins(table(penguins$species, penguins$above_average_weight))

# Conditional normal distributions of bill length for the three species:
ggplot(penguins, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  geom_vline(xintercept = 50, linetype = "dashed")

# Sample means and sd's for each species:
# Calculate sample mean and sd bill length for each Y group
penguins %>%
  group_by(species) %>%
  summarize(mean = mean(bill_length_mm, na.rm = TRUE),
            sd = sd(bill_length_mm, na.rm = TRUE))

## Some likelihood calculations ----
# (means/sds below are the group summaries computed above, rounded)

# L(y = A | x_2 = 50)
dnorm(50, mean = 38.8, sd = 2.66)

# L(y = C | x_2 = 50)
dnorm(50, mean = 48.8, sd = 3.34)

# L(y = G | x_2 = 50)
dnorm(50, mean = 47.5, sd = 3.08)

# Conditional normal distributions of bill length AND flipper length,
# for the three species:
ggplot(penguins, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.6)

ggplot(penguins, aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.6)

# Symbolic scatterplot:
ggplot(penguins,
       aes(x = flipper_length_mm, y = bill_length_mm, color = species)) +
  geom_point()

# Calculate sample mean and sd flipper length for each Y group
penguins %>%
  group_by(species) %>%
  summarize(mean = mean(flipper_length_mm, na.rm = TRUE),
            sd = sd(flipper_length_mm, na.rm = TRUE))

## Some likelihood calculations ----

# L(y = A | x_3 = 195)
dnorm(195, mean = 190, sd = 6.54)

# L(y = C | x_3 = 195)
dnorm(195, mean = 196, sd = 7.13)

# L(y = G | x_3 = 195)
dnorm(195, mean = 217, sd = 6.48)

#### Now, here's the shortcut way. ----
#### Using the naiveBayes function (e1071, loaded above):
naive_model_1 <- naiveBayes(species ~ bill_length_mm,
                            data = penguins)
naive_model_2 <- naiveBayes(species ~ bill_length_mm + flipper_length_mm,
                            data = penguins)
naive_model_3 <- naiveBayes(
  species ~ above_average_weight + bill_length_mm + flipper_length_mm,
  data = penguins)

# Predictions for one specific penguin:
our_penguin <- data.frame(bill_length_mm = 50, flipper_length_mm = 195)

# Getting the posterior probabilities:
predict(naive_model_1, newdata = our_penguin, type = "raw")
# The predicted class:
predict(naive_model_1, newdata = our_penguin)

# Getting the posterior probabilities:
predict(naive_model_2, newdata = our_penguin, type = "raw")
# The predicted class:
predict(naive_model_2, newdata = our_penguin)

# Adding in the weight information for our specific penguin:
# ('0' as a character to match the factor coding of above_average_weight)
our_penguin_plus <- data.frame(above_average_weight = '0',
                               bill_length_mm = 50,
                               flipper_length_mm = 195)

# Getting the posterior probabilities:
predict(naive_model_3, newdata = our_penguin_plus, type = "raw")
# The predicted class:
predict(naive_model_3, newdata = our_penguin_plus)

# In-sample classifications for ALL the penguins in the sample:
# We do this with EACH of the three models
# (the magrittr '.' is the piped penguins data frame):
penguins <- penguins %>%
  mutate(class_1 = predict(naive_model_1, newdata = .),
         class_2 = predict(naive_model_2, newdata = .),
         class_3 = predict(naive_model_3, newdata = .))

## Comparing classification performance based on confusion matrices ----

# Confusion matrix for naive_model_1:
penguins %>%
  tabyl(species, class_1) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

# Confusion matrix for naive_model_2:
penguins %>%
  tabyl(species, class_2) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

# Confusion matrix for naive_model_3:
penguins %>%
  tabyl(species, class_3) %>%
  adorn_percentages("row") %>%
  adorn_pct_formatting(digits = 2) %>%
  adorn_ns()

## Comparing classification performance based on
## cross-validation classification accuracy ----
# Using k = 5 for speed here...
cv_model_1 <- naive_classification_summary_cv(
  model = naive_model_1, data = penguins, y = "species", k = 5)
cv_model_2 <- naive_classification_summary_cv(
  model = naive_model_2, data = penguins, y = "species", k = 5)
cv_model_3 <- naive_classification_summary_cv(
  model = naive_model_3, data = penguins, y = "species", k = 5)

# Per-fold / overall cross-validated accuracy for each model:
cv_model_1$cv
cv_model_2$cv
cv_model_3$cv