knitr::opts_chunk$set(echo = TRUE)
print("Too late to cry, too soon to laugh...")
library(discrtr) # A companion package for the book Introduction to Discrete Choice Analysis with `R` 
library(dplyr) # A Grammar of Data Manipulation
library(ggplot2) # Create Elegant Data Visualisations Using the Grammar of Graphics
library(ggridges) # Ridgeline Plots in 'ggplot2'
library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax
library(mvord) # Multivariate Ordinal Regression Models
library(ordinal) # Regression Models for Ordinal Data
library(plyr) # Tools for Splitting, Applying and Combining Data
library(tidyr) # Tidy Messy Data
data("mc_attitudes",
     package = "discrtr")
mc_attitudes %>%
  select(Community:Travel_Alone) %>%
  summary()
df <- data.frame(x = rep(seq(-6, 6, 0.1))) %>%
  mutate(f = dlogis(x, 
                    location = 0,
                    scale = 1))

ggplot() +
  geom_area(data = df,
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 0.5) 
df <- data.frame(x = rep(seq(-6, 6, 0.1))) %>%
  mutate(f = dlogis(x, 
                    location = 0,
                    scale = 1))

ggplot() +
  geom_area(data = df,
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 0.5) +
  geom_area(data = df %>%
              filter(x <= -3),
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 1) + 
  geom_vline(xintercept = -3)
ggplot() +
  geom_area(data = df,
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 0.5) +
  geom_area(data = df %>%
              filter(x >= -3 & x <= -0.5),
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 1) + 
  geom_vline(xintercept = c(-3, -0.5))
df <- data.frame(x = rep(seq(-6, 6, 0.1))) %>%
  mutate(f = dlogis(x, 
                    location = 0))

lj <- c(-2, -0.5, 1.5, 4)

ggplot() +
  geom_area(data = df,
            aes(x = x, 
                y = f),
            color = "black",
            fill = "orange",
            alpha = 0.5) +
  geom_area(data = df %>%
              filter(x >= lj[3] & x <= lj[4]),
            aes(x = x, 
                y = f),
            color = NA,
            fill = "orange",
            alpha = 1) +
  geom_vline(xintercept = lj) +
  geom_text(aes(label = c("SD", "D", "N", "A", "SA"),
                x = c(-3, lj + 0.25)), 
            y = 0.0075) +
  scale_x_continuous(breaks = lj, 
                     labels = expression(lambda[1], 
                                         lambda[2], 
                                         lambda[3], 
                                         lambda[4]))
mod_community_clm <- clm(Community ~ age + vehicle + 
                           Rate_Non_Canadian + Rate_Public, 
                         data = mc_attitudes)

summary(mod_community_clm)
mod_community_clm$convergence
# Coefficients of the utility function
beta_n <- mod_community_clm$coefficients[5:8]

# Thresholds
l_j <- mod_community_clm$coefficients[1:4]
# Profile 1: Age = 21, No vehicle, other variables at zero
p1 <- c(21, 
        0, 
        0,
        0)
# Age = 21, vehicle, other variables at zero
p2 <- c(21, 
        1, 
        0,
        0)
V_1 <- beta_n %*% p1
V_2 <- beta_n %*% p2
# Calculate the values of the logistic distribution for profile 1 in the range defined by x
dV_1 <- data.frame(x = rep(seq(-2.9, 4.4, 0.1)), 
                   profile = "Profile 1") %>%
  mutate(f = dlogis(x, 
                    location = V_1))

# Calculate the values of the logistic distribution for profile 2 in the range defined by x
dV_2 <- data.frame(x = rep(seq(-2.9, 4.4, 0.1)), 
                   profile = "Profile 2") %>%
  mutate(f = dlogis(x, 
                    location = V_2))

# Bind the distributions for the two profiles
dV <- rbind(dV_1,
            dV_2)
# Create a base plot to compare distributions
base_plot <- ggplot() +
  # Plot the thresholds
  geom_vline(xintercept = mod_community_clm$coefficients[1:4]) +
  # Label the responses
  geom_text(aes(label = c("SD", "D", "N", "A", "SA"),
                x = c(-3, l_j + 0.25)), 
            y = 0.75) +
  # Label the ticks in the x axis with the threshold coefficients
  scale_x_continuous(name = "x",
                     breaks = l_j, 
                     labels = expression(lambda[1], 
                                         lambda[2], 
                                         lambda[3], 
                                         lambda[4]))

```rComparing the distribution of hypothetical profiles 1 and 2"}

Render the plot and annotate the effect of vehicle

base_plot + # Plot the distributions as ridges geom_ridgeline(data = dV, aes(x = x, y = profile, fill = profile, group = profile, height = f), alpha = 0.5) + # Mark the center of the two distributions geom_vline(data = data.frame(profile = c("Profile 1", "Profile 2"), x = c(V_1, V_2)), aes(xintercept = x, color = profile, group = profile), linetype = "dashed") + # Annotate the shift in the location of the distribution due to # having access to a vehicle annotate("segment", x = V_1 + 0.3, xend = V_1, y = 1.75, yend = 1.75, arrow = arrow(), color = "black") + annotate("segment", x = V_2 - 0.3, xend = V_2, y = 1.75, yend = 1.75, arrow = arrow(), color = "black")+ annotate("text", parse = TRUE, label = "beta[v]", x = V_1 - 0.15, y = 1.75, color = "black")

```r
profiles <- data.frame(profile = c("Profile 1",
                                   "Profile 2"),
                       age = 21, 
                       vehicle = c("No", "Yes"),
                       Rate_Non_Canadian = 0,
                       Rate_Public = 0)

profiles <- cbind(profiles,
                  predict(mod_community_clm, 
                          newdata = profiles))

profiles %>%
  select(-c(age, vehicle, Rate_Non_Canadian, Rate_Public))
profiles %>%
  select(-c(age, vehicle, Rate_Non_Canadian, Rate_Public)) %>%
  # Pivot longer to gather all probabilities in a sigle column
  pivot_longer(cols = -profile,
               names_to = "Response",
               values_to = "probability") %>%
  # Code the response in order
  mutate(Response = factor(Response,
                           levels = c("fit.STRONGLY DISAGREE",
                                      "fit.DISAGREE",
                                      "fit.NEUTRAL",
                                      "fit.AGREE",
                                      "fit.STRONGLY AGREE"),
                           labels = c("SD", "D", "N", "A", "SA"),
                           ordered = TRUE)) %>%
  ggplot() +
  geom_col(aes(x = Response,
               y = probability,
               fill = profile),
           color = "black",
           position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
# Define profile 3
p3 <- c(21, 
        1,
        0,
        median(mc_attitudes$Rate_Public))

# Calculate the utility
V_3 <- beta_n %*% p3

# Calculate the values of the logistic distribution for profile 2 in the range defined by x
dV_3 <- data.frame(x = rep(seq(-2.9, 4.4, 0.1)), 
                   profile = "Profile 3") %>%
  mutate(f = dlogis(x, 
                    location = V_3))

# Bind the distributions for the two profiles
dV <- rbind(dV,
            dV_3)

```rComparing the distribution of hypothetical profiles 1, 2, and 3"}

Render the plot and add the distributions

base_plot + # Plot the distributions as ridges geom_ridgeline(data = dV, aes(x = x, y = profile, fill = profile, group = profile, height = f), alpha = 0.5) + # Mark the center of the three distributions geom_vline(data = data.frame(profile = c("Profile 1", "Profile 2", "Profile 3"), x = c(V_1, V_2, V_3)), aes(xintercept = x, color = profile, group = profile), linetype = "dashed")

```r
mod_community_clm$coefficients["Rate_Public"] * median(mc_attitudes$Rate_Public)
profiles <- data.frame(profile = c("Profile 1",
                                   "Profile 2",
                                   "Profile 3"),
                       age = 21, 
                       vehicle = c("No", 
                                   "Yes", 
                                   "Yes"),
                       Rate_Non_Canadian = 0,
                       Rate_Public = c(0, 
                                       0,
                                       median(mc_attitudes$Rate_Public)))

profiles <- cbind(profiles,
                  predict(mod_community_clm, 
                          newdata = profiles))

# Prepare data for plotting
profiles %>%
  select(-c(age, vehicle, Rate_Non_Canadian, Rate_Public)) %>%
  # Pivot longer to gather all probabilities in a single column
  pivot_longer(cols = -profile,
               names_to = "Response",
               values_to = "probability") %>%
  # Code the response in order
  mutate(Response = factor(Response,
                           levels = c("fit.STRONGLY DISAGREE",
                                      "fit.DISAGREE",
                                      "fit.NEUTRAL",
                                      "fit.AGREE",
                                      "fit.STRONGLY AGREE"),
                           labels = c("SD", "D", "N", "A", "SA"),
                           ordered = TRUE)) %>%
  # Plot
  ggplot() +
  geom_col(aes(x = Response,
               y = probability,
               fill = profile),
           color = "black",
           position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
mod_community_flex_l <- clm(Community ~ age + vehicle +
                              Rate_Non_Canadian + Rate_Public,
                            nominal = ~ gender, 
                            data = mc_attitudes)

summary(mod_community_flex_l)

```rModel with parameterized thresholds for two versions of hypothetical profile 1: woman and man"}

Coefficients of the utility function

beta_n <- mod_community_flex_l$coefficients[9:12]

Thresholds

l_j <- mod_community_flex_l$coefficients[1:8]

Profile 1: Age = 21, No vehicle, other variables at zero

p1 <- c(21, 0, 0, 0)

Calculate the utility of these two hypothetical individuals

V_1 <- beta_n %*% p1

Calculate the values of the logistic distribution for profile 1 in the range defined by x

dV_1 <- data.frame(x = rep(seq(-3.2, 4.7, 0.1))) %>% mutate(f = dlogis(x, location = V_1))

ggplot() + # Plot the distribution geom_area(data = dV_1, aes(x = x, y = f), color = "black", fill = "orange", alpha = 0.5) + # Plot the thresholds geom_vline(data = data.frame(profile = rep(c("Man", "Woman"), each = 4), l_j = c(l_j[1:4], l_j[1:4] + l_j[5:8])), aes(xintercept = l_j, color = profile)) + scale_x_continuous(name = "x", breaks = c(l_j[1:4], l_j[1:4] + l_j[5:8]), labels = expression(lambda[1][m], lambda[2][m], lambda[3][m], lambda[4][m], lambda[1][w], lambda[2][w], lambda[3][w], lambda[4][w])) + theme(axis.text.x = element_text(angle = 90))

```r
mod_community_ncs <- clm(Community ~ age + vehicle +
                           Rate_Non_Canadian +
                           Rate_Public,
                         scale = ~ visa, 
                         data = mc_attitudes)

summary(mod_community_ncs)
exp(mod_community_ncs$coefficients[9])

```rModel with parameterized scale for two versions of hypothetical profile 4: domestic and international student"}

Coefficients of the utility function

beta_n <- mod_community_ncs$coefficients[5:8]

Thresholds

l_j <- mod_community_ncs$coefficients[1:4]

Scale

s_I <- exp(mod_community_ncs$coefficients[9])

Profile 4: Age = 21, No vehicle, other variables at zero

proportion of non-Canadian residents is the in-sample median

p1 <- c(21, 0, median(mc_attitudes$Rate_Non_Canadian), 0)

Calculate the utility of these two hypothetical individuals

V_1 <- beta_n %*% p1

Calculate the values of the logistic distribution for profile 1

when student is Domestic

dV_D <- data.frame(x = rep(seq(-3.2, 4.7, 0.1)), profile = "Domestic") %>% mutate(f = dlogis(x, location = V_1))

Calculate the values of the logistic distribution for profile 1

when student is International

dV_I <- data.frame(x = rep(seq(-3.2, 4.7, 0.1)), profile = "International") %>% mutate(f = dlogis(x, location = V_1, scale = s_I))

Bind the distributions for the two profiles

dV <- rbind(dV_D, dV_I)

Render the plot and add the distributions

ggplot() + # Plot the distributions as ridges geom_ridgeline(data = dV, aes(x = x, y = profile, fill = profile, group = profile, height = f), alpha = 0.5) + # Plot the thresholds geom_vline(xintercept = mod_community_ncs$coefficients[1:4]) + # Label the responses geom_text(aes(label = c("SD", "D", "N", "A", "SA"), x = c(-3, l_j + 0.25)), y = 0.75) + # Label the ticks in the x axis with the threshold coefficients scale_x_continuous(name = "x", breaks = l_j, labels = expression(lambda[1], lambda[2], lambda[3], lambda[4]))

```r
mod_bivariate <- mvord(formula = MMO2(Community, Neighbors) ~ 0 + age + vehicle +
                         Rate_Non_Canadian +
                         Rate_Public, 
                       link = mvlogit(df = 8L),
                       # {mvord} does not like tbl or tbl_df objects:
                       # convert to plain data.frame
                       data = data.frame(mc_attitudes))
summary(mod_bivariate)
# Select an individual in the sample to fit the probabilities
n_ind <- 1

# Create a grid of values for chosen individual
fit_grid_ind <- expand.grid(Community = c("STRONGLY DISAGREE", 
                                          "DISAGREE", 
                                          "NEUTRAL", 
                                          "AGREE", 
                                          "STRONGLY AGREE"),
                            Neighbors = c("STRONGLY DISAGREE", 
                                          "DISAGREE", 
                                          "NEUTRAL", 
                                          "AGREE", 
                                          "STRONGLY AGREE"),
                            # Retrieve the attributes of the n_ind record
                            # in the table
                            age = mc_attitudes$age[n_ind],
                            vehicle = mc_attitudes$vehicle[n_ind],
                            Rate_Non_Canadian = mc_attitudes$Rate_Non_Canadian[n_ind],
                            Rate_Public = mc_attitudes$Rate_Public[n_ind])
# Join the prediction grid to the predicted probabilities
joint.probs <- data.frame(fit_grid_ind %>%
                            select(Community, Neighbors),
                          # Use `predict()` and the prediction grid
                          # to predict joint probabilities
                          joint.prob = predict(mod_bivariate, 
                                               type = "prob", 
                                               newdata = fit_grid_ind)) %>%
  # Revalue the ordinal responses for ease of presentation
  mutate(Community = revalue(Community,
                             c("STRONGLY DISAGREE" = "Community_SD",
                               "DISAGREE" = "Community_D",
                               "NEUTRAL" = "Community_N",
                               "AGREE" = "Community_A",
                               "STRONGLY AGREE" = "Community_SA")),
         Neighbors = revalue(Neighbors,
                             c("STRONGLY DISAGREE" = "Neighbors_SD",
                               "DISAGREE" = "Neighbors_D",
                               "NEUTRAL" = "Neighbors_N",
                               "AGREE" = "Neighbors_A",
                               "STRONGLY AGREE" = "Neighbors_SA"))) %>%
  pivot_wider(names_from = Neighbors, values_from = joint.prob)

joint.probs %>%
  kable("latex",
        digits = 5,
        caption = "\\label{tab:joint-probs-in-sample-individuals}
        Fitted joint probabilities for in-sample individual")
joint.probs %>% 
  select(-Community) %>% 
  sum()
pred_grid <- expand.grid(Community = c("STRONGLY DISAGREE", 
                                       "DISAGREE", 
                                       "NEUTRAL", 
                                       "AGREE", 
                                       "STRONGLY AGREE"),
                         Neighbors = c("STRONGLY DISAGREE", 
                                       "DISAGREE", 
                                       "NEUTRAL", 
                                       "AGREE", 
                                       "STRONGLY AGREE"),
                         # Age = 20 is the value for the first quartile in the sample
                         # and age = 23 is the value for the third quartile
                         age = c(20, 23), 
                         vehicle = levels(mc_attitudes$vehicle),
                         Rate_Non_Canadian = median(mc_attitudes$Rate_Non_Canadian),
                         Rate_Public = median(mc_attitudes$Rate_Public))
# Create a data frame with the prediction grid and 
# the predicted joint probabilities
pred_grid <- data.frame(pred_grid, 
                        joint.prob = predict(mod_bivariate, 
                                             type = "prob", 
                                             newdata = pred_grid)) %>%
  # Convert to factors and revalue for presentation
  mutate(age = factor(age,
                      levels = c(20, 23),
                      labels = c("Age = 20",
                                 "Age = 23")),
         vehicle = revalue(vehicle,
                           c("Yes" = "Vehicle", 
                             "No" = "No Vehicle")),
         Community = revalue(Community,
                             c("STRONGLY DISAGREE" = "SD", 
                               "DISAGREE" = "D", 
                               "NEUTRAL" = "N", 
                               "AGREE" = "A", 
                               "STRONGLY AGREE" = "SA")),
         Neighbors = revalue(Neighbors,
                             c("STRONGLY DISAGREE" = "SD", 
                               "DISAGREE" = "D", 
                               "NEUTRAL" = "N", 
                               "AGREE" = "A", 
                               "STRONGLY AGREE" = "SA")))
pred_grid %>%
  group_by(age, vehicle) %>%
  summarize(prob = sum(joint.prob),
            .groups = "drop")
pred_grid %>%  
  ggplot(aes(x = Community, y = Neighbors)) + 
  geom_tile(aes(fill = joint.prob)) +
  scale_fill_gradient(name = "joint probability",
                      low="white", 
                      high="black") +
  coord_equal() + 
  theme(legend.position = "bottom") +
  facet_grid(vehicle ~ age)

``rPlot of odds ratios by age for No Vehicle/Vehicle status"} pred_grid %>% # Pivot the table to create columns of joint probabilities # forNo VehicleandVehiclepivot_wider(names_from = vehicle, values_from = joint.prob) %>% # Calculate the ratio of the joint probability for the two # profiles mutate(or =No Vehicle`/Vehicle) %>% # Plot as tiles ggplot(aes(x = Community, y = Neighbors)) + geom_tile(aes(fill = or)) + # Use a divergent fill scale with midpoint 1 scale_fill_gradient2(name = "odds ratio (No Vehicle/Vehicle)", midpoint = 1) + coord_equal() + theme(legend.position = "bottom", legend.text = element_text(angle = 90)) + # Facet by age facet_grid(~ age)

```rPlot of odds ratios by age for Age 23/Age 20"}
pred_grid %>% 
  pivot_wider(names_from = age, values_from = joint.prob) %>%
  mutate(or = `Age = 23`/`Age = 20`) %>%
  ggplot(aes(x = Community, y = Neighbors)) + 
  geom_tile(aes(fill = or)) +
  scale_fill_gradient2(name = "odds ratio (Age 23/Age 20)",
                       midpoint = 1) +
  coord_equal() + 
  theme(legend.position = "bottom",
        legend.text = element_text(angle = 90)) +
  facet_grid(~ vehicle)
data("mc_modality",
      package = "discrtr")


paezha/discrtr documentation built on March 1, 2023, 5:25 p.m.