# Global knitr chunk defaults: do not echo code, and suppress messages and
# warnings in the rendered output.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)

library(tidyverse)
library(DT)
library(plotly)
library(ggiraph)
library(scales)
library(feather)
library(gtsummary)

# data <- read_rds("output.rds") %>% 
#   mutate(
#     prop_satd = n_methods_satd / n_methods
#   ) %>% 
#   relocate(
#     prop_satd,
#     .after = n_methods_satd
#   )
# 
# data_bags <- read_rds("output_bags.rds") %>% 
#   mutate(
#     prop_satd = n_methods_satd / n_methods
#   ) %>% 
#   relocate(
#     prop_satd,
#     .after = n_methods_satd
#   )
# 


# Load the per-project results, strip the "resulbags_2" marker from the
# project names, and add the share of methods flagged as SATD
# (n_methods_satd / n_methods) right after the SATD method count.
data_bags_2 <- read_rds("output_bags_2.rds") %>%
  mutate(
    project = str_remove(project, pattern = "resulbags_2"),
    prop_satd = n_methods_satd / n_methods
  ) %>%
  relocate(prop_satd, .after = n_methods_satd)

Reviewed Bags of Words

Do methods containing SATD comments contain more PMD alerts?

The following plot shows that, in the majority of the cases, there is a larger proportion of methods containing PMD Alerts among methods containing Self-Admitted Technical Debt comments than among methods that do not contain SATD.

# Dumbbell chart: for each project, compare the proportion of methods with PMD
# alerts among SATD methods (one point) and among non-SATD methods (the other
# point), joined by a segment coloured by whichever group has the larger
# proportion.
#
# `data_bags_2` already has the "resulbags_2" marker stripped from `project`
# when it is loaded, so the name is not cleaned a second time here.
plot <- data_bags_2 %>%
  select(
    project,
    n_classes,
    n_methods,
    prop_alerts_satd,
    prop_alerts_no_satd,
    greater_class,
    p_value,
    prop_satd
  ) %>%
  mutate(
    # Order the projects on the y axis by their number of classes.
    project = fct_reorder(.f = project, .x = n_classes),
    # Keep the "winning" group only when the difference is significant at the
    # 5% level; non-significant or missing p-values are shown as a draw.
    greater_class = if_else(
      p_value < 0.05,
      greater_class,
      "DRAW",
      missing = "DRAW"
    )
  ) %>%
  ggplot(
    aes(
      # Shared hover text for every interactive layer.
      tooltip = paste0(
        "Project: ", project, "\n",
        "Classes: ", number(n_classes, accuracy = 1, big.mark = ","), "\n",
        "Methods: ", number(n_methods, accuracy = 1, big.mark = ","), "\n",
        "Prop SATD: ", percent(prop_satd, accuracy = 0.1), "\n",
        "Prop alerts when SATD: ", percent(prop_alerts_satd, accuracy = 1), "\n",
        "Prop alerts when NO SATD: ", percent(prop_alerts_no_satd, accuracy = 1), "\n",
        "P-value:", number(p_value, accuracy = 0.0001)
      )
    )
  ) +
  geom_point_interactive(
    aes(
      x = prop_alerts_satd,
      y = project,
      color = "SATD"
    )
  ) +
  geom_point_interactive(
    aes(
      x = prop_alerts_no_satd,
      y = project,
      color = "NO SATD"
    )
  ) +
  # Position the segment with the same `project` factor as the points.
  # Using rank(n_classes) instead yields fractional (averaged) ranks when two
  # projects have the same number of classes, which detaches the segment from
  # its points.
  geom_segment_interactive(
    aes(
      y = project,
      yend = project,
      x = prop_alerts_satd,
      xend = prop_alerts_no_satd,
      color = greater_class
    )
  ) +
  scale_x_continuous(
    labels = percent_format(accuracy = 1),
    breaks = seq(from = 0, to = 1, by = 0.2),
    limits = c(0, 1)
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    plot.caption = element_text(hjust = 0)
  ) +
  labs(
    x = "Proportion of Methods with PMD Alerts",
    color = "Methods with SATD?",
    y = "Java Project",
    caption = "Each point shows the proportion of PMD methods that contain PMD alerts.\nThe green points show the proportion for methods containing SATD. The red, those that don't contain SATD\nThe color of the segment shows if SATD or NOT SATD have a larger proportion. Gray represents differences that are not significant"
  ) +
  ggtitle(
    label = "SATD -> PMD Alerts?",
    subtitle = "Mouse hover for details"
  ) +
  scale_color_manual(
    values = c("NO SATD" = "darkred", "SATD" = "darkgreen", "DRAW" = "gray")
  )

# Wrap the ggplot in an interactive girafe widget and hand it to the renderer.
plot <- girafe(ggobj = plot, width_svg = 9, height_svg = 5)

renderGirafe(plot)

The Prevalence of Methods Containing SATD Comments is Low

# Per-project data for the prevalence chart: clean the project name, keep the
# columns used by the bars and tooltips, and order projects by class count.
plot_data <- data_bags_2 %>%
  mutate(project = str_remove(project, "resulbags_2")) %>%
  select(
    project, n_classes, n_methods,
    prop_alerts_satd, prop_alerts_no_satd,
    greater_class, p_value, prop_satd
  ) %>%
  mutate(project = fct_reorder(project, n_classes))

# Reference value drawn as a line across the bars.
median_prevalence <- median(plot_data[["prop_satd"]])

# Horizontal bar chart of the share of SATD methods per project.
plot_2 <- plot_data %>%
  ggplot(
    aes(
      tooltip = paste0(
        "Project: ", project, "\n",
        "Classes: ", number(n_classes, accuracy = 1, big.mark = ","), "\n",
        "Methods: ", number(n_methods, accuracy = 1, big.mark = ","), "\n",
        "Prop SATD: ", percent(prop_satd, accuracy = 0.1), "\n",
        "Prop alerts when SATD: ", percent(prop_alerts_satd, accuracy = 1), "\n",
        "Prop alerts when NO SATD: ", percent(prop_alerts_no_satd, accuracy = 1), "\n",
        "P-value:", number(p_value, accuracy = 0.0001)
      )
    )
  ) +
  geom_col_interactive(
    aes(x = project, y = prop_satd),
    fill = "darkblue"
  ) +
  # The median line carries its own fixed tooltip instead of the per-row one.
  geom_hline_interactive(
    yintercept = median_prevalence,
    color = "darkblue",
    tooltip = paste0(
      "Median proportion: ",
      percent(median_prevalence, accuracy = 0.1)
    )
  ) +
  # Flip so that projects run down the vertical axis.
  coord_flip() +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  theme_minimal() +
  labs(title = "Proportion of Methods containg SATD comments")

# Convert to an interactive widget and render it.
plot_2 <- girafe(ggobj = plot_2, width_svg = 9, height_svg = 5)

renderGirafe(plot_2)

Data

# Interactive table of the per-project results with human-readable headers.
# Alert proportions are shown as whole percentages, the SATD share with one
# decimal, and p-values rounded to four decimals.
DT::datatable(
  data_bags_2,
  colnames = c(
    "Project", "Classes", "Methods", "Methods SATD", "Prop SATD",
    "Prop Alerts SATD", "Prop Alerts NOT SATD", "SATD?", "P-Value"
  ),
  options = list(pageLength = 100)
) %>%
  formatPercentage(
    columns = c("prop_alerts_satd", "prop_alerts_no_satd"),
    digits = 0
  ) %>%
  formatPercentage(columns = "prop_satd", digits = 1) %>%
  formatRound(columns = "p_value", digits = 4)

Including other effects

Including size

# Load the previously fitted model and print its summary.
modelo <- read_rds("modelo_bags_2.rds")

summary(modelo)

# NOTE(review): this file has an .rds extension but is read with
# read_feather(); presumably it was written in feather format -- confirm.
synthetic_predictions <- read_feather("predictions_bags_2.rds")

# Predicted probability of a PMD alert as a function of method size (log
# scale), one line per `has_satd` group, with hoverable points.
plot_reg <- synthetic_predictions %>%
  ggplot(
    aes(
      x = size,
      y = prediction,
      color = has_satd
    )
  ) +
  geom_line() +
  geom_point_interactive(
    aes(
      tooltip = paste0(
        "Lines of code: ", number(size, accuracy = 1), "\n",
        has_satd, "\n",
        "Probability PMD Alert:", percent(prediction, accuracy = 1)
      )
    )
  ) +
  scale_x_log10() +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_color_manual(
    values = c("NO SATD" = "darkred", "SATD" = "darkgreen", "DRAW" = "gray")
  ) +
  labs(
    y = "Prob PMD alert\n",
    x = "\nLines of Code",
    color = ""
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Build the interactive widget with the use_fill tooltip option, then render.
plot_reg <- girafe_options(
  girafe(ggobj = plot_reg, width_svg = 9, height_svg = 5),
  opts_tooltip(use_fill = TRUE)
)

renderGirafe(plot_reg)
# plot_violin <- data_regression %>% 
#   mutate(
#     has_satd = if_else(has_satd, "SATD", "NO SATD"),
#     has_alert = if_else(has_alert, "PMD ALERT", "NO PMD ALERT"),
#   ) %>% 
#   ggplot() +
#     geom_violin(
#       aes(
#         x = has_alert,
#         y = size
#       ),
#       adjust = 2.5,
#       draw_quantiles = c(0.25, 0.5, 0.75),
#       alpha = 0.5,
#       fill = "darkblue"
#     ) +
#     scale_y_log10(
#       breaks = breaks_log(n = 20)
#     ) +
#     facet_wrap(
#       ~has_satd
#     ) +
#     theme_minimal()
# 
# plot_violin <- girafe(
#   ggobj = plot_violin, 
#   width_svg = 9, 
#   height_svg = 5
# 
# ) %>% 
#   girafe_options(
#     opts_tooltip(use_fill = TRUE)
#   )
# 
# 
# renderGirafe(plot_violin)
# 


crotman/kludgenudger documentation built on Oct. 19, 2021, 7:30 p.m.