R/comparisonPlots.R
In voiceR: Voice Analytics for Social Scientists

Documented in comparisonPlots

#' Create boxplots for extracted audio features
#'
#' Generates boxplots for each exracted feature. Plots can be split by experimental condition.
#'
#' @param audioData A data.frame generated by the autoExtract() function.
#' @param by An optional character vector indicating column(s) from which the comparison groups are to be retrieved.
#' @param measures An optional character vector indicating the name of the variables to be plotted.
#' @param normalSig Set significance level to test for normality assumptions. Default corresponds to "0.05".
#' @param avoidNormalCheck Logical value forcing to avoid checking for normality. When defined as TRUE, it is assumed that the data is normally distributed. Default corresponds to FALSE.
#' @return A list containing the generated boxplots.
#' @examples
#' comparisonPlots(testAudioData, by = "Condition")
#'
#' @importFrom ggplot2 ggplot geom_boxplot aes theme element_blank ggplot_build annotation_custom unit
#' @importFrom gtable gtable_add_grob
#' @importFrom gridExtra tableGrob ttheme_minimal
#' @importFrom grid segmentsGrob gpar
#' @importFrom ggpubr stat_compare_means stat_pvalue_manual
#' @importFrom utils combn
#' @importFrom FSA dunnTest
#' @importFrom stats as.formula kruskal.test TukeyHSD aov
#' @importFrom stringr str_split
#' @importFrom rcompanion scheirerRayHare
#' @importFrom rlang !! sym
#' @export

comparisonPlots <- function(audioData, by = c(), measures = c("duration", "voice_breaks_percent", "RMS_env", "mean_loudness", "mean_F0", "sd_F0", "mean_entropy", "mean_HNR")
, normalSig = 0.05, avoidNormalCheck = FALSE){

  if(!is.data.frame(audioData)) stop("audioData should be a data.frame produced by autoExtract")
  if(!all(measures %in% c("duration", "voice_breaks_percent", "RMS_env", "mean_loudness", "mean_F0", "sd_F0", "mean_entropy", "mean_HNR"))) {
    stop("measures should contain: duration, voice_breaks_percent, RMS_env, mean_loudness, mean_F0, sd_F0, mean_entropy, or mean_HNR")
  }
  if(!is.numeric(normalSig) & (normalSig < 0 | normalSig > 1)) stop("Error normalSig should be numeric and no smaller than 0 and no bigger than 1")
  if(length(avoidNormalCheck) != 1) stop("avoidNormalCheck should have length 1")

  #Measures for which we want to generate plots
  my_comparisons <- list( c("1", "2") )
  #Empty list, in which we will store the plots
  plots <- list()
  i <- 1

  #If we only use one variable to create comparisons, just use that variable for both dimensions
  if(length(by) == 1){
    dimension1 <- by
    dimension2 <- by

  }
  #Otherwise use a different variable for each dimension
  else if(length(by) == 2){
    dimension1 <- by[1]
    dimension2 <- by[2]
  }
  else{
    warning("No comparison Variables set.")
  }


  #When there's at least one comparison variable create the plots
  if(length(by) >= 1){
    numberCategoriesDim1 <- length(unique(audioData[,dimension1]))
    audioData[,dimension1] <- as.factor(audioData[,dimension1])
    numberCategoriesDim2 <- length(unique(audioData[,dimension2]))
    audioData[,dimension2] <- as.factor(audioData[,dimension2])

    #Create a plot for each measure
    for (measure in measures) {
      #If there are many NAs do not plot
      if(sum(is.na(audioData[,measure])) >= 0.92 * length(audioData[,measure])){
        warning(paste(measure, "contains all NAs. Plot for ", measure, "not generated."))
        next
      }
      #Check if data is normally distributed
      normalTable <- tableNormality(audioData, measure = measure, includeDimensions = "Dimension" %in% by)
      if(is.character(normalTable)) stop("Sample size not big enough. Please consider increasing your sample.")
      if(min(normalTable$N) < 3){
        if("Dimension" %in% by){
          warning("Only checking condition, as N per Condition and Dimension is lower than 3")
          dimension1 <- "Condition"
          dimension2 <- "Condition"
        }
        else{
          stop("Not enough data to check comparisons")
        }



      }
      pValue <- min(normalTable$pValue)


      #Generate the plot comparing means depending on number of comparison variables and number of factors per comparison variable:
      # One factor:
      #   - Normality and only 2 groups: t-test.
      #   - Normality and more than 2 groups: ANOVA and Tukey's test.
      #   - Non-normality and only 2 groups: Wilcoxon's Test.
      #   - Non-normality and more than 2 groups: Kruskal-Wallis test and Mann-Whitney U test.
      # Two factors:
      #   - Normality: 2-way ANOVA.
      #   - Non-normality: Scheirer-Ray-Hare test.
      plots[[measure]] <- local({
        i <- i
        if(numberCategoriesDim2 == 2 & length(by) == 1){
          if(pValue < normalSig & !avoidNormalCheck){
            p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
              geom_boxplot() +
              stat_compare_means(method = "wilcox.test", label.y = max(audioData[,measure], na.rm = TRUE)*1.1, label.x = 1) +
              theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

          }
          else{
            p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
              geom_boxplot() +
              stat_compare_means(method = "t.test", label.y = max(audioData[,measure], na.rm = TRUE)*1.1, label.x = 1) +
              theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
          }
        }
        else{
          if(length(by) == 1){
            x <- combn(seq(1,numberCategoriesDim1), 2)
            my_comparisons <-  lapply(seq_len(ncol(x)), function(i) x[,i])
            if(pValue < normalSig & !avoidNormalCheck){

              dunnData <- dunnTest(as.formula(paste0(measure, "~", dimension1)), data = audioData)
              dunnData <- dunnData$res
              dunnData$significance <- NA
              if(length(dunnData[dunnData$P.adj > 0.1, "significance"]) > 0)
                dunnData[dunnData$P.adj > 0.1, "significance"] <- "ns"
              if(length(dunnData[dunnData$P.adj < 0.1, "significance"]) > 0)
                dunnData[dunnData$P.adj < 0.1, "significance"] <- "+"
              if(length(dunnData[dunnData$P.adj < 0.05, "significance"]) > 0)
                dunnData[dunnData$P.adj < 0.05, "significance"] <- "*"
              if(length(dunnData[dunnData$P.adj < 0.01, "significance"]) > 0)
                dunnData[dunnData$P.adj < 0.01, "significance"] <- "**"
              if(length(dunnData[dunnData$P.adj < 0.001, "significance"]) > 0)
                dunnData[dunnData$P.adj < 0.001, "significance"] <- "***"



              dunnData$group1 <- str_split(dunnData$Comparison, "-", simplify = T)[,1]
              dunnData$group2 <- str_split(dunnData$Comparison, "-", simplify = T)[,2]
              dunnData$group1 <- gsub(" ", "", dunnData$group1, fixed = TRUE)
              dunnData$group2 <- gsub(" ", "", dunnData$group2, fixed = TRUE)
              dunnData <- dunnData[dunnData$P.adj < 0.1,]

              p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
                geom_boxplot() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

              ggplotData <- ggplot_build(p1)
              multiplier <- (max(audioData[,measure], na.rm = TRUE)-min(audioData[,measure], na.rm = TRUE))*0.08
              ypos <- seq((max(ggplotData$data[[1]]$ymax, na.rm = TRUE)+multiplier), max(ggplotData$data[[1]]$ymax, na.rm = TRUE) + multiplier*(nrow(dunnData)+1) , by = multiplier)[1:nrow(dunnData)]
              kruskalTestData <- kruskal.test(as.formula(paste0(measure, "~", dimension1)), audioData)
              p1 <- p1 +
                stat_compare_means(method = "kruskal.test", label.y = ggplotData$layout$panel_scales_y[[1]]$range$range[2] - (ggplotData$layout$panel_scales_y[[1]]$range$range[2] - ggplotData$layout$panel_scales_y[[1]]$range$range[1])*0.15, label.x = 1.3)

              if(nrow(dunnData) > 0){
                p1 <- p1 + stat_pvalue_manual(dunnData, label = "significance", y.position = ypos, tip.length = 0)
              }



            }
            else{
              tukeyData <- TukeyHSD(aov(as.formula(paste0(measure, "~", dimension1)), data = audioData))
              tukeyData <- as.data.frame(tukeyData$Condition)
              tukeyData$significance <- NA
              if(length(tukeyData[tukeyData$`p adj` > 0.1, "significance"]) > 0)
                tukeyData[tukeyData$`p adj` > 0.1, "significance"] <- "ns"
              if(length(tukeyData[tukeyData$`p adj` < 0.1, "significance"]) > 0)
                tukeyData[tukeyData$`p adj` < 0.1, "significance"] <- "+"
              if(length(tukeyData[tukeyData$`p adj` < 0.05, "significance"]) > 0)
                tukeyData[tukeyData$`p adj` < 0.05, "significance"] <- "*"
              if(length(tukeyData[tukeyData$`p adj` < 0.01, "significance"]) > 0)
                tukeyData[tukeyData$`p adj` < 0.01, "significance"] <- "**"
              if(length(tukeyData[tukeyData$`p adj` < 0.001, "significance"]) > 0)
                tukeyData[tukeyData$`p adj` < 0.001, "significance"] <- "***"



              tukeyData$group1 <- str_split(rownames(tukeyData), "-", simplify = T)[,1]
              tukeyData$group2 <- str_split(rownames(tukeyData), "-", simplify = T)[,2]
              tukeyData <- tukeyData[tukeyData$`p adj` < 0.1,]


              p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
                geom_boxplot() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

              ggplotData <- ggplot_build(p1)
              multiplier <- (max(audioData[,measure], na.rm = TRUE)-min(audioData[,measure], na.rm = TRUE))*0.08
              ypos <- seq((max(ggplotData$data[[1]]$ymax, na.rm = TRUE)+multiplier), max(ggplotData$data[[1]]$ymax, na.rm = TRUE) + multiplier*(nrow(tukeyData)+1) , by = multiplier)[1:nrow(tukeyData)]
              AnovaTestData <- summary(aov(as.formula(paste0(measure, "~", dimension1)), audioData))

              p1 <- p1  +
                stat_compare_means(method = "anova", label.y = ggplotData$layout$panel_scales_y[[1]]$range$range[2] - (ggplotData$layout$panel_scales_y[[1]]$range$range[2] - ggplotData$layout$panel_scales_y[[1]]$range$range[1])*0.15, label.x = 1.3)

              if(nrow(tukeyData) > 0){
                p1 <- p1 + stat_pvalue_manual(tukeyData, label = "significance", y.position = ypos, tip.length = 0)
              }



            }
          }
          if(length(by) == 2){
            if(pValue < normalSig & !avoidNormalCheck){
              res.schreirer <- scheirerRayHare(as.formula(paste(measure, "~", dimension1, "*", dimension2)), audioData, verbose = FALSE)
              row.names(res.schreirer)[3] <- "Interaction"



              annotationTable <- data.frame('Scheirer-Ray-Hare' = rownames(res.schreirer)[1:3], pValue = round(res.schreirer[1:3,"p.value"], 2))
              annotationTable$Scheirer.Ray.Hare[3] <- "Interaction"





              p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
                geom_boxplot() +
                theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
              ggplotData <- ggplot_build(p1)
              tbl <- tableGrob(annotationTable, theme = ttheme_minimal(), rows=NULL, cols= c("Scheirer-Ray-Hare", "p-Value"))
              tbl <- gtable_add_grob( tbl, grobs = segmentsGrob( name = 'segment', y1 = unit( 0, 'npc' ), gp = gpar( lty = 1, lwd = 1 ) ),
                                      t = 1, l = 1, r = ncol( tbl ) )

              p1 <-  p1 +  annotation_custom(tbl,  ymin = ggplotData$layout$panel_scales_y[[1]]$range$range[2] - (ggplotData$layout$panel_scales_y[[1]]$range$range[2] - ggplotData$layout$panel_scales_y[[1]]$range$range[1])*0.25, xmin = 1, xmax = 2)
            }
            else{
              res.aov2 <- aov(as.formula(paste(measure, "~", dimension1, "*", dimension2)), data = audioData)
              res.aov2 <- summary(res.aov2)[[1]]
              annotationTable <- data.frame('2-way ANOVA' = rownames(res.aov2)[1:3], pValue = round(res.aov2[1:3,"Pr(>F)"], 2))
              row.names(annotationTable)[3] <- "Interaction"

              text <- "    Two-way ANOVA p-value\n"
              for (i in 1:(nrow(res.aov2)-1)) {
                text <- paste(text, trimws(rownames(res.aov2)[i]), paste0(rep(" ", 30 - length(trimws(rownames(res.aov2)[i]))), collapse = ""), round(res.aov2$`Pr(>F)`[i], 2), "\n")
              }

              p1 <- ggplot(audioData, aes(x=!!sym(dimension1), y=!!sym(measure), color = !!sym(dimension2))) +
                geom_boxplot() +
                theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
              ggplotData <- ggplot_build(p1)
              tbl <- tableGrob(annotationTable, theme = ttheme_minimal(), rows=NULL, cols= c("2-way ANOVA", "p-Value"))
              tbl <- gtable_add_grob( tbl, grobs = segmentsGrob( name = 'segment', y1 = unit( 0, 'npc' ), gp = gpar( lty = 1, lwd = 1 ) ),
                                              t = 1, l = 1, r = ncol( tbl ) )

              p1 <-  p1 +  annotation_custom(tbl,  ymin = ggplotData$layout$panel_scales_y[[1]]$range$range[2] - (ggplotData$layout$panel_scales_y[[1]]$range$range[2] - ggplotData$layout$panel_scales_y[[1]]$range$range[1])*0.25, xmin = 1, xmax = 2)
            }

          }

        }
        p1
      })


    }
  }
  else{
    for (measure in measures) {

      plots[[measure]] <- local({

        p1 <- ggplot(audioData, aes(x=factor(0), y=!!sym(measure))) +
          geom_boxplot() + theme(axis.title.x= element_blank(),
                                                   axis.text.x= element_blank(), axis.ticks.x= element_blank()) +
          theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

        p1
      })


    }
  }

  return(plots)
}