R/featurePlot.R
In politeness: Detecting Politeness Features in Text

Documented in featurePlot

# Used to avoid incorrect notes of "no visible binding"
utils::globalVariables(c("feature","count","cond","count_minus","count_plus"))

#' Feature plot
#'
#' @description Plots the prevalence of politeness features in documents, divided by a binary covariate.
#' @param df_polite a data.frame with politeness features calculated from a document set, as output by \code{\link{politeness}}.
#' @param split a vector of covariate values. must have a length equal to the number of documents included in \code{df_polite}. No NA values allowed.
#' @param split_levels character vector of length 2 default NULL. Labels for covariate levels for legend. If NULL, this will be inferred from \code{split}.
#' @param split_name character default NULL. Name of the covariate for legend.
#' @param split_cols character vector of length 2. Name of colors to use.
#' @param top_title character default "". Title of plot.
#' @param drop_blank Features less prevalent than this in the sample value are excluded from the plot. To include all features, set to \code{0}
#' @param middle_out Features less distinctive than this value (measured by p-value of t-test) are excluded. Defaults to 1 (i.e. include all).
#' @param features character vector of feature names. If NULL all will be included.
#' @param ordered logical should features be ordered according to features param? default is FALSE.
#' @param CI Coverage of error bars. Defaults to 0.68 (i.e. standard error).
#' @details Length of \code{split} must be the same as number of rows of \code{df_polite}. Typically \code{split} should be a two-category variable. However, if a continuous covariate is given, then the top and bottom terciles of that distribution are treated as the two categories (while dropping data from the middle tercile).
#' @return a ggplot of the prevalence of politeness features, conditional on \code{split}. Features are sorted by variance-weighted log odds ratio.
#' @examples
#'
#' data("phone_offers")
#'
#' polite.data<-politeness(phone_offers$message, parser="none", drop_blank=FALSE)
#'
#' politeness::featurePlot(polite.data,
#'                            split=phone_offers$condition,
#'                            split_levels = c("Tough","Warm"),
#'                            split_name = "Condition",
#'                            top_title = "Average Feature Counts")
#'
#'
#' politeness::featurePlot(polite.data,
#'                            split=phone_offers$condition,
#'                            split_levels = c("Tough","Warm"),
#'                            split_name = "Condition",
#'                            top_title = "Average Feature Counts",
#'                            features=c("Positive.Emotion","Hedges","Negation"))
#'
#'
#' polite.data<-politeness(phone_offers$message, parser="none", metric="binary", drop_blank=FALSE)
#'
#' politeness::featurePlot(polite.data,
#'                            split=phone_offers$condition,
#'                            split_levels = c("Tough","Warm"),
#'                            split_name = "Condition",
#'                            top_title = "Binary Feature Use")
#'
#' @export

featurePlot<-function(df_polite,
                      split = NULL,
                      split_levels = NULL,
                      split_name = NULL,
                      split_cols = c("firebrick","navy"),
                      top_title = "",
                      drop_blank = 0.05,
                      middle_out = 0.5,
                      features=NULL,
                      ordered=FALSE,
                      CI=.68){
  if(is.matrix(df_polite)){
    df_polite <- as.data.frame(df_polite)
  }
  if(!is.null(features)){
    df_polite <- df_polite[,features]
  }
  # confirm that CI is meaningful
  if(!(is.numeric(CI)&(CI>0)&(CI<1))){
    stop("CI must be numeric betwwen 0 and 1")
  }
  if(is.null(split)){
    stop("Must include covariate split")
  }
  # confirm that split is the right type
  if(sum(is.na(split))>0){
    stop("split must not have NAs")
  }
  if(is.factor(split)){
    split<-as.character(split)
  }
  if(is.list(split)){
    split<-unlist(split)
  }
  # confirm that split only has two values
  if( length(unique(split)) != 2){
    if(is.character(split)){
      stop("split must have exactly two values")
    }
    # if split has more than 2 values transform it into a binary variable by taking the top 33% and top 33%
    # if the cut at 33% and 67% is the same we throw an error
    cuts <- stats::quantile(split, c(1/3, 2/3))
    cut_low <- cuts[1]
    cut_high <- cuts[2]

    if(cut_low == cut_high){
      stop("Cannot convert split into binary variable 33% and 67% percentiles are some value")
    }

    split <- ifelse(split <= cut_low,0,
                    ifelse(split>=cut_high,1, NA_integer_))

    df_polite <- df_polite[!is.na(split),]
    split <- split[!is.na(split)]
    warning("Converting split into binary variable by taking bottom and top 33% of values,
            and removing middle 33% of values in df_polite and split.")

  }
  if( length(split) != nrow(df_polite)){
    stop("split must be same length as document set")
  }

  binary <- setequal(unique(unlist(df_polite)),0:1)
  averages <- 1*(mean(df_polite==round(df_polite))!=1)
  if(averages==1){
    df_polite<-df_polite*100
  }

  num_features <- ncol(df_polite)
  l_polite_split <- split(data.frame(df_polite), split)

  if(is.null(split_levels)){
    split_levels <- names(l_polite_split)
  }
  # this makes sure split colors correctly match split levels
  names(split_cols) <- split_levels

  split.data<-data.frame(feature=rep(colnames(df_polite),2),
                         count=c(colMeans(l_polite_split[[1]],na.rm=TRUE),
                                 colMeans(l_polite_split[[2]],na.rm=TRUE)),
                         cond=factor(c(rep(split_levels[1],num_features),
                                       rep(split_levels[2],num_features)), levels = split_levels),
                         se=c(apply(l_polite_split[[1]],2,function(x) stats::sd(x)/sqrt(length(x))),
                              apply(l_polite_split[[2]],2,function(x) stats::sd(x)/sqrt(length(x)))))

  SEscaler<-stats::qnorm(1-((1-CI)/2))
  ######################################################
  nonblanks <- colnames(df_polite)[colMeans(df_polite)>=drop_blank]


  split.enough<-names(df_polite)
  if(middle_out<1){
    split.p<-unlist(lapply(names(df_polite), function(x) stats::t.test(l_polite_split[[1]][,x],
                                                                       l_polite_split[[2]][,x])$p.value))
    split.enough<-names(df_polite)[(split.p<middle_out)&(!is.na(split.p))]
  }

  if(sum((split.data$feature%in%nonblanks)&(split.data$feature%in%split.enough))==0){
    stop("All features were excluded. Adjust exclusion settings.")
  }
  split.data<-split.data[(split.data$feature%in%nonblanks)&(split.data$feature%in%split.enough),]
  ######################################################
  # Custom feature ordering
  ######################################################
  if((!is.null(features))&ordered){
    split.data$feature<-factor(split.data$feature,
                               ordered=TRUE,
                               levels=features)

  } else{
    wide<-stats::reshape(split.data,
                         idvar = "feature",
                         timevar = "cond",
                         direction = "wide")
    wide$count.total<-rowMeans(wide[,grepl("count",names(wide))])
    wide$slogodds<-slogodds(wide[,paste0("count.",split_levels[1])],
                            wide[,paste0("count.",split_levels[2])])$slor
    f.order<-unique(wide$feature)[order(wide$slogodds)]

    split.data$feature<-factor(split.data$feature,
                               ordered=TRUE,
                               levels=f.order)
  }
  ######################################################
  if(binary){
    map.type<-"Percentage of Documents Using Feature"
    split.data$se <- sqrt(((split.data$count)*(1-split.data$count))/nrow(df_polite))
    y.breaks <- seq(0,1,.25)
    y.labels <- paste0(seq(0,100,25),"%")
    y.trans <- "identity"
  } else if(averages){
    map.type<-"Feature Count per 100 Words"
    if(max(split.data$count)>20){
      tick.set<-c(1,5,10,25, 50)
    } else{
      tick.set<-c(0.1,0.5,1,2,5,10,25)
    }
    y.labels <- y.breaks <- tick.set
    y.trans <- "sqrt"
  } else {
    map.type<-"Feature Count per Document"
    if(max(split.data$count)>20){
      tick.set<-c(1,5,10,20,50,100,200,500,1000)
    }
    else{
      tick.set<-c(0.1,0.5,1,2,5,10,20,50)
    }
    y.labels <- y.breaks <- tick.set
    y.trans <- "sqrt"
  }
  ######################################################

  split.data$cond<-factor(split.data$cond,ordered=TRUE,levels=rev(split_levels))
  split.data$count_minus<-split.data$count-split.data$se*SEscaler
  split.data$count_plus<-split.data$count+split.data$se*SEscaler
  ######################################################
  ggplot2::ggplot(data=split.data,
                  ggplot2::aes(x=feature,
                               y=count,
                               fill=cond),
                  width=2) +
    ggplot2::geom_bar(position=ggplot2::position_dodge(width = 0.8),
                      stat="identity") +
    ggplot2::geom_errorbar(ggplot2::aes(ymin=count_minus, ymax=count_plus), width=0.3,
                           position=ggplot2::position_dodge(width = 0.8)) +
    ggplot2::coord_flip() +
    ggplot2::scale_x_discrete(name="", breaks=colnames(df_polite),
                              labels=gsub("."," ",colnames(df_polite),fixed=T)) +
    ggplot2::scale_fill_manual(breaks = split_levels,values=split_cols, name=split_name) +
    ggplot2::scale_y_continuous(name=map.type, breaks = y.breaks, labels=y.labels, trans = y.trans) +
    ggplot2::theme_bw(base_size=14) +
    ggplot2::ggtitle(top_title) +
    ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5),
                   panel.grid.minor.x = ggplot2::element_blank(),
                   panel.grid.major.y = ggplot2::element_blank(),
                   axis.text = ggplot2::element_text(size=20),
                   legend.title = ggplot2::element_text(size=18, face="bold"),
                   legend.text = ggplot2::element_text(size=18),
                   legend.position = "top",
                   legend.background = ggplot2::element_rect(linetype = "solid", color="black"),
                   plot.margin = ggplot2::unit(c(4,25,4,0),"points"),
                   axis.title = ggplot2::element_text(size=16),
                   text=ggplot2::element_text(family="Times"))
}