R/rMatrix.R
In userfriendlyscience: Quantitative Analysis Made Accessible

Documented in rMatrix

###########################################################
###
### R file with the function rMatrix, which provides a
### symmetric or asymmetric matrix of correlations,
### their confidence intervals, and p-values. The p-values
### can be corrected for multiple testing.
###
### File created by Gjalt-Jorn Peters. Questions? You can
### contact me through http://behaviorchange.eu.
###
###########################################################

###########################################################
### Define functions
###########################################################



#' rMatrix
#' 
#' rMatrix provides a correlation matrix with confidence intervals and a
#' p-value adjusted for multiple testing.
#' 
#' 
#' rMatrix provides a symmetric or asymmetric matrix of correlations their
#' confidence intervals, and p-values. The p-values can be corrected for
#' multiple testing.
#' 
#' @param dat A dataframe containing the relevant variables.
#' @param x Vector of 1+ variable names.
#' @param y Vector of 1+ variable names; if this is left empty, a symmetric
#' matrix is created; if this is filled, the matrix will have the x variables
#' defining the rows and the y variables defining the columns.
#' @param conf.level The confidence of the confidence intervals.
#' @param correction Correction for multiple testing: an element out of the
#' vector c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr",
#' "none"). NOTE: the p-values are corrected for multiple testing; The
#' confidence intervals are not (yet :-)).
#' @param digits With what precision do you want the results to print.
#' @param pValueDigits Determines the number of digits to use when displaying p
#' values. P-values that are too small will be shown as p<.001 or p<.00001 etc.
#' @param colspace Number of spaces between columns (only for R output, ignored
#' for LaTeX output)
#' @param rowspace Number of rows between table rows (note: one table row is 2
#' rows; only for R output, ignored for LaTeX output).
#' @param colNames colNames can be "numbers" or "names". "Names" cause
#' variables names to be printed in the heading; "numbers" causes the rows to
#' become numbered and the numbers to be printed in the heading.
#' @param output Can be "R" or "LaTeX"; if output is set to "LaTeX", the result
#' is a LaTeX table (e.g. for use in knitr).
#' @param env.LaTeX For LaTeX output, the environment can be set with
#' env.LaTeX.
#' @param pboxWidthMultiplier When using LaTeX, pboxWidthMultiplier can be used
#' to make the cells narrower or wider (1 works for anything up until 4 or 5
#' digits).
#' @return
#' 
#' An object with the input and several output variables. Most notably a number
#' of matrices: \item{r}{Pearson r values.} \item{parameter}{Degrees of
#' freedom.} \item{ci.lo}{Lower bound of Pearson r confidence interval.}
#' \item{ci.hi}{Upper bound of Pearson r confidence interval.}
#' \item{p.raw}{Original p-values.} \item{p.adj}{p-values adjusted for multiple
#' testing.}
#' @author Gjalt-Jorn Peters
#' 
#' Maintainer: Gjalt-Jorn Peters <gjalt-jorn@@userfriendlyscience.com>
#' @keywords utilities
#' @examples
#' 
#' 
#' rMatrix(mtcars, x=c('disp', 'hp', 'drat'))
#' 
#' 
#' @export rMatrix
rMatrix <- function(dat, x, y=NULL, conf.level = .95, correction = "fdr",
                    digits = 2, pValueDigits=3, colspace=2, rowspace=0,
                    colNames ="numbers",
                    output="R",
                    env.LaTeX = 'tabular',
                    pboxWidthMultiplier = 1) {
  
  ### Check whether the first vector of vectors has 
  if (length(x) < 1) {
    stop(paste0("Error: x vector has 0 elements or less; ",
                "make sure to specify at least one variable name!."));
  }  
  
  ### Check whether we have a second set of variables
  if (!is.null(y)) {
    if (length(y) < 1) {
      stop(paste0("Error: y vector has 0 elements or less; ",
                  "make sure to specify at least one variable name!."));
    }
    symmetric <- FALSE;
  }
  else {
    y <- x;
    symmetric <- TRUE;
  }
  
  ### Create object to return, and store variable names, confidence of
  ### confidence interval, and digits
  res <- list();
  res$variables.rows <-x;
  res$variables.cols <- y;
  res$ci.confidence <- conf.level;
  res$correction <- correction;
  res$digits <- digits;
  res$pValueDigits <- pValueDigits;
  res$colspace <- colspace;
  res$rowspace <- rowspace;
  res$colNames <- colNames;
  res$output <- output;
  res$env.LaTeX <-env.LaTeX;
  res$pboxWidthMultiplier <- pboxWidthMultiplier;
  
  res$r <- matrix(nrow = length(x), ncol = length(y));
  res$parameter <- matrix(nrow = length(x), ncol = length(y));
  res$ci.lo <- matrix(nrow = length(x), ncol = length(y));
  res$ci.hi <- matrix(nrow = length(x), ncol = length(y));
  res$p.raw <- matrix(nrow = length(x), ncol = length(y));
  res$p.adj <- matrix(nrow = length(x), ncol = length(y));
  
  xCounter <- 1;
  for(curXvar in x) {
    yCounter <- 1;
    for(curYvar in y) {
      curTest <- cor.test(dat[,curXvar], dat[,curYvar],
                          use="complete.obs",
                          conf.level = conf.level);
      res$r[xCounter, yCounter] <- curTest$estimate;
      res$parameter[xCounter, yCounter] <- curTest$parameter;
      res$ci.lo[xCounter, yCounter] <- curTest$conf.int[1];
      res$ci.hi[xCounter, yCounter] <- curTest$conf.int[2];
      res$p.raw[xCounter, yCounter] <- curTest$p.value;
      yCounter <- yCounter + 1;
    }
    xCounter <- xCounter + 1;
  }
  
  ### If a symmetric table was requested, remove half
  ### the correlations and the diagonal
  if (symmetric) {
    ### Remove lower half of the matrices
    res$r[lower.tri(res$r)] = NA;
    res$parameter[lower.tri(res$parameter)] = NA;
    res$ci.lo[lower.tri(res$ci.lo)] = NA;
    res$ci.hi[lower.tri(res$ci.hi)] = NA;
    res$p.raw[lower.tri(res$p.raw)] = NA;
    ### Remove diagonal in matrices
    for(diagonal in c(1:nrow(res$r))) {
      res$r[diagonal, diagonal] <- NA;
      res$parameter[diagonal, diagonal] <- NA;
      res$ci.lo[diagonal, diagonal] <- NA;
      res$ci.hi[diagonal, diagonal] <- NA;
      res$p.raw[diagonal, diagonal] <- NA;
    }
  }
  
  ### Correct p-values for multiple testing
  res$p.adj <- matrix(p.adjust(res$p.raw, method=correction), nrow(res$p.raw), ncol(res$p.raw));
  
  ### Set row and column names
  rownames(res$r) <- x;         colnames(res$r) <- y;
  rownames(res$parameter) <- x; colnames(res$parameter) <- y;
  rownames(res$ci.lo) <- x;     colnames(res$ci.lo) <- y;
  rownames(res$ci.hi) <- x;     colnames(res$ci.hi) <- y;
  rownames(res$p.raw) <- x;     colnames(res$p.raw) <- y;
  rownames(res$p.adj) <- x;     colnames(res$p.adj) <- y;
  
  ### Set class & return result
  class(res) <- "rMatrix";
  return(res);
}

print.rMatrix <- function (x, digits=x$digits, output=x$output,
                           pValueDigits = x$pValueDigits,
                           env.LaTeX = x$env.LaTeX,
                           pboxWidthMultiplier = x$pboxWidthMultiplier,
                           colNames = x$colNames, ...) {
  
  if (output=="R") {
    
    ### We want multiple lines per cell, so we'll need to print manually.
    ### We first print the confidence interval on the first line; then,
    ### on the next line, the point estimate and the p-value (corrected
    ### for multiple testing).
    ###
    ### Compute how wide the columns should be. This depends on
    ### 1) the width of the confidence intervals, and 2) the width
    ### of the variable name in each column
    ### The maximum length of confidence intervals is:
    ### [-.X; -.X]
    ### Where the number of X's is determined by digits.
    ### Thus, 8 + digits * 2 represents the max length of
    ### confidence interval.
    maxConfIntLength <- 8 + digits * 2;
    
    ### Compute max length of second row
    max2ndRowLength <- 4 + digits + 5 + pValueDigits;
    
    ### set widest content
    widestContent <- max(maxConfIntLength, max2ndRowLength);
    
    if (colNames=="numbers") {
      ### The columns contain numbers instead of names;
      ### calculate the max width of these numbers
      numColSize <- nchar(length(x$variables.rows)) + x$colspace;
      colSizes <- rep(numColSize, length(x$variables.cols));
    }
    else {
      ### Otherwise, for each column, store the length of the variable name
      ### of that column.
      colSizes <- nchar(x$variables.cols);
    }
    
    ### Then, compare these to the maxConfIntLength, and store the
    ### larger of the two
    colSizes <- ifelse(colSizes > widestContent, colSizes, widestContent);
    
    ### If pval is TRUE, we use the p-value function to format the p-values.
    ### This means that the columns need to be three characters wider, in case
    ### we'll need the scientific notation somewhere.
#     if(x$pval) {
#       colSizes <- colSizes + 3;
#     }
    ### NOTE: obsolete as of 2015-04-15 (version 0.2-3), as formatPvalue is used now

    if (colNames=="numbers") {
      ### Print spaces in first cell of first row as wide as
      ### the widest number
      cat(repeatStr(' ', numColSize + x$colspace));
    }
    
    ### First print column names. This, however, requires knowing
    ### how long the row names are going to be, so first look for
    ### the longest row name and get its length; add one as
    ### separation between the columns; and then print that
    ### number of spaces.
    leftColSize <- max(nchar(x$variables.rows)) + x$colspace;
    cat(repeatStr(" ", leftColSize));
    
    ### We'll need to print the column names with a loop (see
    ### explanation below)
    for(j in (1:length(x$variables.cols))) {
      if (colNames=="numbers") {
        ### Print column number
        cat(paste0(j, repeatStr(' ', colSizes[j] - nchar(j) + x$colspace)));
      } else {
        ### Print the column name
        cat(x$variables.cols[j]);
        ### Print trailing spaces (+x$colspace to have space between columns)
        cat(repeatStr(" ", colSizes[j] - nchar(x$variables.cols[j]) + x$colspace));
      }
    }
    
    ### Print newline character
    cat("\n");
  }
  else if (output=="LaTeX") {
    
    ### Start table
    if (colNames=="numbers") {
      cat(paste0("\\begin{", env.LaTeX, "}{rl",
                 paste0(rep('c', length(x$variables.cols)), collapse=""),
                 "}\n\\hline\n"));
    }
    else {
      cat(paste0("\\begin{", env.LaTeX, "}{l",
                 paste0(rep('c', length(x$variables.cols)), collapse=""),
                 "}\n\\hline\n"));
    }
    
    if (colNames=="numbers") {
      ### Replace variable names for the columns with numbers
      x$variables.cols <- c(1:length(x$variables.cols));
      ### Add first empty cell for the column with the row numbers
      cat(' &');
    }
      
    ### Print variable names, close line (needs four backslashes; each of the two
    ### backslashes needs to be escaped), and print a horizontal line
    cat(paste0(" & ", paste0(sanitizeLatexString(x$variables.cols), collapse=" & "), " \\\\ \n\\hline\n"));
    
    ### Compute width for pBoxes in cells (see below)
    pboxWidth <- paste0(7 + pboxWidthMultiplier * digits, "em");
    
  }
  
  ### Now we'll start printing the rows, starting with the variable
  ### name and the confidence interval.
  for(i in (1:length(x$variables.rows))) {

    if (output=="R") {
      
      if (colNames=="numbers") {
        ### Print row number
        cat(paste0(repeatStr(' ', numColSize - nchar(i)), i, repeatStr(' ', x$colspace)));
      }

      ### Print variable name for this row
      cat(x$variables.rows[i]);

      ### Print spaces needed to line up second column
      cat(repeatStr(" ", leftColSize - nchar(x$variables.rows[i])));

      ### Now we need two loops (one for each line) to create the cells.
      ### Normally, we could provide paste0 (or paste) with a vector,
      ### and it would concatenate the elements for us, but in this case,
      ### every column can have a different width, so we need a different
      ### number of leading spaces.
      
      ### First, the confidence intervals
      for(j in (1:length(x$variables.cols))) {
        ### If the point estimate is NA, don't display anything
        if (is.na(x$r[i,j])) {
          cat(repeatStr(" ", colSizes[j] + x$colspace));
        }
        else {
          ### Create confidence interval for this column
          confInt <- paste0("[", formatR(x$ci.lo[i,j], digits), "; ", formatR(x$ci.hi[i,j], digits), "]");
          ### Print confidence interval
          cat(confInt);
          ### Print trailing spaces (+ x$colspace to have space between columns)
          cat(repeatStr(" ", colSizes[j] - nchar(confInt) + x$colspace));
        }
      }
      
      ### Print newline character
      cat("\n");

      if (colNames=="numbers") {
        ### Print spaces of width of longest row number
        cat(paste0(repeatStr(' ', numColSize + x$colspace)));
      }
      
      ### Start in second column
      cat(repeatStr(" ", leftColSize));
      
      ### Then, the point estimate and p-value
      for(j in (1:length(x$variables.cols))) {
        ### If the point estimate is NA, don't display anything
        if (is.na(x$r[i,j])) {
          cat(repeatStr(" ", colSizes[j] + x$colspace));
        }
        else {
          ### Create r & p
          content <- paste0("r=", formatR(x$r[i,j], digits), ", ",
                            formatPvalue(x$p.adj[i,j], digits=pValueDigits,
                                         spaces=FALSE));
          ### Print point estimate and p-value
          cat(content);
          ### Print trailing spaces (+x$colspace to have space between columns)
          cat(repeatStr(" ", colSizes[j] - nchar(content) + x$colspace));
        }
      }
      
      ### Print newline character
      cat("\n");
      
      ### x$rowspace indicated how many empty rows should be printed between
      ### every table row
      if (x$rowspace > 1) {
        for(i in c(1:x$rowspace)) {
          cat("\n");      
        }
      }
    }
    else if (output=="LaTeX") {

      if (colNames=="numbers") {
        ### Print row number
        cat(paste0(i, ' & '));
      }
      
      ### Print variable name for this row
      cat(sanitizeLatexString(x$variables.rows[i]));
      
      ### Loop through columns
      for(j in (1:length(x$variables.cols))) {
        ### Only print if the point estimate is not NA
        if (is.na(x$r[i,j])) {
          cat(" & ");
        }
        else {
          ### We need to put the contents in a parbox,
          ### because we need two lines. The width of this
          ### parbox depends on the number of digits to
          ### display, which we calculated earlier.
          cat(paste0(" & \\parbox[t]{", pboxWidth, "}{ \\centering "));
          ### Create confidence interval for this column
          confInt <- paste0("[", formatR(x$ci.lo[i,j], digits), "; ",
                            formatR(x$ci.hi[i,j], digits), "]");
          ### Print confidence interval and newline character
          cat(paste0(confInt, " \\\\ "));
          ### Print point estimate
          content <- paste0("r=", formatR(x$r[i,j], digits), ", ",
                            formatPvalue(x$p.adj[i,j],
                                         digits=pValueDigits,
                                         spaces=FALSE));
          ### Print point estimate and p-value and close cell
          cat(content);
          ### Close cell
          cat("\\\\ } ");
        }
      }
      
      ### Print '\\' and go to next line - note that we need to
      ### escape each backslash with a backslash.
      cat(" \\\\\n");
      
    }
    
    
  }
  
  if (output=="LaTeX") {
    cat(paste0("\n\\hline\n\\end{", env.LaTeX, "}\n"));
  }
  
  invisible();
}