Nothing
# compute correlation matrix:
# - automatically removes non-numeric variables from the data
# - automatically does "pairwise.complete.obs" for handling missing data
# - can report the result of correlation tests if requested
# - can do pearson, spearman and kendall
# - defaults to no test, and to pearson correlation
#' Correlation matrices
#'
#' @description Computes a correlation matrix and runs hypothesis tests with corrections for multiple comparisons
#'
#' @param x Matrix or data frame containing variables to be correlated
#' @param y Optionally, a second set of variables to be correlated with those in \code{x}
#' @param test Should hypothesis tests be displayed? (Default=\code{FALSE})
#' @param corr.method What kind of correlations should be computed? Default is \code{"pearson"}, but \code{"spearman"} and \code{"kendall"} are also supported
#' @param p.adjust.method What method should be used to correct for multiple comparisons. Default value is \code{"holm"}, and the allowable values are the same as for \code{\link{p.adjust}}
#'
#' @details The \code{correlate} function calculates a correlation matrix
#' between all pairs of variables. Much like the \code{cor} function, if the
#' user inputs only one set of variables (\code{x}) then it computes all
#' pairwise correlations between the variables in \code{x}. If the user
#' specifies both \code{x} and \code{y} it correlates the variables in
#' \code{x} with the variables in \code{y}.
#'
#' Unlike the \code{cor} function, \code{correlate} does not generate an
#' error if some of the variables are categorical (i.e., factors). Variables
#' that are not numeric (or integer) class are simply ignored. They appear in
#' the output, but no correlations are reported for those variables. The
#' decision to have the \code{correlate} function allow the user a little
#' leniency when the input contains non-numeric variables should be explained.
#' The motivation is pedagogical rather than statistical. It is sometimes the
#' case in psychology that students need to work with correlation matrices
#' before they are comfortable subsetting a data frame, so it is convenient
#' to allow them to type commands like \code{correlate(data)} even when
#' \code{data} contains variables for which Pearson/Spearman correlations
#' are not appropriate. (It is also useful to use the output of
#' \code{correlate} to illustrate the fact that Pearson correlations should
#' not be used for categorical variables).
#'
#' A second difference between \code{cor} and \code{correlate} is that
#' \code{correlate} runs hypothesis tests for all correlations in the
#' correlation matrix (using the \code{cor.test} function to do the work).
#' The results of the tests are only displayed to the user if
#' \code{test=TRUE}. This is a pragmatic choice, given the (perhaps
#' unfortunate) fact that psychologists often want to see the results
#' of these tests: it is probably not coincidental that the \code{corr.test}
#' function in the \pkg{psych} package already provides this functionality
#' (though the output is difficult for novices to read).
#'
#' The concern with running hypothesis tests for all elements of a correlation
#' matrix is inflated Type I error rates. To minimise this risk, reported
#' p-values are adjusted using the Holm method. The user can change this
#' setting by specifying \code{p.adjust.method}. See \code{\link{p.adjust}}
#' for details.
#'
#' Missing data are handled using pairwise complete cases.
#'
#' @return The printed output shows the correlation matrix, and if tests are
#' requested it also reports a matrix of p-values and sample sizes associated
#' with each correlation (these can vary if there are missing data). The
#' underlying data structure is an object of class \code{correlate} (an S3
#' class). It is effectively a list containing four elements:
#' \code{correlation} is the correlation matrix, \code{p.value} is the matrix
#' of p-values, \code{sample.size} is the matrix of sample sizes, and
#' \code{args} is a vector that stores information about what the user
#' requested.
#'
#' @export
#'
#' @examples
#' # data frame with factors and missing values
#' data <- data.frame(
#' anxiety = c(1.31,2.72,3.18,4.21,5.55,NA),
#' stress = c(2.01,3.45,1.99,3.25,4.27,6.80),
#' depression = c(2.51,1.77,3.34,5.83,9.01,7.74),
#' happiness = c(4.02,3.66,5.23,6.37,7.83,1.18),
#' gender = factor( c("male","female","female","male","female","female") ),
#' ssri = factor( c("no","no","no",NA,"yes","yes") )
#' )
#'
#' # default output is just the (Pearson) correlation matrix
#' correlate( data )
#'
#' # other types of correlation:
#' correlate( data, corr.method="spearman" )
#'
#' # two meaningful subsets to be correlated:
#' nervous <- data[,c("anxiety","stress")]
#' happy <- data[,c("happiness","depression","ssri")]
#'
#' # default output for two matrix input
#' correlate( nervous, happy )
#'
#' # the same examples, with Holm-corrected p-values
#' correlate( data, test=TRUE )
#' correlate( nervous, happy, test=TRUE )
#'
correlate <- function( x, y=NULL, test=FALSE, corr.method="pearson", p.adjust.method="holm" ) {
# did the user specify two input matrices?
two.inputs <- !is.null(y)
# allow numeric vectors
if( is.vector(x) & is.numeric(x) ) {
x <- data.frame(x)
call <- match.call()
n <- call[[2]]
names(x) <- "x.var"
if(class(n) == "name") names(x) <- as.character(n)
}
if( two.inputs & is.vector(y) & is.numeric(y) ) {
y <- data.frame(y)
call <- match.call()
n <- call[[3]]
names(y) <- "y.var"
if(class(n) == "name") names(y) <- as.character(n)
}
# check input matrices are data frames
if( !(class(x) %in% c("matrix","data.frame") )) {
stop( 'x must be a matrix or data frame' )
}
if( two.inputs & !(class(y) %in% c("matrix","data.frame") )) {
stop( 'y must be a matrix or data frame' )
}
# warn if x and y appear to have the same variables!
# coerce to data frame
if( class(x) =="matrix" ) x <- as.data.frame(x)
if( two.inputs & class(y) == "matrix") y <- as.data.frame(y)
# store other args
args <- c( two.inputs=two.inputs, test=test,
corr.method=corr.method, p.adjust.method=p.adjust.method )
# define a function to run correlation test and trap warning message
getCT <- function( x,y, method ) {
# get the correlation test, trapping the ties problem warning as needed...
old.warn <- options(warn=2) # convert warnings to errors
ct <- try( stats::cor.test( x, y, method=method), silent=TRUE ) # try the correlation
tp <- FALSE # assume no ties problem unless...
# check for failures:
if( class(ct) == "try-error" ) {
# handle the case when the "error" was a ties warning
tp <- length( grep("exact p-value with ties",ct )) > 0
if( tp ) { # if it was a ties problem...
options(warn=-1) # suppress warnings completely for the next run...
} else { # if not...
options( old.warn ) # reset warning state and let R throw what it likes...
}
# now run the test again...
ct <- stats::cor.test( x, y, method=method )
}
options( old.warn ) # reset warnings to original state
return( list( ct=ct, tp=tp) )
}
if( !two.inputs ) {
#### one matrix case ###
# drop categorical variables
classes <- sapply(x,"class") # get the classes
inds <- which( classes %in% c("integer","numeric")) # retain only int and num
n.vars <- dim(x)[2] # number of variables
n.cont <- length(inds) # number of continuous variables
# initialise the output list with empty matrices
R <- list( correlation = matrix( NA, n.vars, n.vars) )
rownames( R$correlation ) <- colnames( R$correlation ) <- colnames(x)
R$sample.size <- R$p.value <- R$correlation
R$args <- args
R$tiesProblem <- FALSE
# run pairwise tests (inefficient looping!)
for( a in 1:(n.cont-1) ){
for( b in (a+1):n.cont) {
i <- inds[a] # the a-th continuous variable
j <- inds[b] # the b-th continuous variable
# ct <- cor.test( x[,i], x[,j], method=corr.method )
cttp <- getCT( x[,i], x[,j], corr.method )
# store the output
R$tiesProblem <- R$tiesProblem | cttp$tp
R$correlation[j,i] <- R$correlation[i,j] <- cttp$ct$estimate
R$p.value[j,i] <- R$p.value[i,j] <- cttp$ct$p.value
}
}
# adjust p (inefficient duplication here)
upper.inds <- upper.tri(R$p.value) & !is.na(R$p.value) # cells containing p-values, upper
R$p.value[upper.inds] <- stats::p.adjust( R$p.value[upper.inds], method=p.adjust.method )
lower.inds <- lower.tri(R$p.value) & !is.na(R$p.value) # cells containing p-values, lower
R$p.value[lower.inds] <- stats::p.adjust( R$p.value[lower.inds], method=p.adjust.method )
# fill in sample sizes for individual variables
for( i in 1:n.vars ) {
R$sample.size[i,i] <- sum(!is.na(x[,i]))
}
# and pairwise sample sizes for all variables
for( i in 1:(n.vars-1) ) {
for( j in (i+1):n.vars) {
R$sample.size[j,i] <- R$sample.size[i,j] <- sum(!(is.na(x[,i]) | is.na(x[,j])))
}
}
} else {
#### two matrix case ###
# track only the categorical variables
inds.x <- which(sapply(x,"class") %in% c("integer","numeric"))
inds.y <- which(sapply(y,"class") %in% c("integer","numeric"))
# variable numbers
n.vars.x <- dim(x)[2] # number of variables in x
n.vars.y <- dim(y)[2] # number of variables in y
n.cont.x <- length(inds.x) # number of continuous variables in x
n.cont.y <- length(inds.y) # number of continuous variables in y
# initialise the output list
R <- list( correlation = matrix( NA, n.vars.x, n.vars.y) )
rownames( R$correlation ) <- colnames(x)
colnames( R$correlation ) <- colnames(y)
R$sample.size <- R$p.value <- R$correlation
R$args <- args
R$tiesProblem <- FALSE
# run pairwise tests (inefficient looping!)
for( a in 1:n.cont.x ){
for( b in 1:n.cont.y) {
i <- inds.x[a] # the a-th continuous variable in x
j <- inds.y[b] # the b-th continuous variable in y
#ct <- cor.test( x[,i], y[,j], method=corr.method ) # run the test
cttp <- getCT( x[,i], y[,j], corr.method )
# store the output
R$tiesProblem <- R$tiesProblem | cttp$tp
R$correlation[i,j] <- cttp$ct$estimate
R$p.value[i,j] <- cttp$ct$p.value
# store sample size
R$sample.size[i,j] <- sum(!(is.na(x[,i]) | is.na(y[,j])))
}
}
# adjust p
R$p.value <- matrix( stats::p.adjust( R$p.value, method=p.adjust.method ),
n.vars.x, n.vars.y,
dimnames=dimnames(R$p.value) )
}
# define an S3 class
class(R) <- c("correlate","list")
return(R)
}
# print method
#' Print method for correlate objects
#'
#' @param x An object of class 'correlate'
#' @param ... For consistency with the generic (unused)
#'
#' @return Invisibly returns the original object
#' @export
print.correlate <- function( x, ... ){
# fixed properties that should probably be converted to
# input arguments?
nDigits <- 3
naPrint <- "."
# function to force equal digit printing
makeTxt <- function(x, nDigits, naPrint ) {
n <- dim(x)
format <- paste0("%.",nDigits,"f")
txt <- sprintf( format, x)
txt <- gsub("NA", naPrint, txt, fixed=TRUE)
txt <- matrix(txt,n[1],n[2], dimnames=dimnames(x))
return(txt)
}
# function to print the text matrix
printTxt <- function( txt ) {
print.default( txt, quote=FALSE, right=TRUE)
}
# print the correlations
cat("\n")
cat("CORRELATIONS\n")
cat("============\n")
cat("- correlation type: ", x$arg["corr.method"], "\n")
cat("- correlations shown only when both variables are numeric\n")
cat("\n")
if( options()$show.signif.stars & x$arg["test"]==TRUE ) { # if significance stars needed...
# function to return the significance stars string
getSigString <- function(p) {
if( is.na(p) | p > .1 ) return( " " )
if( p > .05 ) return( ". " )
if( p > .01 ) return( "* " )
if( p > .001 ) return( "**" )
return( "***" )
}
# function to generate interleaved indices
interleave <- function(n){
ord <- vector()
ord[ seq(1,2*n-1,2) ] <- 1:n
ord[ seq(2,2*n,2)] <- (n+1):(2*n)
return(ord)
}
# print correlation matrix with sig stars
n <- dim( x$correlation )[2] # number of columns
txt <- makeTxt( x$correlation, nDigits, naPrint ) # text form of the correlation matrix
for( i in 1:n ) txt[,i] <- paste0(txt[,i], sapply(x$p.value[,i], getSigString)) # append stars
colnames(txt) <- paste0( colnames(x$correlation), rep.int(" ",n)) # column names
rownames(txt) <- rownames( x$correlation ) # row names
printTxt(txt)
# print the key
cat("\n---\n")
cat("Signif. codes: . = p < .1, * = p<.05, ** = p<.01, *** = p<.001\n")
} else { # if no significance stars needed...
txt <- makeTxt( x$correlation, nDigits, naPrint )
printTxt( txt )
}
if( x$arg["test"]==TRUE ) {
# print the p.values
cat("\n\np-VALUES\n")
cat("========\n")
if( x$args["two.inputs"] ) { nTests <- sum( !is.na( x$correlation))
} else { nTests <- sum( !is.na( x$correlation)) / 2 }
cat("- total number of tests run: ", nTests, "\n")
cat("- correction for multiple testing: ", x$arg["p.adjust.method"],"\n")
if( x$tiesProblem ) {
cat("- WARNING: cannot compute exact p-values with ties\n" )
}
cat("\n")
# print p-values, forcing nDigits
txt <- makeTxt( x$p.value, nDigits, naPrint )
printTxt( txt )
# print the sample sizes
cat("\n\nSAMPLE SIZES\n")
cat("============\n")
cat("\n")
txt <- makeTxt( x$sample.size, nDigits=0, naPrint )
printTxt( txt )
}
# invisbly return the input
return( invisible(x) )
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.