R/long_distance_df.R

Defines functions lddf_work lddf_check long_distance_df

Documented in lddf_check lddf_work long_distance_df

### long_distance_df -----------------------------------------------------------

#' Create a long data frame of among-sample distances
#'
#'
#' `long_distance_df` creates a long data frame of all the pairwise
#' distances from a sample distance matrix (e.g. the output of
#' [phyloseq::distance()]) with all the metadata listed for
#' each sample. Allows for easy within- and among-group boxplots, or whatever
#' other comparisons are of interest.
#'
#' @section Value: A data frame \eqn{N(N-1)} (or \eqn{N^2} if `diag = TRUE`
#'   is set) rows (where N is the number of samples) with sample IDs, metadata,
#'   and pairwise distances listed for each pair of samples. Sample ID and
#'   metadata columns have '1' or '2' appended to them so the user can tell
#'   which column belongs to which sample.
#' @param dmat A distance matrix or other diagonal matrix object with sample
#'   names as row and column names.
#' @param metadat A data frame or data frame-like object with the data set's
#'   metadata
#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
#'   that holds the sample names. Sample names should match the row/column namse
#'   of the distance matrix. If there are samples in the metadata data frame
#'   that are missing from the distance matrix, they will be excluded with a
#'   warning. If there are samples in the distance matrix that are missing from
#'   the metadata, you will get an error.
#' @param diag (`FALSE`.) Logical. Whether the diagonal elements (zeros in
#'   a distance matrix) should be included in the long data frame. Defaults to
#'   `FALSE` because we almost never want them.
#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
#'   to be appended to the metadata column names in the output. The two elements
#'   must not be identical.
#' @param distcol (`'Distance'`.) A string. The desired column name for the
#'   distance column in your long data frame. Only here to avoid clashes with
#'   existing metadata column names.
#' @param baseline (`'NULL'`). A dataframe whose column names must also be
#'   column names in the metadat data frame, and whose rows contain a subset of
#'   the possible values/combinations. If this parameter is used, all the
#'   samples whose metadata matches a row in this data frame will end up in
#'   Sample1 and the rest will end up in Sample2. This means you will _not_ get
#'   all the pairs, because the samples in Sample1 will not get compared to each
#'   other, and neither will the samples in Sample2. If this parameter is not
#'   used, the upper triangle of the distance matrix is used, without regard for
#'   metadata values.
#' @export
long_distance_df = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
                            suff = c('1','2'), distcol = 'Distance',
                            baseline = NULL){

    # The sample data object does not play nice with others
    metadat = data.frame(metadat)

    # Check the inputs
    lddf_check(dmat, metadat, idcol, diag, suff)

    # Turn this into a usable matrix
    dmat = as.matrix(dmat)
    if (is.null(baseline)){
        if (diag) {
            dmat[upper.tri(dmat)]= NA
        } else {
            dmat[!lower.tri(dmat)] = NA
        }
    } else {
        s1 = dplyr::inner_join(metadat, baseline,
                               by = colnames(baseline))[,idcol]
        s2 = dplyr::anti_join(metadat, baseline,
                              by = colnames(baseline))[,idcol]
        dmat = matrix(dmat[as.character(s1),as.character(s2)],
                         nrow = length(s1),
                         ncol = length(s2))
        rownames(dmat) = s1
        colnames(dmat) = s2
    }

    lddf = lddf_work(dmat, metadat, idcol = idcol, suff = suff,
                     distcol = distcol)

    return(lddf)
}

### lddf_check -----------------------------------------------------------------

#' Check the inputs of `long_distance_df()`
#'
#' For internal use only
lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
                            suff = c('1','2')){

    # Check inputs
    if (!is.logical(diag)){
        stop('diag must be either TRUE or FALSE')
    }

    if ((length(suff) != 2) | (suff[1] == suff[2]) | (!is.character(suff))){
        stop('suff must be a vector of two non-identical strings')
    }

    if (!(idcol %in% colnames(metadat))){
        stop('idcol must be one of the column names of metadat')
    }

    # Check sample names

    ## If it's a proper distance matrix, it will have a "Labels" attribute
    if (!is.null(attr(dmat,'Labels'))){
        d_samps = attr(dmat, 'Labels')
    } else if (!is.null(colnames(dmat))){
        if (any(colnames(dmat) != rownames(dmat))){
            stop(paste('The row and column names of dmat must be the same,',
                        'and in the same order.'))
        } else {
            d_samps = colnames(dmat)
        }
    }

    if (!all(d_samps %in% as.character(metadat[,idcol]))){
        stop('Some samples missing from metadata')
    } else if (!all(as.character(metadat[,idcol]) %in% d_samps)){
        warning(paste('Metadata samples that are not in the distance matrix',
                        'will be excluded.'))
    }
}


### lddf_work ------------------------------------------------------------------

#' Does the actual gathering and spreading without testing assumptions
#'
#' `lddf_work` Used internally by `long_distance_df()`. I recommend
#' you use that function unless you really know what you're doing. This function
#' does the actual gathering, spreading, and joining associated with making the
#' lddf, but without checking if the distance matrix is sensible or removing
#' diagonals and repeats. Use this function if you know exactly what you want
#' and have trimmed your distance matrix down to only what you know you need.
#' Good for permutation tests.
#'
#' @param dmat A distance matrix or other diagonal matrix object with sample
#'   names as row and column names.
#' @param metadat A data frame or data frame-like object with the data set's
#'   metadata
#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
#'   that holds the sample names. Sample names should match the row/column namse
#'   of the distance matrix. If there are samples in the metadata data frame
#'   that are missing from the distance matrix, they will be excluded with a
#'   warning. If there are samples in the distance matrix that are missing from
#'   the metadata, you will get an error.
#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
#'   to be appended to the metadata column names in the output. The two elements
#'   must not be identical.
#' @param distcol (`'Distance'`.) A string. The desired column name for the
#'   distance column in your long data frame. Only here to avoid clashes with
#'   existing metadata column names.
lddf_work = function(dmat, metadat, idcol = 'X.SampleID', suff = c('1','2'),
                     distcol = 'Distance'){
    # Make it long
    dmat %>>%
        {data.frame(ID1 = rownames(.),.)} %>%
        tidyr::gather(ID2, Distance, 2:(ncol(dmat)+1), na.rm = TRUE) -> distlong

    ids = paste(idcol,suff, sep = '')
    names(distlong)[1:2] = ids

    # Add the metadata columns for the first sample
    distlong %>%
        dplyr::inner_join(metadat, by = setNames(c(idcol), ids[1])) -> distlong

    # Add the suffix to the column names
    cn = colnames(distlong)
    cn = ifelse(cn %in% colnames(metadat),
                paste(cn, suff[1], sep = ''),
                cn)
    colnames(distlong) = cn

    # Add the metadata columns to the second sample
    distlong %>%
        dplyr::inner_join(metadat, by = setNames(c(idcol), ids[2])) -> distlong

    # Add the suffix to the column names
    cn = colnames(distlong)
    cn = ifelse(cn %in% colnames(metadat),
                paste(cn, suff[2], sep = ''),
                cn)
    colnames(distlong) = cn

    return(distlong)
}
JCSzamosi/aftersl1p documentation built on July 3, 2025, 8:37 p.m.