R/overlap.R

#' Calculate absolute number or fraction of unique 2-way overlap.
#'
#' \code{overlap.2way} quantifies the amount of unique overlap between two input vectors.
#'
#' @param vec1       a vector of non-zero length. Vector can contain non-unique entries.
#'                   E.g.: sequence reads, clonal assignment numbers.
#' @param vec2       a vector of non-zero length. Vector can contain non-unique entries.
#' @param returnType whether to return the absolute number or the fraction of overlap.
#'                   Abbreviations accepted.
#' @return           A whole number or a fraction quantifying the amount of unique 2-way
#'                   overlap between \code{vec1} and \code{vec2}. \code{NA} if either of
#'                   the input vectors has length zero.
#' @details          For \code{returnType="frac"}, the denominator used is the smaller
#'                   number of the two counts of unique objects in \code{vec1} and
#'                   \code{vec2}.
#' @examples
#' # vec1 has 5 unique entries; vec2 has 2
#' # absolute number of unique overlapping entries is 2
#' overlap.2way(vec1=1:5, vec2=c(2,4,4), returnType="a")
#'
#' # fraction of unique overlapping entries is 1 (2/2)
#' # the number of unique entries in vec2 is used as the denominator
#' overlap.2way(vec1=1:5, vec2=c(2,4,4), returnType="f")
#' @export
overlap.2way = function(vec1, vec2, returnType=c("abs", "frac")) {
    returnType = match.arg(returnType)

    if (length(vec1)==0 | length(vec2)==0) {
        return(NA)
    }

    n.uniq.1 = length(unique(vec1))
    n.uniq.2 = length(unique(vec2))
    n.uniq.union = length(unique(c(vec1, vec2)))
    n.id = n.uniq.1 + n.uniq.2 - n.uniq.union
    if (returnType=="frac") {
        # weighted avg; weights = n.uniq.i/(n.uniq.1+n.uniq.2)
        # frac = n.id*2 / (n.uniq.1+n.uniq.2)

        # use smaller n.uniq.i as denominator
        frac = n.id / min(n.uniq.1, n.uniq.2)

        return(frac)
    } else if (returnType=="abs") {
        return(n.id)
    }
}


#' Calculate within-subject 2-way and 3-way clonal overlap.
#'
#' \code{overlap.clonal} calculates the overlap in terms of clonal assignment between
#' samples from the same subject.
#'
#' @param df         \code{data.frame} containing columns specified by \code{col.clone}
#'                   and \code{col.group}.
#' @param col.clone  a \code{character}. Name of column containing clonal assignments.
#' @param col.group  a \code{character}. Name of column containing sample information.
#' @return           A \code{list} consisting of:
#'                   \itemize{
#'                   \item \code{mtx.all}: a \code{data.frame} in which each row tabulates the number
#'                   of sequences from each sample assigned to the clone represented by
#'                   that row. Row names correspond to clonal assignments. Column names
#'                   correspond to names of samples.
#'                   \item \code{count.2way}: a vector indicating the counts of 2-way clonal
#'                   overlap. \code{NULL} if there are fewer than 2 unique samples in column
#'                   specified by \code{col.group}.
#'                   \item \code{mtx.2way}: a list of \code{data.frame}s; each matrix corresponds to a
#'                   different sample-sample combination and includes clones to which both
#'                   samples are assigned. \code{NULL} if there are fewer than 2 unique samples
#'                   in column specified by \code{col.group}.
#'                   \item \code{count.3way}: a vector indicating the counts of 3-way clonal
#'                   overlap. \code{NULL} if there are fewer than 3 unique samples in column
#'                   specified by \code{col.group}.
#'                   \item \code{mtx.3way}: a list of \code{data.frame}s; each matrix corresponds to a
#'                   different sample-sample-sample combination and includes clones to which
#'                   all three samples are assigned. \code{NULL} if there are fewer than 3
#'                   unique samples in column specified by \code{col.group}.
#'                   \item \code{count.uniq}: a vector indicating the counts of unique clones
#'                   in each sample.
#'                   }
#' @details          The same results can be expected from
#'                   \itemize{
#'                   \item \code{overlap.clonal(df, "CLONE", "SAMPLE")$count.2way["A-D"]}
#'                   \item \code{overlap.2way(df$CLONE[df$SAMPLE=="D"], df$CLONE[df$SAMPLE=="A"], returnType="abs")}.
#'                   }                   
#' @examples
#' # TBA
#' @export
overlap.clonal = function(df, col.clone, col.group) {
    stopifnot( (col.clone %in% colnames(df)) &
               (col.group %in% colnames(df)) )

    # get a matrix documenting distribution of sequences from a clone
    # into different groups
    # row: unique clone
    # column: groups
    clones = sort(unique(df[, col.clone]))
    groups = sort(unique(df[, col.group]))

    mtx = matrix(NA, nrow=length(clones), ncol=length(groups),
                 dimnames=list(clones, groups))

    for (i in 1:length(clones)) {
        cur.tab = table(df[df[, col.clone]==clones[i], col.group])
        mtx[i, match(names(cur.tab), colnames(mtx))] = cur.tab
    }

    if (length(groups)>=2) {
        # get a vector counting 2-way overlap; and
        # extract into a list rows from mtx corresponding to said overlaps
        count.vec.2 = vector(mode="integer")
        ov.list.2 = vector(mode="list")

        combo.2 = utils::combn(x=groups, m=2)
        for (i in 1:ncol(combo.2)) {
            # name for current overlap
            cur.name = paste0(combo.2[, i], collapse="-")
            # for each row/clone in mtx, count # NAs across columns involving current groups
            cur.na.count = apply(mtx[, combo.2[, i]], 1, function(x){sum(is.na(x))})
            # 0 NA in a row means clone is in both groups
            count.vec.2 = c(count.vec.2, sum(cur.na.count==0))
            names(count.vec.2)[length(count.vec.2)] = cur.name
            # store these clones
            ov.list.2 = c(ov.list.2, list(data.frame(mtx[cur.na.count==0, ])))
            names(ov.list.2)[length(ov.list.2)] = cur.name
        }
    } else {
        count.vec.2 = NULL
        ov.list.2 = NULL
    }


    if (length(groups)>=3) {
        # get a vector counting 3-way overlap; and
        # extract into a list rows from mtx corresponding to said overlaps
        count.vec.3 = vector(mode="integer")
        ov.list.3 = vector(mode="list")

        combo.3 = utils::combn(x=groups, m=3)
        for (i in 1:ncol(combo.3)) {
            # name for current overlap
            cur.name = paste0(combo.3[, i], collapse="-")
            # for each row/clone in mtx, count # NAs across columns involving current groups
            cur.na.count = apply(mtx[, combo.3[, i]], 1, function(x){sum(is.na(x))})
            # 0 NA in a row means clone is in all three groups
            count.vec.3 = c(count.vec.3, sum(cur.na.count==0))
            names(count.vec.3)[length(count.vec.3)] = cur.name
            # store these clones
            ov.list.3 = c(ov.list.3, list(data.frame(mtx[cur.na.count==0, ])))
            names(ov.list.3)[length(ov.list.3)] = cur.name
        }
    } else {
        count.vec.3 = NULL
        ov.list.3 = NULL
    }

    return(list(mtx.all=data.frame(mtx),
                count.2way=count.vec.2, mtx.2way=ov.list.2,
                count.3way=count.vec.3, mtx.3way=ov.list.3,
                count.uniq=sapply(groups, function(g){
                    length(unique(df[df[, col.group]==g, col.clone]))
                    }) ))
}
julianqz/igtracker documentation built on May 20, 2019, 4:21 a.m.