R/Fluidigm-methods.R

Defines functions primerAverage .internalfilter mast_filter getrc getss getwss getConcordance numexp condSd condmean freq checkGroups logshift logmean expavg

Documented in condmean condSd expavg freq getConcordance getrc getss getwss logmean mast_filter numexp primerAverage

##' Exponential average
##'
##' Puts log transformed values onto natural scale and takes mean of vector.
##' Calculates mean(2^x - 1)
##' @param x \code{numeric}
##' @return \code{numeric}
##' @examples
##' x <- 1:10
##' logmean(expavg(x))
##' @export
expavg <- function(x) mean(2^x-1)

##' Log mean
##'
##' Takes mean of natural scaled values and then logrithm
##' Approximately the inverse operation of \code{\link{expavg}}
##' Calculates log2(mean(x) + 1)
##' @param x \code{numeric}
##' @return \code{numeric}
##' @examples
##' x <- 1:10
##' expavg(logmean(x))
##' @export
logmean <- function(x) log2(mean(x)+1)

logshift <- function(x) log2(x+1)

                                        #try to throw an error if groups isn't in cellData
                                        #groups can be character vector or symbol (quote(Group1:Group2), used for lattice)
checkGroups <- function(sc, groups){
    if(!missing(groups) && !is.null(groups)){
        if(!is.character(groups) || is.factor(groups))
            stop("'groups' must be character or factor")
        sd <- setdiff(groups, names(colData(sc)))
        if(length(sd)>0) stop(sprintf('%s not found in %s object', paste(sd, collapse=', '), class(sc)))
    }
    invisible(TRUE)
}

##' Summary statistics for genes in an experiment
##'
##' \code{freq} returns the frequency of expression, i.e., the proportion of non-zero values in \code{sc}.
##' NAs can be optionally removed
##' @param sc SingleCellAssay
##' @param na.rm should NAs be removed, or carried through?
##' @return vector of proportions
##' @examples
##' data(vbetaFA)
##' freq(vbetaFA)
##' condmean(vbetaFA)
##' @export
freq <- function(sc, na.rm=TRUE){
    stopifnot(is(sc, 'SingleCellAssay'))
    apply(t(assay(sc))>0, 2, mean, na.rm=na.rm)
}

##' @describeIn freq Report the mean non-zero expression value for each gene. NAs are always removed.
##' @export
condmean <- function(sc){
    stopifnot(is(sc, 'SingleCellAssay'))
    exprsNA <- t(assay(sc))
    exprsNA[exprsNA==0] <- NA
    apply(exprsNA, 2, mean, na.rm=TRUE)
}

##' @describeIn freq Report standard deviation of expression, for positive et for each gene. NAs are always removed.
##' @export
condSd <- function(sc){
    stopifnot(is(sc, 'SingleCellAssay'))
    exprsNA <- t(assay(sc))
    exprsNA[exprsNA==0] <- NA
    sqrt(apply(exprsNA, 2, var, na.rm=TRUE))
}

##' @describeIn freq Report number of expressing cells ($>0$) per gene. NAs are removed.
##' @export
numexp <- function(sc){
    stopifnot(is(sc, 'SingleCellAssay'))
    apply(t(assay(sc))>0, 2, sum, na.rm=TRUE)
}

##' Get the concordance between two experiments
##'
##' Return the concordance between two assays (i.e. single cell and hundred cell).
##' The "average" of \code{singleCellRef} (after adjusting for the number of cells) and
##' \code{singleCellComp} are taken per gene, per \code{groups}.
##' A \code{data.frame} with one row per gene-\code{groups} is returned with some additional columns.
##' @param singleCellRef "reference" SingleCellAssay
##' @param singleCellcomp "comparison" SingleCellAssay
##' @param groups character vector giving variable(s) on which the comparison is conditioned
##' @param fun.natural function to transform the SingleCellAssays to a mRNA proportional level
##' @param fun.cycle inverse function of fun.natural
##' @return concordance between two assays
##' @author Andrew McDavid
##' @seealso \link{plotSCAConcordance}
##' @examples
##' data(vbetaFA)
##' sca1 <- subset(vbetaFA, ncells==1)
##' sca100 <- subset(vbetaFA, ncells==100)
##' concord <- getConcordance(sca1, sca100)
##' getss(concord)
##' getrc(concord)
##' @export 
getConcordance <- function(singleCellRef, singleCellcomp, groups=NULL, fun.natural=expavg, fun.cycle=logmean){
    ## vector of groups over which we should aggregate
    if(!(is(singleCellRef, 'FluidigmAssay') && is(singleCellcomp, 'FluidigmAssay'))){
        stop("singleCellRef and singleCellComp should be SingleCellAssay objects")
    }
    scL <- list(singleCellRef, singleCellcomp)
    castL <- list()

    for(i in seq_along(scL)){
        checkGroups(scL[[i]], groups)
        terms1 <- union(groups, "ncells")
        lhs1 <- paste(c(terms1, "primerid"), collapse="+")
        firstForm <- formula(sprintf("%s ~.", lhs1))
        ##should look like Patient.ID + ... + n.cells ~ PrimerID
        m <- as(scL[[i]], 'data.table')
        tmp <- reshape2::dcast(m, firstForm, fun.aggregate=fun.natural)
        ##exponential average per gene, scaled by number of cells
        if(inherits(m[['ncells']], 'factor')){
            warning("ncells is a factor rather than numeric.\n I'll continue, but this may cause problems down the line")
            tmp[['ncells']] <- as.numeric(as.character(tmp[["ncells"]]))
        }
        tmp[["."]] <- tmp[["."]]/tmp[['ncells']]
        rhs2 <- union(groups, "primerid")
        terms2 <- sprintf("%s ~ .", paste(rhs2, collapse="+"))
        secondForm <- formula(terms2)
        nexp = reshape2::dcast(m, secondForm, fun.aggregate=function(x){sum(x>0)}, value.var="value")
                                        #put back on Et scale. fun.cycle adds 1 so -Inf becomes 0 on natural scale
        castL[[i]] <- reshape2::dcast(reshape2::melt(tmp), secondForm, fun.aggregate=fun.cycle)
        renamestr <- c('.'='et')
        castL[[i]] <- plyr::rename(castL[[i]], renamestr)
        castL[[i]] <- cbind(castL[[i]], nexp=nexp[['.']])
    }
    concord <- merge(castL[[1]], castL[[2]], by=c(groups, 'primerid'), suffixes=c(".ref", ".comp"), all=TRUE)
    concord
}

if(getRversion() >= "2.15.1") globalVariables(c('et.ref', 'et.comp'))

##' @describeIn getConcordance getrc the sum of squares, weighted by nexp
##' @param nexp number of expressed cells per row in \code{concord}
##' @export
getwss <- function(concord, nexp){
    mean((concord$et.ref - concord$et.comp)^2*concord$nexp.ref, na.rm=TRUE)
}

##' @describeIn getConcordance return the sum of squares
##' @export
getss <- function(concord){
    mean((concord$et.ref - concord$et.comp)^2, na.rm=TRUE)
                                        #  log2(mean(((2^concord$et.ref-1)-(2^concord$et.comp-1))^2,na.rm=TRUE))
                                        #  mean(with(concord,{a<-(et.ref>0);b<-(et.comp>0);a*b*(et.ref-et.comp)^2+a*(1-b)+b*(1-a)}),na.rm=TRUE)

}

##' @describeIn getConcordance Return Lin's (1989) concordance correlation coefficient
##' @param concord data.frame returned by getConcordance
##' @export
getrc <- function(concord){
    with(concord, {foo<-na.omit(cbind(et.ref=et.ref,et.comp=et.comp));2*cov(foo[,"et.ref"], foo[,"et.comp"])/(var(foo[,"et.ref"])+var(foo[,"et.comp"])+(mean(foo[,"et.ref"])-mean(foo[,"et.comp"]))^2)})
                                        #concordance on log scale but treating NAs
                                        #    with(concord,{foo<-na.omit(2^cbind(et.ref,et.comp)-1);2*cov(foo[,"et.ref"],foo[,"et.comp"])/(var(foo[,"et.ref"])+var(foo[,"et.comp"])+(mean(foo[,"et.ref"])-mean(foo[,"et.comp"]))^2)})
}

##' Filter a SingleCellAssay
##'
##' Remove, or flag wells that are outliers in discrete or continuous space.
##'
##' The function filters wells that don't pass filtering criteria described in filt_control.
##' filt_control is a list with named elements \code{nOutlier}
##' (minimum nmber of outlier cells for a cell to be filtered [default = 2]
##' \code{sigmaContinuous} (the z-score outlier threshold for the continuous part of the signal) [default = 7]
##' and \code{sigmaProportion} (the z-score outlier threshold for the discrete part of the signal) [default = 7].
##'
##' If \code{groups} is provided, the filtering is calculated within each level of the group, then combined again as output.
##' @param sc The \code{SingleCellAssay} object
##' @param groups An optional \code{character} naming the grouping variable
##' @param filt_control The \code{list} with configuration parameters for the filter.
##' @param apply_filter \code{logical} should the filter be applied, or should a matrix of booleans giving if a well would be subject to a filtering criteria be returned?
##' @return A filtered result
##' @author Andrew McDavid
##' @seealso burdenOfFiltering
##' @examples
##' data(vbetaFA)
##' ## Split by 'ncells', apply to each component, then recombine
##' vbeta.filtered <- mast_filter(vbetaFA, groups='ncells')
##' ## Returned as boolean matrix
##' was.filtered <- mast_filter(vbetaFA, apply_filter=FALSE)
##' ## Wells filtered for being discrete outliers
##' head(subset(was.filtered, pctout))
##' @export
mast_filter <- function(sc, groups=NULL, filt_control=NULL, apply_filter=TRUE){
    default_filt <- list(filter=TRUE, nOutlier=2, sigmaContinuous=7, sigmaProportion=7, sigmaSum=NULL, K=1.48)
    if (is.null(filt_control)){
        filt_control <- list()
    }
    missingControl <- setdiff(names(default_filt), names(filt_control))
    filt_control[missingControl] <- default_filt[missingControl]

    if (!is.null(groups)) {
        checkGroups(sc, groups)
        scL <- split(sc, groups)
        lapp <- lapply(scL, mast_filter, groups=NULL, filt_control=filt_control, apply_filter=apply_filter)
        ## Do various things with lapp:
        if(apply_filter && filt_control$filter){
            ## list of SingleCellAssays
            out <- do.call(cbind, lapp)
        } else if(filt_control$filter){ # we are a data.frame
            out <- do.call(rbind, lapp)     #Fix order, argh
            out <- out[match(getwellKey(sc), rownames(out)),] #test this
        } else{                           #I'd reckon it's an unapplied filterset, we'll just keep it as a list
            out <- lapp
        }
        return(out)
    }

    exprs <- t(assay(sc))
    filtered <- do.call(.internalfilter, c(list(exprs), filt_control))
    if(apply_filter && filt_control$filter){
        anyfilter <- apply(filtered, 1, any)
        scout <- sc[,!anyfilter]
    } else{
        scout <- filtered
    }

                                        #if filt_control$filter==TRUE & apply_filter==FALSE
                                        #then rename the retured rows using the wellKey mapping
                                        #if(filt_control$filter && !apply_filter){
                                        #  wk<-as.list(names(getwellKey(sc)))
                                        #  names(wk)<-getwellKey(sc)
                                        #  rownames(scout)<-data.frame(idvars=do.call(rbind,wk[rownames(scout)]))[,1]
                                        #}
    scout
}

                                        #rows are cells, columns are genes
                                        #exprs is a matrix of cells x genes for the strata in which the filtering should occur
                                        #filter optionally return a logical vector or the statistics
.internalfilter <- function(exprs, filter=TRUE, nOutlier=2, sigmaContinuous=7, sigmaProportion=7, sigmaSum=NULL,K=1.48){
    exprs[exprs==0]<-NA
    medianC = apply(exprs, 2, median, na.rm=TRUE)
    medianDev = apply(t(abs(t(exprs) - medianC)), 2, median,na.rm=TRUE)
    z.exprs = t((t(exprs) - medianC)/(medianDev*K+.001))

    ztot =apply(abs(z.exprs)>sigmaContinuous, 1, sum, na.rm=TRUE)

    outlier = ztot >= nOutlier

    if(!is.null(sigmaSum) && !is.na(sigmaSum)){
        zsum <- apply(abs(z.exprs), 1, sum, na.rm=TRUE)
        outlier <- zsum > sigmaSum
    }

    fet0 <- apply(!is.na(exprs), 1, mean)
    null<- fet0==0
    fet0 <- asin(sqrt(fet0))
    medianFet0 <- median(fet0[!null], na.rm=TRUE)
    medianDev.fet0 <- median(abs(fet0[!null] - medianFet0), na.rm=TRUE)
    z.fet0 <- (fet0 - medianFet0)/(medianDev.fet0*K + .001)

    unlikely = abs(z.fet0) > sigmaProportion


    if(filter){
                                        #intout - outlier based on z-score
                                        #pctout - outlier based on frequency of expression
        return(data.frame(intout=outlier,null=null, pctout=unlikely))
    } else{
        return(list(z.exprs=z.exprs, fet0=fet0, z.fet0=z.fet0, et.med=medianC, et.mad=medianDev))
    }
}



#' Average expression values for duplicated/redundant genes
#'
#' Takes an average, potentially on a different scale given by \code{fun.natural}
#' of some genes. The average is then transformed with \code{fun.cycle}.
#' @note
#' This code needs to be tested more extensively after a refactoring.  Caveat calculator.
#' @param fd \code{SingleCellAssay} or subclass 
#' @param geneGroups \code{character} naming a column in the \code{featureData} that keys the duplicates
#' @param fun.natural transformation to be used to collapse the duplicate expression values
#' @param fun.cycle transformation to be used after collapsing
#' @return averaged version of \code{fd}.
#' @importFrom SummarizedExperiment 'rowData<-'
#' @export
primerAverage <- function(fd, geneGroups, fun.natural=expavg, fun.cycle=logshift){
    fVars <- mcols(fd)[, geneGroups, drop=FALSE]
    geneset <- split(1:nrow(fVars), fVars)
    fdOrder <- order(unlist(lapply(geneset, min)))
    geneset <- geneset[fdOrder]           #maintain order of genes from original
    genefirst <- unlist(lapply(geneset, function(x) x[1]))
    skeleton <- fd[genefirst,]       #make smaller version of fd, then we'll replace exprs with the mean per geneset

    exprs.orig <- assay(fd)
    exprs.new <- lapply(geneset, function(rowidx){
        rows <- exprs.orig[rowidx,, drop=FALSE]
        if(nrow(rows)>1){
            return(fun.cycle(apply(rows, 2,fun.natural)))
        }
        return(rows)
    })
    exprs.new <- do.call(rbind, exprs.new)
    dimnames(skeleton) = dimnames(exprs.new)
    assay(skeleton) <- exprs.new
    rowData(skeleton)$primerid <- mcols(skeleton)[,geneGroups]
    # skeleton <- sort(skeleton,rows=FALSE)
    return(skeleton)
}
RGLab/MAST documentation built on Sept. 30, 2023, 1:08 p.m.