CellScore: Tool for Evaluation of Cell Identity from Transcription Profiles

Documented in CellScore

# CellScore.R

# From CS_v1.4.R

#' CellScore evaluates the identity of cells undergoing cell type transition
#'
#' This function will calculate the CellScore (summary score) for a cell that
#' is undergoing a transition in cell identity from a starting cell type to a
#' target cell type. `transitions` is mandatory, and either `data` or all three of
#' `scores.onoff`, `scores.cosine` and `pdata` are as well. If you provide `data`,
#' then `scores.onoff`, `scores.cosine` and `pdata` will override the respective
#' calculations only.
#'
#' @param transitions a data frame containing three columns, one for the
#'   start (donor) test and target cell type. Each row of the data.
#'   frame describes one transition from the start to a target cell type.
#' @param data a SummarizedExperiment or ExpressionSet containing data matrices of normalized
#'   expression data, present/absent calls, a gene annotation data frame and a
#'   phenotype data frame.
#' @param scores.onoff a data.frame of OnOff Scores for all samples in the
#'   expression matrix as generated by the function OnOff().
#' @param scores.cosine a numeric matrix of cosine similarity between
#'    general groups, subgroups and individual samples as calculated by the
#'    function CosineSimScore().
#' @param pdata a data frame with samples as rows, variables as columns.
#' @return The function returns a data frame with 29 columns and M*N rows,
#'   where M is the number of unqiue start and target cell types pairs listed in
#'   the cell.change argument, while N is the number of all samples in the
#'   input dataset eset. The columns include sample phenotype features and all
#'   score (components), including the on/off score, cosine similarity and
#'   CellScore.
#' @keywords cellscore cosine similarity
#' @seealso \code{\link[CellScore]{CosineSimScore}, \link[CellScore]{OnOff}} for
#'   details on specfic score calculations, and
#'   \code{\link[hgu133plus2CellScore]{hgu133plus2CellScore}} for details on the
#'   specific expressionSet object that shoud be provided as an input.
#' @export
#' @import SummarizedExperiment
#' @importClassesFrom Biobase ExpressionSet
#' @importMethodsFrom Biobase pData
#' @examples
#' ## Load the expression set for the standard cell types
#' library(Biobase)
#' library(hgu133plus2CellScore) # eset.std
#'
#' ## Locate the external data files in the CellScore package
#' rdata.path <- system.file("extdata", "eset48.RData", package = "CellScore")
#' tsvdata.path <- system.file("extdata", "cell_change_test.tsv",
#'                              package = "CellScore")
#'
#' if (file.exists(rdata.path) && file.exists(tsvdata.path)) {
#'
#'    ## Load the expression set with normalized expressions of 48 test samples
#'    load(rdata.path)
#'
#'    ## Import the cell change info for the loaded test samples
#'    cell.change <- read.delim(file= tsvdata.path, sep="\t",
#'                              header=TRUE, stringsAsFactors=FALSE)
#'
#'    ## Combine the standards and the test data
#'    eset <- combine(eset.std, eset48)
#'
#'    ## Generate cosine similarity for the combined data
#'    ## NOTE: May take 1-2 minutes on the full eset object
#'    ## so we subset it for 4 cell types
#'    pdata <- pData(eset)
#'    sel.samples <- pdata$general_cell_type %in% c("ESC", "EC", "FIB", "KER")
#'    eset.sub <- eset[, sel.samples]
#'    cs <- CosineSimScore(eset.sub, cell.change, iqr.cutoff=0.1)
#'
#'    ## Generate the on/off scores for the combined data
#'    individ.OnOff <- OnOff(eset.sub, cell.change, out.put="individual")
#'
#'    ## Generate the CellScore values for all samples
#'    cellscore <- CellScore(cell.change, data = eset.sub, scores.onoff = individ.OnOff$scores,
#'                           scores.cosine = cs$cosine.samples)
#' }

CellScore <- function(transitions, data = NULL, scores.onoff = NULL, scores.cosine = NULL, pdata = NULL) {
    ## PSEUDOCODE
    ## Start with individ.OnOff and add cosine similarities as columns to it.
    ## o transpose the cs$cosine.samples so samples are in rows
    ## Does the subgroup have to be in the row name? this is bothersome.
    ##  This is a remnant of CellScore_Cosine_V3.R, filtered ynorm consolidated
    ##  matrix.
    ##  o get rid of the subgroup so that the column names can be used
    ##    later in the process without manipulation.

    ############################################################################
    ## PART 00. Check function arguments
    ############################################################################
    fun.main <- as.character(match.call()[[1]])
    .stopIfNotDataFrame(transitions, "transitions", fun.main)
    # Check that we have at least the minimal set of function arguments
    if (!is.null(data)) {
      summarized_experiment <- .stopIfCantCoerceToSummarizedExperiment(data, "data", fun.main)
      if (is.null(scores.onoff)) {
        # Do OnOff
        scores.onoff <- OnOff(summarized_experiment, transitions, out.put="individual")$scores
      }
      if (is.null(scores.cosine)) {
        # Do CosineSimScore
        scores.cosine <- CosineSimScore(summarized_experiment, transitions, iqr.cutoff=0.1)$cosine.samples
      }
      if (is.null(pdata)) {
        pdata <- colData(summarized_experiment)
      }
    } else {
      if (is.null(scores.onoff) | is.null(scores.cosine) | is.null(pdata)) {
        stop("Missing required arguments to CellScore; either data or all of scores.onoff, scores.cosine, and pdata arguments must not be NULL")
      }
    }
    .stopIfNotDataFrame(scores.onoff, "scores.onoff", fun.main)
    .stopIfNotSymetricMatrix0to1(scores.cosine, "scores.cosine", fun.main)

    ############################################################################
    ## PART 0. Filter samples according to the phenotype
    ############################################################################
    ##  o get rid of samples with is.na(category)
    ##  o there may be samples with NA categories as an easy way to exclude
    ##    them from analysis without changing the composition of the input every
    ##    time
    ##  o also exclude any rows with NA values in "general_cell_type":
    ##    this should not be NA

    sel <- !is.na(pdata$category) & !is.na(pdata$general_cell_type)

    ## DO 'major group' comparisons only
    sel.start <- transitions$start %in% pdata[sel, "general_cell_type"]
    sel.target <- transitions$target %in% pdata[sel, "general_cell_type"]
    sel.unique <- !duplicated(paste(transitions$start,
                                    transitions$target,
                                    sep="_"))
    score.comparisons <- transitions[sel.start & sel.target & sel.unique, ]
    rownames(score.comparisons) <- seq_len(nrow(score.comparisons))

    ############################################################################
    ## PART I. Merge metrics into a nice big table
    ############################################################################
    ##  A. Rearrange tables
    ##   1. standard types and test samples
    ##   a. standards
    target.group <- unique(unlist(score.comparisons[, -2]))
    ##    b. ALL samples with valid labels
    test.samples <- rownames(pdata[sel, ])

    ##   2. extract the cosine similarity for all GSM samples (rows) wrt
    ##   each target group (columns)
    ind.row <- match(test.samples, rownames(scores.cosine))
    cols <- colnames(scores.cosine)
    temp.list <- lapply(target.group,
                        function(group){
                            ## select the column of the target group AND
                            ## the test samples in rows
                            scores.cosine[ind.row, match(group, cols )]
                        })
    temp.cosine <- do.call("cbind",
                           c(data.frame(test.samples, stringsAsFactors=FALSE),
                             temp.list))
    colnames(temp.cosine) <- c("test.samples", paste0("cosine.", target.group))

    ##  B. Merge cosine scores to on/off scores
    ##  Use the composite id ("test" and "test.samples") for merging; We can
    ##  have many rows per sample, because each sample may have scores in more
    ##  than one transition
    big.table <- merge(scores.onoff, temp.cosine, by.x="test",
                       by.y="test.samples", all.x=TRUE)
    ## Rename "test" column to "composite.ID"
    colnames(big.table)[which(colnames(big.table) == "test")]  <- "composite.ID"
    ## final size of table has to be number of rows in scores.onoff
    stopifnot(nrow(big.table) == nrow(scores.onoff))

    ############################################################################
    ##   PART II. CellScore
    ############################################################################
    ##  A. THE CellScore = target-like - donor-like
    ##   o target-like= fraction of target genes + cosine.sim compared to target
    ##   o donor-like = fraction of donor genes + cosine.sim compared to donor
    ## Loop through each row in big.table
    ##   o actually there are no unique rownames in this case;
    ##     just make sure the row order is preserved
    ##   o a truly unique rowname could be derived from sample_id, start,
    ##     and target (for transition).
    mat <- matrix(NA, nrow=nrow(big.table), ncol=7,
                         dimnames = list(c(1:nrow(big.table)),
                                         c("fraction.target", "cosine.target",
                                           "fraction.donor", "cosine.donor",
                                           "target.like", "donor.like",
                                           "CellScore")))
    for (id in seq_len(nrow(big.table))) {
        for (cell in c("target", "donor")){
            col.fr <- paste0("fraction.", cell)
            col.cos <- paste0("cosine.", cell)
            mat[id, col.fr] <- .fractionScore(cell, big.table[id, ])
            mat[id, col.cos] <- .cosineScore(cell, big.table[id, ])
            mat[id, paste0(cell, ".like")] <- mat[id, col.fr] + mat[id, col.cos]
        }
        mat[id, "CellScore"] <- mat[id, "target.like"] - mat[id, "donor.like"]
    }

    ## B. Final output
    selected <- c("composite.ID","experiment_id","sample_id","platform_id",
                   "cell_type","disease_status","category",
                   "general_cell_type","donor_tissue","sub_cell_type1",
                   "transition_induction_method","donor_cell_body_location",
                   "start","target","markers.start","markers.target",
                   "start.mkrs.in.test", "target.mkrs.in.test",
                   "loss.start.mkrs","gain.target.mkrs", "OnOffScore")
    data.frame(big.table[, intersect(selected, colnames(big.table))], mat,
               index=c(1:nrow(big.table)), stringsAsFactors=FALSE)
}

## fractionScore
##
## Local function that calculates the onoffscore

.fractionScore <- function(type, instance){
    switch(type,
           donor = 1 - instance[, "loss.start.mkrs"],
           target = instance[, "gain.target.mkrs"])
}

## cosineScore
##
## Local function that gets the cosine similarity score
.cosineScore <- function(type, instance){
    x = instance[, paste0("cosine.",
                          instance[, ifelse(type == "donor", "start", type)])]
    as.numeric(as.character(x))
}