R/signature.R

Defines functions signature.prediction.1 sigpred.testing.sets sigpred.training.sets exprs.synthetic.similarity exprs.test.statistic.metric unique.helper exprs.unique common.helper exprs.common

Documented in exprs.common exprs.synthetic.similarity exprs.test.statistic.metric exprs.unique signature.prediction.1 sigpred.testing.sets sigpred.training.sets

#' Predicting cell type signature
#'
#' By default, the function predicts cell type signature by using percentage filter and fold-induction filter to refine cluster specific differentially expressed genes
#' When the marker information for some cell types is available, one can choose to predict signature using a logistic regression model integrating four metrics
#'
#' @param object A sincera object
#' @param groups The cell groups for signature prediction
#' @param use.logireg If TRUE, use logistic regression model based method to predict signature for cell groups (cell types) with marker information available;
#'                    use setCellTypeMarkers to add marker information to sincera, and use getCellTypeMarkers to check added cell type information
#' @param sigs.n A vector containing the number of signatures to be predicted for each cell group; if NULL, use genes with top 20 prediction score as the signature
#' @param diff.method Take effect when use.logireg==FALSE. The method of differentiatial expression
#' @param use.fdr Take effect when use.logireg==FALSE. If TRUE, use BH to adjust pvalues of differential expression
#' @param diff.thresh Take effect when use.logireg==FALSE. The threshold of pvalue or fdr.
#' @param avg.thresh A cell type signature gene must have at least avg.thresh expression in the defined cell type
#' @param min.expression The threshold for determining expressed genes
#' @param pct.thresh A cell type signature gene must express in at least pct.thresh percentage of the cells of the defined cell type
#' @param fc.thresh A cell type signature gene must have at least fc.thresh fold of average expression induction in its defined cell type cells when compared to its average expression in all the other cells
#' @return An updated sincera object, use getSigGenes to access the predicted signature genes'
#'
setGeneric("signature.prediction", function(object, groups=NULL, use.logireg=FALSE, sigs.n=NULL, diff.method="welch", use.fdr=FALSE, diff.thresh=0.05,
                                            avg.thresh=1, min.expression=1, pct.thresh=0.8, fc.thresh=1.5, ...) standardGeneric("signature.prediction"))
#' @export
setMethod("signature.prediction","sincera",
          function(object, groups=NULL, use.logireg=FALSE, sigs.n=NULL, diff.method="welch", use.fdr=FALSE, diff.thresh=0.05,
                   avg.thresh=1, min.expression=1, pct.thresh=0.8, fc.thresh=1.5, ...) {

            if (is.null(groups)) {
              groups <- sort(unique(getCellMeta(object, name="GROUP")))
            }

            clusters <- groups

            if (!use.logireg) { # use per cluster statistics to refine diff genes for signature prediction

                cat("Use percentage and fold_induction filters to refine differentially expressed genes for signature prediction")

                gs <- GeneStats(dp=getExpression(object), ident=getCellMeta(object, name="GROUP"),
                                groups=clusters, min.expression=min.expression, min.gsize=2, min.avg=1,
                                stats=c(diff.method, "ident.avg", "ident.pct","ident.fc"))

                if (use.fdr) {
                    sigs <- data.frame(GROUP=NULL, SYMBOL=NULL, P=NULL, FDR=NULL, AVG=NULL, PCT=NULL, FC=NULL)
                } else {
                    sigs <- data.frame(GROUP=NULL, SYMBOL=NULL, P=NULL, AVG=NULL, PCT=NULL, FC=NULL)
                }


                for (g in clusters) {
                    diff.col <- paste(diff.method, ".", g, sep="")
                    diff.fdr.col <- paste(diff.method, ".", g, ".fdr", sep="")
                    fc.col <- paste("ident.fc.", g, sep="")
                    avg.col <- paste("ident.avg.", g, sep="")
                    pct.col <- paste("ident.pct.", g, sep="")
                    sig.col <- paste("sig.", g, sep="")

                    g.fc.idx <- which(gs[, fc.col] >= fc.thresh)
                    g.avg.idx <- which(gs[, avg.col] >= avg.thresh)
                    g.pct.idx <- which(gs[, pct.col] >= pct.thresh)

                    g.sig.idx <- c()
                    g.sig.idx <- intersect(g.fc.idx, g.avg.idx)
                    g.sig.idx <- intersect(g.sig.idx, g.pct.idx)

                    if (use.fdr) {
                        gs[, diff.fdr.col] <- NA
                        gs[g.sig.idx, diff.fdr.col] <- p.adjust(as.numeric(gs[g.sig.idx, diff.col]), method="BH")
                        g.diff.idx <- which(gs[, diff.fdr.col]<diff.thresh)
                    } else {
                        g.diff.idx <- which(gs[, diff.col] < diff.thresh)
                    }

                    g.sig.idx <- intersect(g.sig.idx, g.diff.idx)

                    gs[, sig.col] <- 0
                    if (length(g.sig.idx) > 0) {
                        gs[g.sig.idx, sig.col] <- 1
                        if (use.fdr) {
                            g.sig <- data.frame(GROUP=g, SYMBOL=rownames(gs)[g.sig.idx],
                                                P=gs[g.sig.idx, diff.col],
                                                FDR=gs[g.sig.idx, diff.fdr.col],
                                                AVG=gs[g.sig.idx, avg.col],
                                                PCT=gs[g.sig.idx, pct.col],
                                                FC=gs[g.sig.idx, fc.col])
                        } else {
                            g.sig <- data.frame(GROUP=g, SYMBOL=rownames(gs)[g.sig.idx],
                                                P=gs[g.sig.idx, diff.col],
                                                AVG=gs[g.sig.idx, avg.col],
                                                PCT=gs[g.sig.idx, pct.col],
                                                FC=gs[g.sig.idx, fc.col])
                        }
                        sigs <- rbind(sigs, g.sig)
                    }

                }

                sigs <- sigs[order(sigs$GROUP, sigs$P, -sigs$FC), ]

                object <- setSigGenes(object, value=sigs)


            } else { # use logistic regression model to predict signature for clusters with markers

                es <- getES(object)
                es <- es[getGenesForClustering(object), ]

                markers <- getCellTypeMarkers(object)

                types <- c()
                if (dim(markers)[1]>0) {
                  types <- sort(unique(markers$TYPE))
                }

                celltypes <- getCellType(object)

                clusters.withMarkers <- c()

                for (i in 1:length(types)) {
                  i.type <- types[i]
                  i.cluster <- names(celltypes)[which(celltypes == i.type)]
                  i.name <- paste("use_as_marker_", i.cluster, sep="")
                  i.markers <- markers$SYMBOL[which(markers$TYPE==i.type)]
                  fData(es)[, i.name] <- 0
                  fData(es)[which(rownames(fData(es)) %in% i.markers), i.name] <- 1
                  clusters.withMarkers <- c(clusters.withMarkers, i.cluster)
                }
                clusters.withMarkers <- sort(clusters.withMarkers)

                if (length(clusters.withMarkers)>0) {

                    cat("Use logistic regression and markers to predict signature for cell group(s):", paste(clusters.withMarkers, split=" "))

                    difftest <- getDiffTest(object)
                    es <- es[rownames(difftest), ]
                    fData(es)[, colnames(difftest)] <- difftest

                    wd <- NULL

                    group.by = "GROUP"
                    groups=clusters.withMarkers
                    # training set
                    trainset.prefix = TRAINSET.PREFIX
                    train.class.prefix = TRAIN.CLASS.PREFIX
                    marker.prefix=MARKER.PREFIX
                    # testing set
                    testset.prefix=TESTSET.PREFIX
                    # common gene metric
                    common.prefix = COMMON.PREFIX
                    common.threshold=COMMON.TRESHOLD
                    common.percentage=COMMON.PERCENTAGE
                    # unique gene metric
                    unique.prefix = UNIQUE.PREFIX
                    unique.ratio=UNIQUE.RATIO
                    unique.quantile=UNIQUE.QUANTILE
                    # test statistic metric
                    test.statistic.metric.prefix = TEST.STATS.METRIC.PREFIX
                    log.base=2
                    diff.expr.prefix="welch."
                    diff.expr.threshold = 1
                    # synthetic profile similarity metric
                    syn.sim.prefix = SYN.SIM.PREFIX
                    signature.prefix = SIGNATURE.PREFIX
                    gene.symbol.label = GENE.SYMBOL.LABEL
                    verbose=TRUE
                    export=FALSE
                    export.components="fd"
                    dir.prefix=NULL

                    # calculate metrics
                    es <- exprs.test.statistic.metric(es, group.by=group.by, groups=groups, log.base=log.base, test.statistic.metric.prefix=test.statistic.metric.prefix, diff.expr.prefix=diff.expr.prefix)

                    es <- exprs.common(es, group.by=group.by, groups=groups, common.threshold=common.threshold, common.percentage=common.percentage, common.prefix=common.prefix)

                    es <- exprs.unique(es, group.by=group.by, groups=groups, unique.ratio=unique.ratio, unique.quantile=unique.quantile, unique.prefix=unique.prefix)

                    es <- exprs.synthetic.similarity(es, group.by=group.by, groups=groups, syn.sim.prefix=syn.sim.prefix)

                    # construct training sets
                    es <- sigpred.training.sets(es, group.by=group.by, groups=groups,
                                                trainset.prefix=trainset.prefix,
                                                train.class.prefix=train.class.prefix,
                                                test.statistic.metric.prefix = test.statistic.metric.prefix,
                                                common.prefix = common.prefix,
                                                unique.prefix = unique.prefix,
                                                syn.sim.prefix = syn.sim.prefix,
                                                marker.prefix = marker.prefix)

                    # construct testing sets
                    es <- sigpred.testing.sets(es, group.by=group.by, groups=groups, testset.prefix=testset.prefix, diff.expr.prefix=diff.expr.prefix, diff.expr.threshold=diff.expr.threshold)

                    # performing signature prediction
                    es <- signature.prediction.1(es, group.by=group.by, groups=groups,
                                               trainset.prefix=trainset.prefix,
                                               train.class.prefix=train.class.prefix,
                                               testset.prefix=testset.prefix,
                                               test.statistic.metric.prefix = test.statistic.metric.prefix,
                                               common.prefix = common.prefix,
                                               unique.prefix = unique.prefix,
                                               syn.sim.prefix = syn.sim.prefix,
                                               signature.prefix = signature.prefix,
                                               gene.symbol.label = gene.symbol.label,
                                               verbose=verbose,
                                               export=export,
                                               dir.prefix = wd)

                    sig.cols <- paste("sig_pred_", clusters.withMarkers, sep="")
                    if (is.null(sigs.n)) sigs.n <- rep(20, length(clusters.withMarkers))
                    sigs <- data.frame(GROUP=NULL, SYMBOL=NULL, SCORE=NULL, stringsAsFactors = FALSE)
                    for (i in 1:length(sig.cols)) {
                        tmp <- as.numeric(fData(es)[, sig.cols[i]])
                        if (sigs.n[i] > length(tmp)) sigs.n[i] <- length(tmp)
                        i.idx <- order(tmp, decreasing=T)[1:sigs.n[i]]
                        i.idx <- i.idx[which(!is.na(tmp[i.idx]))]
                        if (length(i.idx)>0) {
                            i.sigs <- data.frame(GROUP=clusters.withMarkers[i], SYMBOL=rownames(fData(es))[i.idx], SCORE=as.numeric(fData(es)[i.idx, sig.cols[i]]), stringsAsFactors = FALSE)
                            sigs <- rbind(sigs, i.sigs)
                        }
                    }

                    if (dim(sigs)[1]>0) sigs <- sigs[order(sigs$GROUP, -sigs$SCORE), ]

                    object <- setSigGenes(object, value=sigs)

                }
            }

            return(object)
          }
)



#' Validating cell type signature using random subsampling mehod
#'
#' @param object A sincera object
#' @param siggenes A data frame containing the . The first column of the data frame contains cell cluster information, while the second column contains the genes
#' @param pct The percentage of cells to be used to construct testing set
#' @param repeats The number of subsamplings
#' @return An updated sincera object, use getSigValidation to access the validation results
#'
setGeneric("signature.validation", function(object, siggenes=NULL, pct=0.2, repeats=3, ...) standardGeneric("signature.validation"))
#' @export
setMethod("signature.validation","sincera",
          function(object, siggenes=NULL, pct=0.2, repeats=3, ...) {

            if (is.null(siggenes)) {

            } else {

              cellcluster <- getCellMeta(object, name="GROUP")
              all.groups <- names(table(cellcluster))
              cluster.cells.n <- as.numeric(table(cellcluster))
              names(cluster.cells.n) <- all.groups

              sig.groups <- sort(as.character(unique(siggenes$GROUP)))
              sig.groups <- intersect(sig.groups, all.groups)

              if (length(sig.groups)==0) {
                stop("Please check cluster definition in the siggenes.")
              }

              cat("Sincera: performing repeated random subsampling validation of signature prediction for groups:", paste(sig.groups, split=" "), "\n")

              nsiggrp <- length(sig.groups)
              nallgrp <- length(all.groups)

              cluster.cells.idx <- list()
              j <- 1
              for (i in all.groups) {
                  idx.i <- which(cellcluster==i)
                  cluster.cells.idx[[j]] <- idx.i
                  j <- j+1
              }
              names(cluster.cells.idx) <- all.groups



              cells.idx <- 1:getCellNum(object)

              tr = NULL
              tr <- data.frame(TID=1:(nsiggrp*nallgrp*repeats), I=NA, J=NA, P=NA, IN=NA, JN=NA, R=NA, IC=NA, JC=NA, TA=NA, TBA=NA, TF1=NA, PA=NA, PBA=NA, PF1=NA)
              z <- 1

              ES <- getES(object)

              for (i in 1:nsiggrp) { # evaluate the signature of cluster i


                  i.sigs <- as.character(siggenes[which(siggenes$GROUP == sig.groups[i]), "SYMBOL"])

                  if (length(i.sigs)>0) {

                    i.sigs.idx <- which(rownames(fData(ES)) %in% i.sigs)

                    for (j in 1:nallgrp) { # using cluster j

                        p = pct
                        #for (p in pct) { # percentage of cells from both clusters for testing
                            for (r in 1:repeats) {

                                if (sig.groups[i] != all.groups[j]) {

                                    i.n <- max(floor(cluster.cells.n[sig.groups[i]] * p),1)
                                    j.n <- max(floor(cluster.cells.n[all.groups[j]] * p),1)
                                    i.cells.test.idx <- sample(cluster.cells.idx[[sig.groups[i]]], i.n, replace=FALSE)
                                    j.cells.test.idx <- sample(cluster.cells.idx[[all.groups[j]]], j.n, replace=FALSE)


                                    i.trainset <- data.frame(t(Biobase::exprs(ES)[i.sigs.idx, setdiff(cluster.cells.idx[[sig.groups[i]]], i.cells.test.idx)]), CLUSTER=paste("C", sig.groups[i], sep=""))
                                    j.trainset <- data.frame(t(Biobase::exprs(ES)[i.sigs.idx, setdiff(cluster.cells.idx[[all.groups[j]]], j.cells.test.idx)]), CLUSTER=paste("C", all.groups[j], sep=""))

                                    i.testset <- data.frame(t(Biobase::exprs(ES)[i.sigs.idx, i.cells.test.idx]), CLUSTER=paste("C", sig.groups[i], sep=""))
                                    j.testset <- data.frame(t(Biobase::exprs(ES)[i.sigs.idx, j.cells.test.idx]), CLUSTER=paste("C", all.groups[j], sep=""))

                                    trainset = NULL
                                    testset = NULL
                                    trainset <- rbind(i.trainset, j.trainset)
                                    testset <- rbind(i.testset, j.testset)

                                    category.idx <- dim(trainset)[2]

                                    library(e1071)

                                    mm = NULL
                                    mm <- svm(CLUSTER ~ ., data=trainset)

                                    prediction = NULL
                                    prediction <- predict(mm, trainset[,-category.idx])

                                    tp = NA
                                    fn = NA
                                    tn = NA
                                    fp = NA
                                    ta = NA
                                    tba = NA
                                    tf1 = NA

                                    # training accuracy
                                    tab = NULL
                                    tab <- table(pred = prediction, true = trainset[,category.idx])
                                    tp <- tab[1,1]
                                    fn <- tab[2,1]
                                    fp <- tab[1,2]
                                    tn <- tab[2,2]
                                    ta <- (tp + tn)/(tp+fp+fn+tn)
                                    tba <- (tp/(tp+fn) + tn/(tn+fp))/2
                                    tf1 <- 2*tp / (2*tp + fp + fn)

                                    tp = NA
                                    fn = NA
                                    tn = NA
                                    fp = NA
                                    pa = NA
                                    pba = NA
                                    pf1 = NA

                                    # prediction accuracy
                                    prediction = NULL
                                    prediction <- predict(mm, testset[, -category.idx])

                                    tab = NULL
                                    tab <- table(pred = prediction, true = testset[,category.idx])
                                    tp <- tab[1,1]
                                    fn <- tab[2,1]
                                    fp <- tab[1,2]
                                    tn <- tab[2,2]
                                    pa <- (tp + tn)/(tp+fp+fn+tn)
                                    pba <- (tp/(tp+fn) + tn/(tn+fp))/2
                                    pf1 <- 2*tp / (2*tp + fp + fn)

                                    tr$I[z] <- sig.groups[i]
                                    tr$J[z] <- all.groups[j]
                                    tr$P[z] <- p
                                    tr$IN[z] <- i.n
                                    tr$JN[z] <- j.n
                                    tr$R[z] <- r
                                    tr$IC[z] <- paste(as.character(rownames(pData(ES))[i.cells.test.idx]), collapse=", ")
                                    tr$JC[z] <- paste(as.character(rownames(pData(ES))[j.cells.test.idx]), collapse=", ")
                                    tr$TA[z] <- ta
                                    tr$TBA[z] <- tba
                                    tr$TF1[z] <- tf1
                                    tr$PA[z] <- pa
                                    tr$PBA[z] <- pba
                                    tr$PF1[z] <- pf1


                                    z <- z+1

                                }
                           # }
                        }
                    }
                  }

                  # print(i)
              }

              tr <- tr[which(!is.na(tr$I)),]


              #write.table(tr, file=paste(wd, "cross-validation-log.txt", sep=""), sep="\t", col.names=T, row.names=F)

              tr.mean <- aggregate(tr[,"PA"], list(tr$I, tr$J, tr$P), mean)
              tr.sd <- aggregate(tr[,"PA"], list(tr$I, tr$J, tr$P), sd)

              colnames(tr.mean) <- c("I","J","P", "PA.mean")
              colnames(tr.sd) <- c("I","J","P", "PA.sd")

              tr.mean <- tr.mean[order(tr.mean$I),]
              tr.sd <- tr.sd[order(tr.sd$I),]

              tr.summary <- cbind(tr.mean, PA.sd=tr.sd$PA.sd, PA.se=tr.sd$PA.sd/sqrt(repeats))

              #
              tr.summary.2 <-tr.summary
              colnames(tr.summary.2) <- c("cluster.i","cluster.j","sampling.p","accuracy.mean","accuracy.sd", "accuracy.se")
              #write.table(tr.summary.2, file=paste(wd, "cross-validation-summary.txt", sep=""), sep="\t", col.names=T, row.names=F)

              np <- nsiggrp
              gs <- list()

              for (i in 1:np) {

                  tr.i <- tr.summary[which(tr.summary$I == sig.groups[i]),]
                  tr.i <- tr.i[which(!(tr.i$I==tr.i$J)),]
                  tr.i$Group <- paste(tr.i$J, sep="")

                  g <- ggplot(tr.i, aes(x=Group, y=PA.mean)) +
                      geom_bar(position=position_dodge(), stat="identity", width=.6) +
                      geom_errorbar(aes(ymin=PA.mean-PA.se, ymax=PA.mean+PA.se), width=.2) +
                      ylab("Accuracy") +
                      ggtitle(paste("Classificiation accurary of predicted signature of cell group ", sig.groups[i], " (", (1-pct)*100, "% training; ", repeats, " repeats)", sep="")) +
                      sincera_theme()
                  gs[[i]] <- g

                  print(g)
                  pause()

              }

              #tiff(file=paste(wd, "accuracy.tiff",sep=""), width=4, height=2*np, unit="in", res=150, pointsize=2, compression="lzw")
              #multiplot(plotlist=gs, cols=1)
              #dev.off()



              cat("Sincera: done\n")



              return(object)
            }
        }

)




##########################################################################################
# The following code is from SINCERA version a10242015. It will be upgraded soon.
##########################################################################################

if (FALSE) {
  dir.delim <- "/"
  TIMESTAMP.FORMAT <- '%m%d%Y_%Hh%Mm%Ss'
  TF.LABEL = "TF"										      # for labeling the column encoding the transcription factors/cofactors
}

GENE.SYMBOL.LABEL = "SYMBOL"					    # for labeling the column encoding the official gene symbol
SAMPLE.LABEL = "SAMPLE"                   # for labeling the column encoding the sample information
CLUSTER.LABEL = "CLUSTER"  					   		# for labeling the column encoding the results of cluster assignment
  

COMMON.TRESHOLD=5                         # in common gene metric, expression > COMMON.TRESHOLD will considered as expressed
COMMON.PERCENTAGE=0.8                     # in common gene metric, genes that express in >COMMON.PRECENTAGE cluster cells will be considered as a common gene shared by cluster cells.
UNIQUE.RATIO=2                            # parameters for unique gene metric
UNIQUE.QUANTILE=0.85								      # parameters for unique gene metric


EXPR.SPECIFICITY.PREFIX <- "specificity_"      		# for labeling the columns encoding the results of expression specificity calculation for each group
EXPR.ABUNDANCY.PREFIX <- "abundancy_"          		# for labeling the columns encoding the results of expression abundance calculation
PREFILTERING.PREFIX <- "use_for_analysis"	   		# for labeling the column encoding the result of prefiltering


DIFF.EXPR.PREFIX = "test_"                     		# for labeling the columns encoding the results of differential expression test for each group
CELLTYPE.ENRICHMENT.PREFIX = "use_for_celltype_"	# for labeling the columns encoding the gene list used for cell type enrichment analysis for each group
MARKER.PREFIX = "use_as_marker_"					# for labeling the columns encoding the biomarkers for each group


COMMON.PREFIX="common_"								# for labeling the columns encoding the results of common metric for each group
UNIQUE.PREFIX="unique_"								# for labeling the columns encoding the results of unique metric for each group
TEST.STATS.METRIC.PREFIX = "test_statistic_"		# for labeling the columns encoding the results of test statistic metric for each group
SYN.SIM.PREFIX="synthetic_similiary_"				# for labeling the columns encoding the results of synthetic profile similarity metric for each group
TRAINSET.PREFIX="use_for_train_"					# for labeling the columns encoding the training instances for each group
TRAIN.CLASS.PREFIX="train_class_"					# for labeling the columns encoding the class of group specific training instances
TESTSET.PREFIX="use_for_test_"						# for labeling the columns encoding the testing instances for each group
SIGNATURE.PREFIX="sig_pred_"						# for labeling the columns encoding the results of signature prediction



#######################################################
#                 Signature Prediction                #
#######################################################





#' Identifying group specific common genes
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param common.prefix (character) prefix for labeling the columns encoding the results of common metric for each group
#' @param common.threshold (numeric) in common gene metric, expression > COMMON.TRESHOLD will considered as expressed
#' @param common.percentage (numeric) in common gene metric, genes that express in >COMMON.PRECENTAGE cluster cells will be considered as a common
#' @return an ExpressionSet object containing the results of common gene metrics in the attributes of fData
#'
exprs.common <- function(ES, group.by=CLUSTER.LABEL, groups=NULL, common.threshold=5, common.percentage=0.8, common.prefix=COMMON.PREFIX) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  for (i in groups) {
    i.cells <- rownames(subset(pData(ES), pData(ES)[, group.by] %in% i))
    i.cells.idx <- which(colnames(Biobase::exprs(ES)) %in% i.cells)
    i.common <- apply(Biobase::exprs(ES), 1, function(y) common.helper(y, i.cells.idx, common.threshold, common.percentage))
    i.name <- paste(common.prefix, i, sep="")
    fData(ES)[,i.name]<-i.common
  }
  return(ES)
}
common.helper <- function(a, idx.i, common.threshold=5, common.percentage=0.8) {
  a <- as.numeric(a)
  a_common_threshold <- ceiling(length(idx.i)*common.percentage)
  a_common <- 0
  if (length(which(a[idx.i] >= common.threshold)) >= a_common_threshold) {
    a_common <- 1
  }
  return(a_common)
}


#' Identifying group specific unique genes
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param unique.prefix (character) prefix for labeling the columns encoding the results of unique metric for each group
#' @param unique.ratio (numeric) parameter for unique gene metric
#' @param unique.quantile (numeric) parameters for unique gene metric
#' @return an ExpressionSet object containing the results of unique gene metric in the attributes of fData
#'
exprs.unique <- function(ES, group.by=CLUSTER.LABEL, groups=NULL, unique.ratio=2, unique.quantile=0.85, unique.prefix=UNIQUE.PREFIX) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  cells <- rownames(pData(ES))
  for (i in groups) {
    i.cells <- rownames(subset(pData(ES), pData(ES)[, group.by] %in% i))
    i.cells.o <- setdiff(cells, i.cells)
    i.cells.idx <- which(colnames(Biobase::exprs(ES)) %in% i.cells)
    i.cells.o.idx <- which(colnames(Biobase::exprs(ES)) %in% i.cells.o)
    i.unique <- apply(Biobase::exprs(ES), 1, function(y) unique.helper(y, i.cells.idx, i.cells.o.idx, unique.ratio, unique.quantile))
    i.name <- paste(unique.prefix, i, sep="")
    fData(ES)[,i.name]<-i.unique
  }
  return(ES)
}
unique.helper <- function(a, idx.i, idx.o, unique.ratio=2, unique.quantile=0.85) {
  a <- as.numeric(a)
  a_mu <- mean(a[idx.i])
  a_qn <- quantile(a[idx.o], unique.quantile)[[1]]
  a_unique <- 0
  
  if (a_mu/a_qn >= unique.ratio) {
    a_unique <- 1
  }
  
  return(a_unique)
}


#' Normalized and smoothened results of group specific differential expression test
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param log.base (numeric) the base of logarithm
#' @param test.statistic.metric.prefix (character) prefix for labeling the columns encoding the results of test statistic metric for each group
#' @param diff.expr.prefix (character) prefix for labeling the columns encoding the results of differential test
#' @return an ExpressionSet object containing the results of test statistic metrics in the attributes of fData
#'
exprs.test.statistic.metric <- function(ES, group.by=CLUSTER.LABEL, groups=NULL, log.base=2, test.statistic.metric.prefix=TEST.STATS.METRIC.PREFIX, diff.expr.prefix=DIFF.EXPR.PREFIX) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  #cells <- rownames(pData(ES))
  for (i in groups) {
    i.name <- paste(test.statistic.metric.prefix, i, sep="")
    i.diffexpr.test.name <- paste(diff.expr.prefix, i, sep="")
    
    fData(ES)[,i.name] <- fData(ES)[, i.diffexpr.test.name]
    fData(ES)[,i.name] <- -log(fData(ES)[,i.name], log.base)
    i.min <- min(fData(ES)[,i.name])
    i.max <- max(fData(ES)[,i.name])
    fData(ES)[,i.name] <- fData(ES)[,i.name]/i.max
  }
  return(ES)
}


#' Calculating the similarity between each gene profile and group specific synthetic reference expression profile
#' synthetic profile is created by setting fpkm(cluster cells)=1, fpkm(non cluster cells)=0
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param syn.sim.prefix (character) prefix for labeling the columns encoding the results of synthetic profile similarity metric for each group
#' @return an ExpressionSet object containing the results of synthetic profile similarity metrics in the attributes of fData
#'
exprs.synthetic.similarity <- function(ES, group.by=CLUSTER.LABEL, groups=NULL, syn.sim.prefix=SYN.SIM.PREFIX) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  cells <- rownames(pData(ES))
  for (i in groups) {
    i.cells <- rownames(subset(pData(ES), pData(ES)[, group.by] %in% i))
    i.cells.o <- setdiff(cells, i.cells)
    i.cells.idx <- which(colnames(Biobase::exprs(ES)) %in% i.cells)
    i.cells.o.idx <- which(colnames(Biobase::exprs(ES)) %in% i.cells.o)
    i.synthetic.profile <-  rep(0, dim(Biobase::exprs(ES))[2])
    i.synthetic.profile[i.cells.idx] <- 1
    i.ss <- apply(Biobase::exprs(ES), 1, function(y) (1+cor(i.synthetic.profile, as.numeric(y)))/2)
    i.name <- paste(syn.sim.prefix, i, sep="")
    fData(ES)[,i.name]<-i.ss
  }
  return(ES)
}

#' Constructing group specific training sets for signature prediction
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param trainset.prefix (character) prefix for labeling the columns encoding the training instances for each group
#' @param train.class.prefix (character) prefix for labeling the columns encoding the class of group specific training instances
#' @param marker.prefix (character) prefix for labeling the columns encoding the biomarkers for each group
#' @param common.prefix (character) prefix for labeling the columns encoding the results of common metric for each group
#' @param unique.prefix (character) prefix for labeling the columns encoding the results of unique metric for each group
#' @param test.statistic.metric.prefix (character) prefix for labeling the columns encoding the results of test statistic metric for each group
#' @param syn.sim.prefix (character) prefix for labeling the columns encoding the results of synthetic profile similarity metric for each group
#' @return an ExpressionSet object containing the constructed training sets in the attributes of fData
#'
sigpred.training.sets <- function(ES, group.by=CLUSTER.LABEL, groups=NULL,
                                  trainset.prefix=TRAINSET.PREFIX,
                                  train.class.prefix=TRAIN.CLASS.PREFIX,
                                  test.statistic.metric.prefix = TEST.STATS.METRIC.PREFIX,
                                  common.prefix = COMMON.PREFIX,
                                  unique.prefix = UNIQUE.PREFIX,
                                  syn.sim.prefix = SYN.SIM.PREFIX,
                                  marker.prefix = MARKER.PREFIX
) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  
  for (i in groups) {
    i.train.name <- paste(trainset.prefix, i, sep="")
    fData(ES)[,i.train.name] <- 0
    i.train.class <- paste(train.class.prefix, i, sep="")
    fData(ES)[,i.train.class] <- NA
    
    i.p.candidates <- c()
    i.n.candidates <- c()
    
    i.test.name <- paste(test.statistic.metric.prefix, i, sep="")
    i.common.name <- paste(common.prefix, i, sep="")
    i.unique.name <- paste(unique.prefix, i, sep="")
    i.ss.name <- paste(syn.sim.prefix, i, sep="")
    
    fd <- fData(ES)
    fd <- fd[order(fd[,i.test.name]),]
    
    i.marker.name <- paste(marker.prefix, i, sep="")
    #i.p.symbols <- markers$SYMBOL[which(markers$CLUSTER %in% i)]
    i.p.candidates <- rownames(fd)[which(fd[, i.marker.name] == 1)]
    
    lambda_n =length(i.p.candidates)
    
    fd <- fData(ES)
    #fd <- fd[order(fd[,i.test.name], decreasing=T),]
    fd <- fd[order(fd[,i.test.name], decreasing=F),]
    fd <- subset(fd, fd[,i.common.name]==0 & fd[,i.unique.name]==0)
    i.n.candidates <- rownames(fd)[1:lambda_n]
    
    if (lambda_n > 0) {
      
      i.p.idx <- which(rownames(fData(ES)) %in% i.p.candidates)
      i.n.idx <- which(rownames(fData(ES)) %in% i.n.candidates)
      
      fData(ES)[c(i.p.idx, i.n.idx), i.train.name] <- 1
      fData(ES)[i.p.idx, i.train.class] <- 1
      fData(ES)[i.n.idx, i.train.class] <- 0
    }
  }
  return(ES)
}


#' Constructing group specific testing sets for signature prediction
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param testset.prefix (character) prefix for labeling the columns encoding the testing instances for each group
#' @param diff.expr.prefix (character) prefix for labeling the columns encoding the results of differential test
#' @param diff.expr.threshold (numeric) genes with differential expression p-value<0.05 will be selected as candidates for signature
#' @return an ExpressionSet object containing the constructed testing sets in the attributes of fData
#'
sigpred.testing.sets <- function(ES, group.by=CLUSTER.LABEL,  groups=NULL, testset.prefix=TESTSET.PREFIX, diff.expr.prefix=DIFF.EXPR.PREFIX, diff.expr.threshold=0.05) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  for (i in groups) {
    i.name <- paste(testset.prefix, i, sep="")
    fData(ES)[,i.name] <- 0
    i.test.name <- paste(diff.expr.prefix, i, sep="")
    i.test.idx <- which(fData(ES)[,i.test.name] < diff.expr.threshold)
    if (length(i.test.idx) > 0) {
      fData(ES)[i.test.idx, i.name] <- 1
    }
  }
  return(ES)
}



#' logistic regression based signature prediction and cross cluster validation
#' Cell type specific signature prediction
#'
#' @param ES (ExpressionSet) an ExpressionSet object containing the single cell RNA-seq data
#' @param group.by (character) the name of the column that contains the cluster information
#' @param groups (character) the clusters for signature prediction
#' @param trainset.prefix (character) prefix for labeling the columns encoding the training instances for each group
#' @param train.class.prefix (character) prefix for labeling the columns encoding the class of group specific training instances
#' @param testset.prefix (character) prefix for labeling the columns encoding the testing instances for each group
#' @param common.prefix (character) prefix for labeling the columns encoding the results of common metric for each group
#' @param unique.prefix (character) prefix for labeling the columns encoding the results of unique metric for each group
#' @param test.statistic.metric.prefix (character) prefix for labeling the columns encoding the results of test statistic metric for each group
#' @param syn.sim.prefix (character) prefix for labeling the columns encoding the results of synthetic profile similarity metric for each group
#' @param signature.prefix (character) prefix for labeling the columns encoding the results of signature prediction
#' @param gene.symbol.label (character) the name of the column encoding gene symbols
#' @param verbose (logical)
#' @param export (logical) wheterh to export the ExpressionSet
#' @param export.components (character) the components of an ExpressionSet object that will be exported
#' @return an ExpressionSet object containing the results of signature prediction in the attributes of fData
#'
signature.prediction.1 <- function(ES, group.by="CLUSTER", groups=NULL,
                                     trainset.prefix=TRAINSET.PREFIX,
                                     train.class.prefix=TRAIN.CLASS.PREFIX,
                                     testset.prefix=TESTSET.PREFIX,
                                     test.statistic.metric.prefix = TEST.STATS.METRIC.PREFIX,
                                     common.prefix = COMMON.PREFIX,
                                     unique.prefix = UNIQUE.PREFIX,
                                     syn.sim.prefix = SYN.SIM.PREFIX,
                                     signature.prefix = SIGNATURE.PREFIX,
                                     gene.symbol.label = GENE.SYMBOL.LABEL,
                                     verbose=TRUE,
                                     export=TRUE,
                                     dir.prefix = ""
) {
  if (is.null(groups)) {
    groups <- sort(unique(pData(ES)[,group.by]))
  }
  predictions <- list()
  for (i in groups) {
    
    i.train.name <- paste(trainset.prefix, i, sep="")
    
    i.test.name <- paste(test.statistic.metric.prefix, i, sep="")
    i.common.name <- paste(common.prefix, i, sep="")
    i.unique.name <- paste(unique.prefix, i, sep="")
    i.ss.name <- paste(syn.sim.prefix, i, sep="")
    i.train.class <- paste(train.class.prefix, i, sep="")
    
    # train
    i.train.data <- subset(fData(ES)[, c(i.common.name, i.unique.name, i.test.name, i.ss.name, i.train.class)], fData(ES)[, i.train.name] == 1)
    
    if (verbose == T) {
      cat("\tConstructing model for cluster", i,":")
    }
    
    formula_for_train <- paste(i.train.class, "~", i.test.name, "+", i.ss.name, sep="")
    
    i.train.p <- i.train.data[which(i.train.data[,i.train.class]==1),]
    i.train.p.sd <- apply(as.matrix(i.train.p), 2, sd)
    
    # removing dominant factors
    
    factors <- c(i.common.name, i.unique.name, i.test.name, i.ss.name)
    
    if (i.train.p.sd[i.common.name] == 0) {
      if (verbose) {
        cat("ignore common gene metric ")
      }
    } else {
      formula_for_train <- paste(formula_for_train, "+", i.common.name, sep="")
    }
    
    if (i.train.p.sd[i.unique.name] == 0) {
      if(verbose) {
        cat(" ignore unique gene metric ")
      }
    } else {
      formula_for_train <- paste(formula_for_train, "+", i.unique.name, sep="")
    }
    
    i.glm <- glm(as.formula(formula_for_train),
                 data=i.train.data,
                 family = "binomial")
    if (verbose) {
      cat(" done\n")
      cat("\tpredicting signature genes for group", i, " ")
    }
    
    
    # prediction
    i.testing.name <- paste(testset.prefix, i, sep="")
    i.test.data <- subset(fData(ES)[, c(gene.symbol.label, i.common.name, i.unique.name, i.test.name, i.ss.name)], fData(ES)[,i.testing.name]==1)
    i.prediction <- predict(i.glm, i.test.data)
    i.pred.name <- paste(signature.prefix, i, sep="")
    #predictions[i.pred.name] <- data.frame(GENE=i.test.data$SYMBOL, P=as.numeric(i.prediction))
    
    i.prediction <- as.numeric(i.prediction)
    i.prediction <- (i.prediction-min(i.prediction))/(max(i.prediction)-min(i.prediction))
    
    fData(ES)[,i.pred.name] <- NA
    fData(ES)[rownames(i.test.data), i.pred.name] <- i.prediction
    if (export) {
      write.table(data.frame(GENE=rownames(i.test.data), SYMBOL=i.test.data[,gene.symbol.label], PREDICTION=as.numeric(i.prediction)), file=paste(dir.prefix , "signature_prediction_", i, ".txt", sep=""), sep="\t", col.names=T, row.names=F) #%%
    }
    if (verbose) {
      cat("done\n")
    }
  }
  
  #print(glms)
  return(ES)
}
xu-lab/SINCERA documentation built on Feb. 3, 2021, 4:19 a.m.