R/estimate.R

Defines functions daseq diffAUC aggregateAUCbyCondition calculateAUC getRankings getControlSet getTargetSet faseq faseqInfer faseqFit fat faseqCount drseq drseqFilter drseqFit drseqCount prepDrseqAnno

Documented in aggregateAUCbyCondition calculateAUC daseq diffAUC drseq drseqCount drseqFilter drseqFit faseq faseqCount faseqFit faseqInfer fat getControlSet getRankings getTargetSet prepDrseqAnno

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ------ drseq ------
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## SURF Analysis Module 1

#' Prepare DrSeq annotation files
#'
#' To use the `useMetaFeatures` functionality of [Rsubread::featureCounts], we 
#' need the GTF input. This function helps to produce the *two* annotation files 
#' ("event" and "exon") needed by [drseqCount].
#'
#' @param anno_event a `surf` object
#' @param anno.prefix `character`, prefix of exon/event annotation files for 
#'     saving. These files are needed by [Rsubread::featureCounts].
#' @param anno.format `character`, e.g. "gtf", as accepted by 
#'     [rtracklayer::export].
#' @param remove.overlap.exon `logical`, remove overlapping exons across genes 
#'     (default to `FALSE`).
#' @param cores `integer`, number of available workers, sent to `nthreads` of 
#'     [Rsubread::featureCounts].
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @return `NULL`, the function is a procedure and only output/export results to 
#'     file system, except for messages and warnings.
#' @export
prepDrseqAnno = function(anno_event,
                         anno.prefix = "annotation.drseq",
                         anno.format = "gff2",
                         remove.overlap.exon = FALSE,
                         cores = max(1, detectCores()-2),
                         verbose = TRUE) {
  ## reconstruct genome annotation from @genePartsList
  ## with $type and $gene_id attributes
  isoPL = anno_event@genePartsList
  if (verbose) cat("Recognizing", nrow(isoPL), "gene parts list...\n")
  registerDoParallel(cores)
  anno <- foreach (segment = isoPL$segment, label = isoPL$label) %dopar% {
    gene <- range(segment[!!label])
    gene$type = "gene"
    ## range() and reduce() should be equivalent here
    mergedSegment <- c_granges(range(S4Vectors::split(segment, label)),
                               use.names = FALSE,
                               save.names = "exon_label")
    exon <- mergedSegment[mergedSegment$exon_label != "0"]
    exon$type = "exon"
    c(gene, exon)
  }
  stopImplicitCluster()
  anno <- c_granges(setNames(anno, isoPL$gene_id),
                    use.names = FALSE, save.names = "gene_id")
  
  ## ---- "exon" flattened file
  output.exon <- paste0(anno.prefix, '.exon.', anno.format)
  if (verbose) cat("Outputing exon flattened file to", output.exon, "...\n")
  
  ## merge exons by genes
  exon = anno[anno$type == 'exon']
  exon = GRangesList(S4Vectors::split(exon, exon$gene_id))
  exon = unlist(GenomicRanges::reduce(exon))
  ## remove overlapping exons across genes (optional)
  if (remove.overlap.exon) {
    exon = exon[countOverlaps(exon, exon) < 2]
  }
  
  ## exonic_part
  exon$source = factor('drseq')
  exon$type = factor('exonic_part')
  exon$gene_id = names(exon)
  num = unlist(lapply(rle(exon$gene_id)$lengths, seq_len))
  exon$exonic_part_number = stringr::str_pad(num, 3, pad = "0")
  exon$exonic_part_id = paste(exon$gene_id, exon$exonic_part_number, sep = ":")
  
  ## aggregate_gene
  gene = anno[anno$type == 'gene' & anno$gene_id %in% exon$gene_id]
  names(gene) = gene$gene_id
  mcols(gene) = NULL
  gene$source = factor('drseq')
  gene$type = factor('aggregate_gene')
  gene$gene_id = names(gene)
  
  ## output exon annotation
  gr1 = c(gene, exon)
  gr1 = unlist(GRangesList(S4Vectors::split(unname(gr1), gr1$gene_id)), use.names = FALSE)
  rtracklayer::export(gr1, output.exon, anno.format)
  
  ## ---- "event" flattened file
  output.event <- paste(anno.prefix, 'event', anno.format, sep = ".")
  if (verbose) cat("Outputing event flattened file to", output.event, "...\n")
  
  ## remove events outside exons (if some exons are removed)
  if (remove.overlap.exon) {
    ## AFE1/ALE1 may range multiple exons
    hit <- findOverlaps(anno_event, exon, type = "within") 
    hit <- hit[anno_event$gene_id[from(hit)] == exon$gene_id[to(hit)]]
    anno_event <- anno_event[unique(from(hit))]
  }
  
  ## exonic_part
  event <- c_granges(anno_event$genomicData, sep = "")
  exonic_part_count <- elementNROWS(anno_event$genomicData)
  event$source = factor('drseq')
  event$type = factor('exonic_part')
  event$event_id = names(event)
  event$gene_id = rep(anno_event$gene_id, exonic_part_count)
  event$transcript_id = rep(anno_event$gene_id, exonic_part_count)
  num = rep(unlist(lapply(rle(anno_event$gene_id)$lengths, seq_len)), 
            exonic_part_count)
  event$event_number = stringr::str_pad(num, 3, pad = "0")
  # event$exonic_part_id = paste(event$gene_id, event$exonic_part_number, sep = ":")
  
  # ## aggregate_gene
  # gene = unlist(range(anno_event$genomicData))
  # gene$source = factor('drseq')
  # gene$type = factor('aggregate_gene')
  # gene$gene_id = names(gene)
  
  ## output
  gr2 = c(gene, event)
  gr2 = unlist(GRangesList(S4Vectors::split(
    unname(gr2), gr2$gene_id)), use.names = FALSE)
  rtracklayer::export(gr2, output.event, anno.format)
  
}


#' Construct DrSeq Data
#'
#' This function creates the `drseqData` slot needed by DrSeq analysis, which is 
#' a [DEXSeq::DEXSeqDataSet]. This function requires two DrSeq annotation files 
#' created by [prepDrseqAnno]. Two files have the same prefix (`anno.prefix`), 
#' the same suffix (`anno.format`), and only differ by "exon/event".
#' If annotation files are missing, this function can create them freshly, which 
#' might take some time. This function counts the RNA-seq reads on ATR events 
#' and genes using `featureCounts`.
#'
#' @param event a `surf` object from [parseEvent].
#' @param sampleData `data.frame`, describes the RNA-seq samples and 
#'   contains at least two columns -- `bam` and `condition`, 
#'   whose `row.names` represent sample names.
#' @inheritParams DEXSeq::DEXSeqDataSet
#' @param remove.overlap.exon `logical`, remove overlapping exons across genes 
#'     (default to `FALSE`).
#' @param anno.prefix `character`, file names for outputting annotation files.
#'   If prefix is absent, hidden file will be output to the current working 
#'   directory. 
#' @param anno.format `character`, e.g. "gtf", as accepted by 
#'     [rtracklayer::export].
#' @param minMQS,isPairedEnd as defined in [Rsubread::featureCounts]. 
#'   Note that the default is customized for SURF (see details for more 
#'   information).
#' @param cores `integer`, number of available workers, sent to `nthreads` of 
#'     `featureCounts`.
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @param ... additional parameters for [Rsubread::featureCounts].
#' @details 
#'   If you used Illumina HiSeq 2000, set `strandSpecific = 2` (reversed strand).
#' @return a `surf` object, with `drseqData` slot updated.
#' @export
drseqCount = function(event, sampleData,
                      design = ~ sample + exon + condition:exon,
                      remove.overlap.exon = FALSE,
                      anno.prefix = "drseq.annotation",
                      anno.format = "gff2",
                      minMQS = 10,
                      isPairedEnd = TRUE,
                      cores = max(1, detectCores()-2),
                      verbose = FALSE,
                      ...) {
  # check input sampleData
  sampleData <- as.data.frame(sampleData)
  if (is.null(sampleData$bam))
    stop("sampleData must contain \"bam\" column.")
  bam.files <- sampleData$bam
  if (is.null(sampleData$condition))
    stop("sampleData must contain \"condition\" column.")
  sampleData$condition <- as.factor(sampleData$condition)
  if (length(levels(sampleData$condition))!=2)
    stop("The condition must have two levels, for IP and input respectively.")
  if (is.null(rownames(sampleData))) {
    warning("sampleData has no row.names input, created automatically.")
    rownames(sampleData) <- paste0(
      as.character(sampleData$condition),
      unlist(lapply(rle(as.character(sampleData$condition))$lengths, seq_len))
    )
  }
  
  exon.file <- paste(anno.prefix, 'exon', anno.format, sep = ".")
  event.file <- paste(anno.prefix, 'event', anno.format, sep = ".")
  if (!file.exists(exon.file) || !file.exists(event.file)) {
    cat("Cannot find event/exon annotation files at\n   ",
        exon.file, "\n   ", event.file,
        "\nMaking them freshly...\n")
    ## prepare annotation for exons and events
    prepDrseqAnno(
      anno_event = event,
      anno.prefix = anno.prefix, anno.format = anno.format,
      remove.overlap.exon = remove.overlap.exon,
      cores = cores,
      verbose = verbose
    )
  }
  
  ## count reads on exons
  count.exon = featureCounts(
    files = bam.files,
    annot.ext = exon.file,
    isGTFAnnotationFile = TRUE,
    GTF.featureType = "exonic_part",
    GTF.attrType = "gene_id",
    allowMultiOverlap = TRUE,
    minMQS = minMQS,
    isPairedEnd = isPairedEnd,
    nthreads = cores,
    verbose = verbose, ...
  )
  colnames(count.exon$counts) =
    colnames(count.exon$stat)[-1] =
    rownames(sampleData)
  # cat("Total count:", sum(count.exon$counts), "\n")
  
  ## count reads on events
  count.event = featureCounts(
    files = bam.files,
    annot.ext = event.file,
    isGTFAnnotationFile = TRUE,
    GTF.featureType = "exonic_part",
    GTF.attrType = "event_id",
    allowMultiOverlap = TRUE,
    minMQS = minMQS,
    isPairedEnd = isPairedEnd,
    nthreads = cores,
    verbose = verbose, ...
  )
  colnames(count.event$counts) =
    colnames(count.event$stat)[-1] =
    rownames(sampleData)
  
  ## DEXSeqDataSet
  index <- match(event$event_id, rownames(count.event$counts))
  if (any(is.na(index))) {
    stop("Count results can not match the event annotation.")
  } ## just to be sure
  drd <- DEXSeqDataSet(
    countData = count.event$counts[index,],
    sampleData = sampleData,
    design = design,
    featureID = event$event_id,
    groupID = event$gene_id
  )
  cat(fill = TRUE)
  
  ## replace with gene reads count (still a DEXSeqDataSet)
  cnt.other = count.exon$counts[rowData(drd)$groupID,] - 
    counts(drd)[,seq_along(bam.files)]
  counts(drd)[,length(bam.files)+seq_along(bam.files)] = cnt.other
  rowMin <- rowMin(counts(drd))
  if (any(rowMin < 0)) {
    warning(sum(rowMin < 0), " event(s) prompt to invalid REU coefficients.")
    drd = drd[rowMin >= 0,]
  }
  
  ## update our "surf" object
  event@drseqData = drd
  event@sampleData$"RNA-seq" = DataFrame(
    sample = rownames(sampleData),
    sampleData[setdiff(names(sampleData), "sample")]
  )
  metadata(event) = metadata(event)
  metadata(event)$remove.overlap.exon = remove.overlap.exon
  
  return(event)
}


#' DrSeq Fit
#'
#' Fit DrSeq models using the DEXSeq as the estimation engine.
#' 
#' @param drd a `surf` object from [drseqCount].
#' @param cores `integer`, number of computing workers.
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @inheritParams DEXSeq::DEXSeq
#' @return a `surf` object with (1) `drseqResults` and `sampleData` slot updated 
#'     and (2) three added columns:
#' \item{eventBaseMean}{base read coverage of the event from RNA-seq data.}
#' \item{padj}{adjusted p-value for differential REU.}
#' \item{logFoldChange}{estimated log2 fold change of REU: log2(condition of the 
#'     1st sample / another condition)}
#' @details the `drseqResults` slot contains extensive DrSeq results. To access 
#'     the object, use [drseqResults] function.
#' @export
drseqFit <- function(drd,
                     fullModel = design(drd@drseqData),
                     reducedModel = ~ sample + exon,
                     fitExpToVar = "condition",
                     cores = max(1, detectCores()-2),
                     verbose = FALSE) {
  dxd <- drd@drseqData ## "DEXSeqDataSet"
  if (is.null(dxd)) stop("Cannot find drseqData.")
  
  BPPARAM = MulticoreParam(workers = cores)
  dxdl = S4Vectors::split(dxd, drd[mcols(dxd)$featureID, "event_name"])
  
  if (verbose) cat("Fitting DrSeq for", paste(names(dxdl), collapse = ", "), "...\n")
  t <- system.time({
    dxrl <- lapply(dxdl, DEXSeq,
                   fullModel = fullModel,
                   reducedModel = reducedModel,
                   fitExpToVar = fitExpToVar,
                   BPPARAM = BPPARAM,
                   quiet = !verbose)
  })
  if (verbose) cat("Running time:", t[3], "sec. \n")
  
  ## added columns
  dxr = do.call("rbind", dxrl)
  index <- match(drd$event_id, dxr$featureID)
  if (any(is.na(index))) {
    stop("Event ID's do not match between DrSeq and annotation.")
  }
  dxr_sub <- as(dxr[index, c(3:10, 12)], "DataFrame")
  ## re-control FDR across all event types
  dxr_sub$padj <- p.adjust(dxr_sub$pvalue, method = "fdr") 
  names(dxr_sub)[1] <- "eventBaseMean"
  names(dxr_sub)[8] <- "logFoldChange" ## log2(condition of the 1st sample / another condition)
  mcols(dxr_sub)$type <- "drseq"
  mcols(dxr_sub)["eventBaseMean", "description"] <- 
    "mean of the counts across samples in each event"
  mcols(dxr_sub)["dispersion", "description"] <- "event dispersion estimate"
  mcols(dxr_sub)["countData", "type"] <- "RNA-seq"
  
  ## representation
  sampleData = dxrl[[1]]@sampleData
  modelFrameBM = lapply(dxrl, slot, "modelFrameBM")
  dispersionFunction = lapply(dxrl, slot, "dispersionFunction")
  metadata = do.call("c", lapply(dxrl, slot, "metadata"))
  
  ## take a subset of drd's column
  
  drr <- new("drseqResults", 
             cbind(as(drd, "DataFrame"), as(dxr_sub, "DataFrame")),
             modelFrameBM = DataFrameList(modelFrameBM),
             dispersionFunction = List(dispersionFunction))
  drr@metadata <- c(drd@metadata, metadata)
  
  res <- new(
    "surf",
    cbind(as(drd, "DataFrame"), 
          as(drr[c("eventBaseMean", "padj", "logFoldChange")], "DataFrame")),
    genePartsList = drd@genePartsList,
    drseqData = drd@drseqData,
    drseqResults = drr, ## newly added
    faseqData = drd@faseqData,
    faseqResults = drd@faseqResults,
    daseqResults = drd@daseqResults,
    sampleData = drd@sampleData
  )
  res@sampleData$"RNA-seq" <- sampleData
  metadata(res) <- metadata(drd)
  return(res)
}

#' Screen SURF training samples
#'
#' Filter DrSeq results for the analysis module 2 of SURF (DASeq).
#'
#' @param event a `surf` object output by [drseqFit].
#' @param drseq.fdr `numeric`, FDR (BH procedure) adjusted p-value cut-off.
#' @param read.length `numeric`, RNA-seq read length. Default is 100 bp (e.g., 
#'     Illumina TruSeq). This is used to adjust event base count, which is then 
#'     used to select the representative events if replicated.
#' @param min.adjMean `numeric`, adjusted event base mean threshold.
#' @param filter.overlap.event `logical`, whether (default to `TRUE`) to select 
#'     one representitive event from overlapping ones and remove the others.
#' @param verbose `logical`, whether (default to `FALSE`) to print out basic 
#'     summary statistics.
#'
#' @return a `surf` object, with three columns added:
#' \item{adjMean}{adjusted base mean of the event from RNA-seq data.}
#' \item{group}{group labels of ATR events, `increase` for increased REU upon 
#'     RBP knock-down, and `decrease` for decreased, and `no change` for 
#'     no-changed.}
#' \item{included}{logical, indicating whether the event is included into SURF 
#'     analysis module 2.}
#' @export
drseqFilter = function(event,
                       drseq.fdr = .05,
                       read.length = 100,
                       min.adjMean = .05,
                       filter.overlap.event = TRUE,
                       verbose = FALSE) {
  ## check input
  if (!all(c("eventBaseMean", "padj", "logFoldChange") %in% colnames(event))) {
    stop("You should perform DrSeq first.")
  }
  if (any(c("adjMean", "group", "included") %in% colnames(event))) {
    event$adjMean = NULL
    event$group = NULL
    event$included = NULL
  }
  
  ## event read coverage
  eventLen = vapply(width(event$genomicData), sum, numeric(1))
  adjMean = event$eventBaseMean / (eventLen + read.length - 1)
  ## group by logFoldChange x padj
  group = rep("no change", nrow(event))
  group[event$padj < drseq.fdr & event$logFoldChange < 0] = "decrease"
  group[event$padj < drseq.fdr & event$logFoldChange > 0] = "increase"
  group = ordered(group, c("decrease", "no change", "increase"))
  
  ## included event id
  id_evtDU = event$event_id[!is.na(event$padj) &
                              event$padj < drseq.fdr &
                              adjMean > min.adjMean &
                              !is.na(event$logFoldChange) &
                              abs(event$logFoldChange) > 0]
  id_evtEU = event$event_id[!is.na(event$padj) &
                              event$padj > .4 &
                              adjMean > min.adjMean &
                              !is.na(event$logFoldChange) &
                              abs(event$logFoldChange) > 0]
  if (verbose)
    cat("Identified", length(id_evtDU), "DR events and",
        length(id_evtEU), "ER events.\n")
  id_evt = union(id_evtDU, id_evtEU)
  
  ## pooling duplicated events by max{adjMean}
  ## (i) same ATR event
  ## (ii) overlapped event body
  if (filter.overlap.event) {
    event1 = event[event$event_id %in% id_evt,]
    adjMean1 <- adjMean[event$event_id %in% id_evt]
    ## sort by normalized read coverage
    event1 = event1[order(adjMean1, decreasing = TRUE),] 
    hits = findOverlaps(event1$genomicData, event1$genomicData)
    ## from back to front, will remove the smaller in adjMean
    hits = hits[from(hits) > to(hits)] 
    ## same event names
    hits = hits[event1$event_name[from(hits)] == event1$event_name[to(hits)]] 
    if (verbose) cat(length(unique(from(hits))), "overlapping events.\n")
    id_evt = setdiff(id_evt, event1$event_id[from(hits)])
  }
  
  if (!length(id_evt)) stop("No usable SURF instance!")
  included = event$event_id %in% id_evt
  
  ## new columns
  ds = DataFrame(adjMean, group, included)
  mcols(ds)$type = c("RNA-seq", "faseq", "faseq")
  mcols(ds)$description <- c(
    "adjusted base mean of each event",
    "direction of differential regulation (upon knock-down)",
    "indicator of usable event")
  
  ## wrap into faseqData object
  res <- new(
    "surf", 
    cbind(as(event, "DataFrame"), ds),
    genePartsList = event@genePartsList,
    drseqData = event@drseqData,
    drseqResults = event@drseqResults,
    faseqData = event@faseqData,
    faseqResults = event@faseqResults,
    daseqResults = event@daseqResults,
    sampleData = event@sampleData
  )
  metadata(res) = metadata(event)
  metadata(res)$drseq.fdr = drseq.fdr
  metadata(res)$RNAseq.read.length = read.length
  metadata(res)$min.adjMean = min.adjMean
  metadata(res)$filter.overlap.event = filter.overlap.event
  
  if (verbose) {
    cat(length(id_evt), "events included for SURF analysis.\n",
        "The distribution of AS/ATI/APA events identified:\n")
    print(table(data.frame(res[res$included, c("event_name", "group")])))
  }
  
  return(res)
}

#' DrSeq
#'
#' Perform the differential REU (DrSeq) test in a single command. This function 
#' is a wrapper that calls the necessary functions in order for DrSeq.
#' 
#' @inheritParams drseqCount
#' @param ... parameters for [Rsubread::featureCounts].
#' @inheritParams drseqFit
#' @inheritParams drseqFilter
#' @return a `surf` object containing the DrSeq result in the `drseqResults` 
#'     slot.
#' @references Chen, F., & Keles, S. (2020). SURF: integrative analysis of a 
#'     compendium of RNA-seq and CLIP-seq datasets highlights complex governing 
#'     of alternative transcriptional regulation by RNA-binding proteins. 
#'     *Genome Biology*, 21(1), 1-23.
#' @export
drseq <- function(event,
                  
                  ## data
                  sampleData,
                  design = ~ sample + exon + condition:exon,
                  remove.overlap.exon = FALSE,
                  anno.prefix = "drseq.annotation",
                  anno.format = "gff2",
                  minMQS = 10,
                  isPairedEnd = TRUE,
                  ...,
                  
                  ## fit
                  fullModel = design,
                  reducedModel = ~ sample + exon,
                  fitExpToVar = "condition",
                  
                  ## filter
                  drseq.fdr = .05,
                  read.length = 100,
                  min.adjMean = .05,
                  filter.overlap.event = TRUE,
                  
                  cores = max(1, detectCores()-2),
                  verbose = FALSE) {
  if (verbose)
    cat("Preparing DrSeq count dataset...\n")
  timer <- system.time({
    event <- drseqCount(
      event, sampleData,
      design = design,
      remove.overlap.exon = remove.overlap.exon,
      anno.prefix = anno.prefix,
      anno.format = anno.format,
      minMQS = minMQS,
      isPairedEnd = isPairedEnd,
      cores = cores,
      verbose = verbose, ...
    )
  })
  if (verbose) cat("Run time (RNA-seq read counting):", timer[3], "sec.\n")
  
  # if (verbose) cat("Fitting DrSeq models...\n")
  timer <- system.time({
    event <- drseqFit(
      event,
      fullModel = fullModel,
      reducedModel = reducedModel,
      fitExpToVar = fitExpToVar,
      cores = cores,
      verbose = verbose
    )
  })
  if (verbose) cat("Run time (model fitting):", timer[3], "sec.\n")
  
  if (verbose) cat("Annotating/referencing DrSeq results...\n")
  event <- drseqFilter(
    event,
    drseq.fdr = drseq.fdr,
    read.length = read.length,
    min.adjMean = min.adjMean,
    filter.overlap.event = filter.overlap.event,
    verbose = verbose
  )
  
  return(event)
}


## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ------ faseq ------
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## SURF Analysis Module 2

#' Construct FASeq Data Set
#'
#' This function quantifies feature signals for location features using CLIP-seq 
#' data. You align CLIP-seq reads to the genome and provide FASeq with the 
#' resulting bam files. We will take care of the rest.
#'
#' @param event a `surf` object.
#' @param sampleData `data.frame`, must contain two columns -- 
#'   "bam" and "condition" (for "IP" and "input", where "IP" should come first), 
#'   whose `row.names` represent the sample names. "bam" is the file name of 
#'   CLIP-seq bam. "condition" will be coerced to factor, whose first level will 
#'   be treated as IP, and the second level as input.
#' @param signal.type `character`, indicate the type of feature signal wanted, 
#'   support "TPM" for Transcripts Per Kilobase Million, 
#'   "FPKM" for Fragments Per Kilobase Million (for paired-end reads) and 
#'   Reads Per Kilobase Million (for single-end reads), and 
#'   "raw.count" for raw read counts
#' @param FUN.aggregate `function`, used for aggregating signals within 
#'     `condition`, default to [mean()].
#' @param cores `integer`, number of available workers, 
#'   sent to `nthreads` of [featureCounts]
#' @param verbose `logical`, whether (default to `TRUE`) to echo progress
#' @param minMQS,minOverlap,isPairedEnd,... parameters for [featureCounts]. 
#'   `minMQS` is default to 10, and `minOverlap` is default to 12 (25% of the 
#'   typical read length of CLIP-seq (~50bp)), and `isPairedEnd` is default to 
#'   `TRUE`.
#' @return a `surf` object, with 
#'   (1) one column `featureSignal` added, 
#'   (2) `faseqData` slot updated, and 
#'   (3) `sampleData` slot updated.
#' @details 
#'   If your sequencing platform is Illumina HiSeq 2000, set 
#'   `strandSpecific = 2`.
#' @keywords feature signal, CLIP-seq
#' @references \url{https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/}
#' @export
faseqCount <- function(event, sampleData,
                       signal.type = "FPKM",
                       FUN.aggregate = "mean",
                       minMQS = 10,
                       minOverlap = 12,
                       isPairedEnd = TRUE,
                       cores = max(1, detectCores()-2),
                       verbose = FALSE, ...) {
  ## check @sampleData
  if (length(signal.type) != 1 || 
      !signal.type %in% c("TPM", "FPKM", "raw.count")) {
    stop("Please specify signal type.")
  }
  sampleData <- as.data.frame(sampleData)
  if (is.null(sampleData$bam))
    stop("sampleData must contain \"bam\" column.")
  bam.files <- sampleData$bam
  if (is.null(sampleData$condition))
    stop("sampleData must contain \"condition\" column.")
  sampleData$condition <- as.factor(sampleData$condition)
  if (length(levels(sampleData$condition))!=2)
    stop("The condition must have two levels, for IP and input respectively.")
  if (is.null(rownames(sampleData))) {
    warning("sampleData has no row.names input, created automatically.")
    rownames(sampleData) <- paste0(
      as.character(sampleData$condition),
      unlist(lapply(rle(as.character(sampleData$condition))$lengths, seq_len)))
  }
  
  
  if (any("featureSignal" %in% colnames(event))) {
    event$featureSignal = NULL
  }
  
  ## featureCounts
  count.feature = featureCounts(
    bam.files,
    annot.ext = ensembldb::toSAF(event$feature),
    useMetaFeatures = FALSE,
    allowMultiOverlap = TRUE,
    minOverlap = minOverlap,
    minMQS = minMQS,
    isPairedEnd = isPairedEnd,
    nthreads = cores,
    verbose = verbose, ...
  )
  colnames(count.feature$counts) =
    colnames(count.feature$stat)[-1] =
    rownames(sampleData)
  
  ## prepare @sampleData
  CLIPseqSampleData = DataFrame(
    sample = rownames(sampleData),
    sampleData[setdiff(names(sampleData), "sample")],
    depth = colSums(count.feature$stat[rownames(sampleData)])
  )
  mcols(CLIPseqSampleData)$type = "input"
  mcols(CLIPseqSampleData)$description = ""
  
  ## construct "faseqData", CLIP-seq read count
  if (verbose) cat("Generating signals from individual samples...\n")
  featureCount <- count.feature$counts
  rownames(featureCount) <- NULL
  rd <- c_granges(event$feature, sep = "-")
  n_feature <- elementNROWS(event$feature)
  rd$event_id <- rep(event$event_id, n_feature)
  rd$event_name <- rep(event$event_name, n_feature)
  rd$gene_id <- rep(event$gene_id, n_feature)
  rd$transcript_id <- rep(event$transcript_id, n_feature)
  rd$feature_name <- factor(unlist(lapply(event$feature, names)), surf.features)
  faseqData = SummarizedExperiment(
    assays = List(count = featureCount),
    rowData = rd,
    colData = CLIPseqSampleData,
  )
  
  ## transform count into signal
  if (signal.type == "raw.count") {
    signal <- count.feature$counts
    f <- 1
  } else {
    rgs <- unlist(event$feature)
    if (signal.type == "FPKM") {
      f <- colSums(count.feature$stat[-1]) * 1e-6 # "per million" scaling factor
      rpk <- t(t(count.feature$counts / f)) # reads per million (RPM)
      signal <- rpkm <- rpk / width(rgs) * 1e3
    }
    if (signal.type == "TPM") {
      rpk <- count.feature$counts / width(rgs) * 1e3 # reads per kilobase (RPK).
      f <- colSums(rpk) * 1e-6 # "per million" scaling factor
      signal <- tpm <- t(t(rpk) / f)
    }
  }
  rownames(signal) <- names(unlist(event$feature))
  
  ## aggregate by condition -- mean, then take the difference: IP - input
  if (verbose) cat("Contrasting IP from SMInput...\n")
  # colnames(signal) = sampleData$condition
  # signal <- signal %>%
  #   melt(varnames = c("feature", "condition")) %>%
  #   group_by(feature, condition) %>%
  #   summarise(aggregate = FUN.aggregate(value)) %>%
  #   group_by(feature) %>%
  #   summarise(contrast = - diff(aggregate))
  signal.IP = signal[,sampleData$condition == sampleData$condition[1], 
                     drop = FALSE]
  if (FUN.aggregate == "mean") {
    signal.IP <- rowMeans(signal.IP)
  } else {
    signal.IP <- apply(signal.IP, 1, FUN.aggregate)
  }
  signal.input = signal[
    ,sampleData$condition == sampleData$condition[nrow(sampleData)], 
    drop = FALSE]
  if (FUN.aggregate == "mean") {
    signal.input <- rowMeans(signal.input)
  } else {
    signal.input <- apply(signal.input, 1, FUN.aggregate)
  }
  signal.contrast <- signal.IP - signal.input
  names(signal.contrast) <- names(unlist(event$feature, use.names = FALSE))
  featureSignal = relist(signal.contrast, event$feature)
  
  ## annotate new column's attribute
  ds = DataFrame(featureSignal)
  mcols(ds)$type <- "CLIP-seq"
  mcols(ds)$description <- "normalized CLIP-seq feature signals (contrasted)"
  
  ## add scaling factor to sampleData
  CLIPseqSampleData <- cbind(CLIPseqSampleData, sizeFactor = f)
  mcols(CLIPseqSampleData)["sizeFactor", "type"] = "intermediate"
  mcols(CLIPseqSampleData)["sizeFactor", "description"] = 
    "\"per million\" scaling factor"
  
  res <- new(
    "surf", 
    cbind(as(event, "DataFrame"), ds),
    genePartsList = event@genePartsList,
    drseqData = event@drseqData,
    drseqResults = event@drseqResults,
    faseqData = faseqData, ## newly added
    faseqResults = event@faseqResults,
    daseqResults = event@daseqResults,
    sampleData = event@sampleData
  )
  res@sampleData$"CLIP-seq" <- CLIPseqSampleData
  metadata(res) = metadata(event)
  metadata(res)$signal.type = signal.type
  metadata(res)$FUN.aggregate = FUN.aggregate
  return(res)
}

#' Perform the functional association test (FAT)
#'
#' This is a learning one unit of SURF.
#' It trains a GLM model for the functional association of one RBP with one ATR 
#' event.
#'
#' @param data `data.frame`, contains training data for one RBP and one event 
#'     type
#' @param min.size `integer`, the minimum size of "reliable" training set, 
#'     default to 60.
#' @param trim `numeric`, the percentile used to trim the training data. 
#'     This is useful in producing a more robust estimation of functional 
#'     association.
#' @return a `data.frame` that summarizes the FAT.
fat = function(data, min.size = 60, trim = 0.025) {
  feature = intersect(colnames(data), 
                      c("up3", "up2", "up1", "bd1", "bd2", "dn1", "dn2", "dn3"))
  res = data.frame()
  # inc vs ctrl
  coef.inc = t(vapply(feature, function(f) {
    sub = data[data$group != "decrease", c("group", f)]
    sub = sub[sub[[f]] > quantile(sub[[f]], trim) &
                sub[[f]] < quantile(sub[[f]], 1 - trim),]
    fit.glm = arm::bayesglm(paste("group ~", f), binomial(link = "logit"), sub)
    coef = coef(summary(fit.glm))[2,]
    coef[4] = pnorm(coef[3], lower.tail = FALSE)
    names(coef)[4] = "p.value"
    coef
  }, FUN.VALUE = numeric(4)))
  res = rbind(res, cbind(
    feature = feature,
    size = sum(data$group == "increase"),
    as.data.frame(coef.inc),
    functional = "exclusion"))
  
  # dec vs ctrl
  coef.dec = t(vapply(feature, function(f) {
    sub = data[data$group != "increase", c("group", f)]
    sub = sub[sub[[f]] > quantile(sub[[f]], trim) &
                sub[[f]] < quantile(sub[[f]], 1 - trim),]
    fit.glm = arm::bayesglm(paste("group ~", f), binomial(link = "logit"), sub)
    coef = coef(summary(fit.glm))[2,]
    coef[4] = pnorm(coef[3], lower.tail = TRUE)
    names(coef)[4] = "p.value"
    coef
  }, FUN.VALUE = numeric(4)))
  res = rbind(res, cbind(
    feature = feature,
    size = sum(data$group == "decrease"),
    as.data.frame(coef.dec),
    functional = "inclusion"))
  
  res = dplyr::filter(res, .data$size >= min.size)
  return(res)
}

#' Functional Association using Sequencing data
#'
#' This function performs functional association test (FAT).
#' The null hypothesis of FAT is that there is no association between 
#' feature signals and differential ATR.
#'
#' @param event a `surf` object output by [faseqCount].
#' @inheritParams fat
#' @param verbose `logical`, whether to print out progress information.
#' @return a `surf` object with `faseqResults` slot updated.
#' @export
faseqFit <- function(event,
                     min.size = 60,
                     trim = 0.025,
                     verbose = FALSE) {
  ## check
  stopifnot(all(c("group", "included", "featureSignal") %in% colnames(event)))
  
  ## format data for SURF, group by event_name
  dat = event[event$included, c("group","featureSignal")]
  event_name = event[event$included, "event_name"]
  dat = S4Vectors::split(dat, event_name)
  dat <- dat[!vapply(dat, is.null, FUN.VALUE = logical(1)) & 
               !!vapply(dat, nrow, FUN.VALUE = integer(1))]
  if (verbose) cat("Testing location features for", length(dat), "events:",
                   paste(names(dat), collapse = " "), "\n")
  
  ## after slipt, ncol() becomes the same within each event type,
  ## thus can coerce into data.frame
  dat <- lapply(dat, function(x) {
    x$featureSignal <- list_rbind(x$featureSignal)
    data.frame(group = x$group, x$featureSignal)
  })
  
  testing <- lapply(dat, fat,
                    min.size = min.size,
                    trim = trim)
  
  res <- list_rbind(testing, save.names = "event")
  res$event <- factor(res$event, surf.events)
  res$padj = p.adjust(res$p.value, method = "fdr")
  res <- DataFrame(res)
  mcols(res)$type = "faseq"
  mcols(res)$description = c(
    "event type/category",
    "positional feature",
    "number of events (sample size in FA test)",
    "estimated feature main effect",
    "estiamted feature standard error",
    "standardized Z value (Gaussian)",
    "p value",
    "inferred regulating function",
    "adjusted p value (BH)"
  )
  if (nrow(res)) {
    rownames(res) <- paste0(res$event, "-", res$feature, ":", res$functional)
  }
  
  metadata(res) <- list(min.size = min.size,
                        trim = trim)
  faseqResults <- new("faseqResults", res)
  
  event@faseqResults <- faseqResults
  metadata(event)$min.size = min.size
  metadata(event)$trim = trim
  
  return(event)
}

#' Functional association inference
#'
#' Inference the functionality of individual location features,
#' where the RBP is likely to interact and regulate the corresponding ATR event.
#' A location feature is inferred as function associated if
#' (i) it is included in FAT, and
#' (ii) the corresponding FAT is significant (padj < cut.off), and
#' (iii) it has strong binding signal (featureSignal > cut.off).
#'
#' @param event a `surf` object from [faseq] or [faseqFit].
#' @param fdr.cutoff `numeric`, significance cutoff for the adjusted p-values.
#' @param signal.cutoff `numeric`, threshold cut-off for the eCLIP signals, 
#'   default to 20. Set this to 0 if don't want to filter those location with 
#'   low eCLIP signals of the RBP.
#' @return a `surf` object, with one added `inferredFeature` column 
#'   (inclusion/exclusion/none).
#' @export
faseqInfer = function(event,
                      fdr.cutoff = 0.05,
                      signal.cutoff = 20) {
  if (any("inferredFeature" %in% colnames(event))) {
    event$inferredFeature = NULL
  }
  
  far <- event@faseqResults
  signal <- unlist(event$featureSignal, use.names = FALSE)
  event_name <- rep(event$event_name, elementNROWS(event$feature))
  group <- rep(event$group, elementNROWS(event$feature))
  included <- rep(event$included, elementNROWS(event$feature))
  testFeature <- paste0(event_name, "-", names(signal))
  inferred <- rep("none", length(signal))
  
  ## infer inclusion features
  subfar <- far[far$padj < fdr.cutoff &
                  far$functional == "inclusion",]
  sigFeature <- paste0(subfar$event, "-", subfar$feature)
  inferred[testFeature %in% sigFeature &
             signal > signal.cutoff &
             group == "decrease" &
             included] <- "inclusion"
  
  ## infer exclusion features
  subfar <- far[far$padj < fdr.cutoff &
                  far$functional == "exclusion",]
  sigFeature <- paste0(subfar$event, "-", subfar$feature)
  inferred[testFeature %in% sigFeature &
             signal > signal.cutoff &
             group == "increase" &
             included] <- "exclusion"
  
  ## recode into factor & relist
  inferred <- factor(inferred,
                     c("inclusion", "exclusion", "none"))
  inferred <- FactorList(relist(inferred, event$featureSignal))
  
  ds <- DataFrame(inferredFeature = inferred)
  mcols(ds)$type <- "faseq"
  mcols(ds)$description <- "inferred functionality of location features"
  
  res <- new(
    "surf", 
    cbind(as(event, "DataFrame"), ds),
    genePartsList = event@genePartsList,
    drseqData = event@drseqData,
    drseqResults = event@drseqResults,
    faseqData = event@faseqData,
    faseqResults = event@faseqResults,
    daseqResults = event@daseqResults,
    sampleData = event@sampleData
  )
  metadata(res) = metadata(event)
  metadata(res)$faseq.fdr = fdr.cutoff
  metadata(res)$signal.cutoff = signal.cutoff
  return(res)
}

#' DASeq
#'
#' Perform the functional association analysis (DASeq) in a single command.
#' This function is a wrapper that calls the necessary functions in order for 
#' DASeq.
#'
#' @inheritParams faseqCount
#' @param ... parameters for [Rsubread::featureCounts].
#' @inheritParams faseqFit
#' @inheritParams faseqInfer
#' @return a `surf` object DASeq results updated.
#' @references Chen, F., & Keles, S. (2020). SURF: integrative analysis of a 
#'     compendium of RNA-seq and CLIP-seq datasets highlights complex governing 
#'     of alternative transcriptional regulation by RNA-binding proteins. 
#'     *Genome Biology*, 21(1), 1-23.
#' @export
faseq <- function(event,
                  ## data
                  sampleData,
                  signal.type = "FPKM",
                  FUN.aggregate = "mean",
                  minMQS = 10,
                  minOverlap = 12,
                  isPairedEnd = TRUE,
                  cores = max(1, detectCores()-2),
                  ...,
                  
                  ## fit
                  min.size = 100,
                  trim = 0.025,
                  
                  ## inference
                  fdr.cutoff = 0.05,
                  signal.cutoff = 20,
                  
                  verbose = FALSE) {
  
  if (verbose)
    cat("Counting CLIP-seq reads (FASeq data)...\n")
  event <- faseqCount(
    event, sampleData,
    signal.type = signal.type,
    FUN.aggregate = FUN.aggregate,
    minMQS = minMQS,
    minOverlap = minOverlap,
    isPairedEnd = isPairedEnd,
    cores = cores,
    verbose = verbose, ...
  )
  
  if (verbose)
    cat("Performing functional association test (FAT)...\n")
  event <- faseqFit(
    event,
    min.size = min.size,
    trim = trim,
    verbose = verbose
  )
  
  if (verbose)
    cat("Inferencing functional association...\n")
  event <- faseqInfer(
    event,
    fdr.cutoff = fdr.cutoff,
    signal.cutoff = signal.cutoff
  )
  
  return(event)
}


## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## ------ daseq ------
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## SURF Discovery Module 1

#' Get target set based on `inferredFeature`
#' @param event a `surf` object from [faseqInfer] or [faseq].
#' @param id_column `character`, the name of the column that contains target IDs.
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @return a `list` of `character`, each being a set of target identifiers.
getTargetSet <- function(event,
                         id_column = "transcript_id",
                         verbose = FALSE) {
  targeted <- sapply(event$inferredFeature != "none", any)
  event_targeted <- event[targeted,]
  id <- split(event_targeted[[id_column]],
              paste0(event_targeted$event_name))
  lapply(id, unique)
}

#' Get control sets
#' @param event a `surf` object from [faseqInfer] or [faseq].
#' @param targetSets a named `list` of `character`, the target IDs. 
#' @param id_column `character`, the name of the column that contains target IDs.
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @return a `list` of `character`, containing the control set for each target 
#'     set.
getControlSet <- function(event, targetSets,
                          id_column = "transcript_id",
                          verbose = FALSE) {
  isoPL <- event@genePartsList
  lapply(targetSets, function(targetSet) {
    targeted <- selectMethod("%in%", "Vector")(isoPL[[id_column]], targetSet)
    any_targeted <- sapply(targeted, any)
    unlist(isoPL[[id_column]][any_targeted], use.names = FALSE)
  })
}

#' Build gene/transcript rankings for each sample
#'
#' Builds the "rankings" for each sample: expression-based ranking for all the 
#' genes/transcripts in each sample. The genes/transcripts with same expression 
#' value are shuffled. Therefore, genes/transcripts with expression '0' are 
#' randomly sorted at the end of the ranking. These "rankings" can be seen as a 
#' new representation of the original dataset. Once they are calculated, they 
#' can be saved for future analyses.
#'
#' @param exprMat Expression matrix (genes/transcripts as rows, samples as 
#'     columns). The expression matrix can also be provided as one of the 
#'     Bioconductor classes:
#' \itemize{
#' \item [RangedSummarizedExperiment] and derived classes:
#' The matrix will be obtained through `assay(exprMat)`,
#' -which will extract the first assay (usually the counts)-
#' or the assay name given in `assayName`
#' \item \link[Matrix]{dgCMatrix-class}:
#' Sparse matrix
#' \item `ExpressionSet`:
#' The matrix will be obtained through `exprs(exprMat)`
#' }
#' @param cores `integer`, number of computing workers.
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @inheritParams AUCell::AUCell_buildRankings
#' @return a `SummarizedExperiment` object.
#' @export
getRankings <- function(exprMat,
                        plotStats = FALSE,
                        cores = max(1, detectCores() - 2),
                        verbose = FALSE) {
  
  
  ## rank exprMat
  rankings <- AUCell::AUCell_buildRankings(
    as.matrix(exprMat),
    plotStats = plotStats,
    nCores = cores,
    verbose = verbose
  )
  names(dimnames(assays(rankings, withDimnames = FALSE)$ranking)) =
    c("genomic feature", "sample")
  SummarizedExperiment(assays(rankings))
}

#' Calculate AUC
#' @param set a `list` of sets (or signatures) to test. 
#'   The sets should be provided as `GeneSet`, `GeneSetCollection` or 
#'   `character` list.
#' @param cores integer, number of computing cores to use.
#' @param ... additional parameters for [AUCell::AUCell_calcAUC].
#' @inheritParams AUCell::AUCell_calcAUC
#' @return a `SummarizedExperiment` object.
calculateAUC <- function(set, rankings,
                         cores = 1,
                         verbose = FALSE, ...) {
  if (!is.list(set) || is.null(names(set)))
    stop("set must be a named list.")
  if (!length(set))
    return(SummarizedExperiment())
  AUC <- AUCell::AUCell_calcAUC(
    set,
    AUCell::aucellResults(rankings),
    nCores = cores,
    verbose = verbose, ...
  )
  names(dimnames(assays(AUC, withDimnames = FALSE)$AUC)) = c("set", "sample")
  SummarizedExperiment(assays(AUC))
}

#' Aggregate AUC by sample condition
#'
#' @param object a `SummarizedExperiment` output from [calculateAUC].
#' @param sampleData `data.frame` of sample data, which contains a `condition` 
#'     column.
#' @param FUN.aggregate function, used for aggregating AUC within `condition`, 
#'     default to `mean`.
#' @return a `data.frame` with 4 columns: `set`, `condition.1`, `condition.2`, 
#'     and `diff`.
aggregateAUCbyCondition <- function(object, sampleData,
                                    FUN.aggregate = "median") {
  conditions <- levels(sampleData$condition)
  getAUC(object) %>%
    reshape2::melt(value.name = "AUC") %>%
    dplyr::mutate(set = as.character(.data$set),
                  sample = as.character(.data$sample)) %>%
    left_join(data.frame(sampleData), by = "sample") %>%
    group_by(.data$set, .data$condition) %>%
    summarise(AUC = match.fun(FUN.aggregate)(.data$AUC)) %>%
    ungroup() %>%
    pivot_wider(id_cols = .data$set,
                names_from = .data$condition,
                values_from = "AUC") %>%
    # dplyr::select(set, conditions[1], conditions[2]) %>%
    dplyr::mutate(diff = .data[[conditions[1]]] - .data[[conditions[2]]])
  
  # AUC <- getAUC(object)
  # set <- rownames(AUC)
  # condition <- sampleData(object)[set, "condition"]
  # aggrAUC <- apply(getAUC(auc), 1, function(x) {
  #   a <- aggregate(x, list(condition = condition), "mean")
  #   setNames(a$x, a$condition)
  # })
  # res <- cbind(set = set, t(aggrAUC))
  # res$diff = res[[2]] - res[[3]] ## is this necessary?
  #
}

#' Differential activity (via AUC)
#'
#' Detect differential activity using the AUC measure and RNA-seq quantification.
#' This unit is a helper of [daseq].
#'
#' @param targetSet `character` vector, set of targeted units.
#' @param controlSet `character` vector, control set for contrast. 
#'   If `NULL` (default), the full set of elements in `rankings` will be used.
#' @param rankings a `SummarizedExperiment` object, with row corresponding to 
#'   transcript or gene and column corresponding to samples. 
#'   Each element is a expression measure (e.g., TPM).
#' @param n.sample `integer`, number of times of controlSet sampling, default to 
#'     1000.
#' @param cores `integer`, number of computing cores.
#' @param verbose `logical`, whether to print out progress report.
#' @inheritParams calculateAUC
#' @inheritParams aggregateAUCbyCondition
#' @return a `data.frame` of DASeq results.
diffAUC <- function(targetSet,
                    controlSet = NULL,
                    rankings,
                    sampleData,
                    n.sample = 1000,
                    cores = 1,
                    verbose = FALSE, ...) {
  conditions = levels(sampleData$condition)
  
  ## calculate targetSet AUC
  auc <- calculateAUC(list("NONAME" = targetSet),
                      rankings,
                      cores = 1,
                      verbose = FALSE, ...)
  auc.obs <- aggregateAUCbyCondition(auc, sampleData)
  
  ## sample control sets
  if (is.null(controlSet)) controlSet = rownames(rankings)
  controlSetSamples = lapply(seq_len(n.sample), sample,
                             x = controlSet,
                             size = length(targetSet))
  names(controlSetSamples) = seq_len(n.sample)
  
  ## calculate control AUC dist'n
  auc <- calculateAUC(controlSetSamples,
                      rankings,
                      cores = cores,
                      verbose = FALSE, ...)
  auc.null <- aggregateAUCbyCondition(auc, sampleData)
  
  
  res <- data.frame(
    base = weighted.mean(auc.obs[2:3], table(sampleData$condition)),
    auc.obs[2:3],
    # p.cond1 = 2 * min(mean(auc.obs[[2]] > auc.null[[2]]),
    #                   mean(auc.obs[[2]] < auc.null[[2]])),
    # p.cond2 = 2 * min(mean(auc.obs[[3]] > auc.null[[3]]),
    #                   mean(auc.obs[[3]] < auc.null[[3]])),
    background = mean(auc.null$diff),
    stat = (auc.obs$diff - mean(auc.null$diff)) / sd(auc.null$diff),
    p.value = mean(abs(auc.null$diff) > abs(auc.obs$diff))
  )
  # names(res)[4:5] = paste0("PD.", conditions)
  return(res)
}

#' Differential activity analysis (DASeq)
#'
#' Detect differential activity using the AUC measure and RNA-seq 
#' quantification. For more details about methodology, see Details.
#' This function can be used as part of SURF or as well a stand-along analysis.
#' For the former, input a `surf` object to `event`.
#' For the latter, input `targetSets` and optionally `controlSets`.
#'
#' @param rankings a `SummarizedExperiment` object from [getRankings].
#' @param sampleData `data.frame`, external samples, which contain `condition` 
#'     column, whose row names must match the column names of `rankings`.
#' @param event a `surf` object from [faseqInfer] or [faseq].
#' @param target.type `character(1)`, either "transcript" or "gene".
#' @param controlSets a named `list` of `character`, the control IDs. 
#' @param verbose `logical`, whether (`TRUE`) to echo progress.
#' @inheritParams getControlSet
#' @inheritParams diffAUC
## @example inst/examples/example_AUCell_buildRankings.R
#' @return 
#'   a `daseqResults` object if `targetSets` was given, 
#'   or a `surf` object if `event` was give.
#' @references Chen, F., & Keles, S. (2020). SURF: integrative analysis of a 
#'     compendium of RNA-seq and CLIP-seq datasets highlights complex governing 
#'     of alternative transcriptional regulation by RNA-binding proteins. 
#'     *Genome Biology*, 21(1), 1-23.
#' @export
daseq <- function(event = NULL, 
                  rankings, sampleData,
                  target.type = "transcript",
                  targetSets = NULL,
                  controlSets = NULL,
                  n.sample = 1000,
                  cores = max(1, detectCores() - 2),
                  verbose = FALSE, ...) {
  ## check rankings and sampleData
  if (ncol(rankings) != nrow(sampleData) ||
      any(colnames(rankings) != rownames(sampleData))) {
    stop("colnames(rankings) and rownames(sampleData) must match.")
  }
  if (is.null(sampleData$condition)){
    stop("sampleData must contain a \"condition\" column indicating two groups to compare; please add.")
  }
  sampleData$condition <- as.factor(sampleData$condition)
  if (nlevels(sampleData$condition) != 2) {
    stop("The condition must have two levels.")
  }
  
  ## standardize sampleData
  externalSampleData = DataFrame(
    sample = rownames(sampleData),
    sampleData[setdiff(names(sampleData), "sample")]
  )
  
  if (is.null(event) && is.null(targetSets)) {
    stop("Either event or targetSets is required.")
  }
  
  ## auto generate target sets
  if (is.null(targetSets)) {
    stopifnot(target.type %in% c("transcript", "gene"))
    if (verbose) cat("Infer target/control", target.type, "sets\n")
    id_column = paste0(target.type, "_id")
    targetSets <- getTargetSet(event,
                               id_column = id_column,
                               verbose = verbose)
    targetSets <- lapply(targetSets, intersect, rownames(rankings))
    if (!length(targetSets)) {
      cat("Halt: there is none SURF-inferred feature.\n")
      return(event)
    }
    controlSets <- getControlSet(event, targetSets,
                                 id_column = id_column,
                                 verbose = verbose)
    controlSets <- lapply(controlSets, intersect, rownames(rankings))
  }
  
  ## check targetSets and controlSets
  if (is.null(names(targetSets))) {
    stop("All target sets should be named.")
  }
  if (is.null(controlSets)) {
    controlSets = lapply(targetSets, as.null)
  }
  if (length(targetSets) != length(controlSets) ||
      any(names(targetSets) != names(controlSets))) {
    stop("Names of target and control sets must match.")
  }
  
  ## @AUC
  if (verbose) cat("Calculating AUC for target sets...\n")
  AUC <- calculateAUC(targetSets,
                      rankings,
                      cores = cores,
                      verbose = FALSE, ...)
  # externalSampleData <- cbind(colData(AUC), externalSampleData)
  colData(AUC) <- externalSampleData
  
  ## non-parametric differential activity test
  if (verbose) cat("Testing for differential activity...\n")
  test.res <- mapply(
    diffAUC, targetSets, controlSets,
    MoreArgs = list(rankings = rankings,
                    sampleData = externalSampleData,
                    n.sample = n.sample,
                    cores = cores,
                    verbose = verbose, ...),
    SIMPLIFY = FALSE
  ) %>% bind_rows()
  
  ## collect results
  res = DataFrame(id = names(targetSets),
                  size = sapply(targetSets, length),
                  set = List(targetSets),
                  control = List(controlSets),
                  test.res,
                  row.names = names(targetSets))
  res$padj <- p.adjust(res$p.value, method = "fdr")
  
  ## annotate attributes in mcols()
  mcols(res)$type = "daseq"
  conditions = levels(colData(rankings)$condition)
  mcols(res)$description = c(
    "target set identifier",
    "target set size",
    "target set",
    "full control set",
    "base AUC of target set",
    paste("target set activity (AUC) in", conditions[1]),
    paste("target set activity (AUC) in", conditions[2]),
    "background difference in activity (control set)",
    "statistic (parametric analogue, reference only)",
    paste0("p-value of differential activity (", 
           conditions[1], " vs. ",conditions[2], 
           ") contrasted to background"),
    "adjusted p-values"
  )
  metadata(res) = list(target.type = target.type,
                       n.resample = n.sample)
  
  daseqResults <- new("daseqResults", res, AUC = AUC)
  
  if (!is.null(event)) {
    event@daseqResults <- daseqResults
    event@sampleData$"External" <- externalSampleData
    metadata(event) <- metadata(event)
    metadata(event)$target.type = target.type
    metadata(event)$n.resample = n.sample
    return(event)
  } else {
    return(daseqResults)
  }
}
fchen365/surf documentation built on June 18, 2021, 12:02 p.m.