R/survGroup.R

Defines functions run_surv survGroup

Documented in survGroup

#' Predict genesets associated with survival
#' @param maf an \code{\link{MAF}} object generated by \code{\link{read.maf}}
#' @param top If genes is \code{NULL} by default used top 20 genes
#' @param genes Manual set of genes
#' @param geneSetSize Default 2
#' @param minSamples minimum number of samples to be mutated to be considered for analysis. Default 5
#' @param clinicalData dataframe containing events and time to events. Default looks for clinical data in annotation slot of \code{\link{MAF}}.
#' @param time column name contining time in \code{clinicalData}
#' @param Status column name containing status of patients in \code{clinicalData}. must be logical or numeric. e.g, TRUE or FALSE, 1 or 0.
#' @param verbose Default TRUE
#' @param plot Default FALSE If TRUE, generate KM plots of the genesets combinations.
#' @export
#' @examples
#' laml.maf <- system.file("extdata", "tcga_laml.maf.gz", package = "maftools")
#' laml.clin <- system.file("extdata", "tcga_laml_annot.tsv", package = "maftools")
#' laml <- read.maf(maf = laml.maf,  clinicalData = laml.clin)
#' survGroup(maf = laml, top = 20, geneSetSize = 1, time = "days_to_last_followup", Status = "Overall_Survival_Status", plot = FALSE)

survGroup = function(maf, top = 20, genes = NULL, geneSetSize = 2, minSamples = 5, clinicalData = NULL, time = "Time",
                     Status = "Status", verbose = TRUE, plot = FALSE){

  if(is.null(genes)){
    genes = getGeneSummary(x = maf)[1:top, Hugo_Symbol]
  }

  if(length(genes) < 2){
    stop("Minimum two genes required!")
  }

  genesCombn = combn(x = genes, m = geneSetSize)

  if(verbose){
    cat("------------------\n")
    cat(paste0("genes: ", length(genes), "\n"))
    cat(paste0("geneset size: ", geneSetSize, "\n"))
    cat(paste0(ncol(genesCombn), " combinations\n"))
  }

  if(is.null(clinicalData)){
    if(verbose){
      message("Looking for clinical data in annoatation slot of MAF..")
    }

    clinicalData = data.table::copy(getClinicalData(x = maf))
    clinicalData = data.table::setDT(clinicalData)
  }else{
    clinicalData = data.table::setDT(clinicalData)
  }

  if(!"Tumor_Sample_Barcode" %in% colnames(clinicalData)){
    print(colnames(clinicalData))
    stop("Column Tumo_Sample_Barcode not found in clinical data. Check column names and rename it to Tumo_Sample_Barcode if necessary.")
  }

  if(length(colnames(clinicalData)[colnames(clinicalData) %in% time]) == 0){
    print(colnames(clinicalData))
    stop(paste0(time, " not found in clinicalData. Use argument time to povide column name containing time to event."))
  }else{
    colnames(clinicalData)[colnames(clinicalData) %in% time] = 'Time'
  }

  if(length(colnames(clinicalData)[colnames(clinicalData) %in% Status]) == 0){
    print(colnames(clinicalData))
    stop(paste0(Status, " not found in clinicalData. Use argument Status to povide column name containing events (Dead or Alive)."))
  }else{
    colnames(clinicalData)[colnames(clinicalData) %in% Status] = 'Status'
  }

  clinicalData$Time = suppressWarnings(as.numeric(as.character(clinicalData$Time)) )
  clinicalData$Status = suppressWarnings(as.integer(clinicalData$Status))
  clinicalData$Time = ifelse(test = is.infinite(clinicalData$Time), yes = NA, no = clinicalData$Time)
  if(nrow(clinicalData[!is.na(Time)][!is.na(Status)]) < nrow(clinicalData)){
    message(paste0("Removed ", nrow(clinicalData) - nrow(clinicalData[!is.na(Time)][!is.na(Status)]),
                   " samples with NA's"))
    clinicalData = clinicalData[!is.na(Time)][!is.na(Status)]
  }

  om = createOncoMatrix(m = maf, g = genes)
  all.tsbs = levels(getSampleSummary(x = maf)[,Tumor_Sample_Barcode])

  mutMat = t(om$numericMatrix)
  missing.tsbs = all.tsbs[!all.tsbs %in% rownames(mutMat)]

  if(nrow(mutMat) < 2){
    stop("Minimum two genes required!")
  }
  mutMat[mutMat > 0 ] = 1

  res = lapply(seq_along(1:ncol(genesCombn)), function(i){
    x = genesCombn[,i]
    mm = mutMat[,x, drop = FALSE]
    genesTSB = names(which(rowSums(mm) == geneSetSize))
    if(length(genesTSB) >= minSamples){
      if(verbose){
        cat("Geneset: ", paste0(x, collapse = ","), "[N=", length(genesTSB),"]\n")
      }
      surv.dat = run_surv(cd = clinicalData, tsbs = genesTSB, doplot = plot)
    }else{
      surv.dat = NULL
    }
    surv.dat
  })
  names(res) = apply(genesCombn, 2, paste, collapse = '_')
  res = data.table::rbindlist(l = res, idcol = "Gene_combination")
  res = res[order(P_value, decreasing = FALSE)]
  res
}

run_surv = function(cd, tsbs, doplot = FALSE){
  groupNames = c("Mutant", "WT")
  col = c('maroon', 'royalblue')
  cd$Group = ifelse(test = cd$Tumor_Sample_Barcode %in% tsbs, yes = groupNames[1], no = groupNames[2])

  if(nrow(cd[,.N,Group]) == 1){
    #Only group - surv.diff wont work
    return(NULL)
  }

  surv.km = survival::survfit(formula = survival::Surv(time = Time, event = Status) ~ Group, data = cd, conf.type = "log-log")
  res = summary(surv.km)

  surv.diff = survival::survdiff(formula = survival::Surv(time = Time, event = Status) ~ Group, data = cd)
  surv.diff.pval = signif(1 - pchisq(surv.diff$chisq, length(surv.diff$n) - 1), digits = 3)

  surv.cox = survival::coxph(formula = survival::Surv(time = Time, event = Status) ~ Group, data = cd)
  hr = signif(1/exp(stats::coef(surv.cox)), digits = 3)

  surv.dat = data.table::data.table(Group = res$strata, Time = res$time, survProb = res$surv, survUp = res$upper, survLower = res$lower)
  surv.dat$Group = gsub(pattern = 'Group=', replacement = '', x = surv.dat$Group)

  geneSet <-  paste("Geneset: ", paste0( parent.frame()$x, collapse = " + "))

  if (doplot){
        par(mar = c(4, 4, 2, 2))
        x_lims = pretty(surv.km$time)
        y_lims = seq(0, 1, 0.20)

        plot(NA, xlim = c(min(x_lims), max(x_lims)), ylim = c(0, 1),
            frame.plot = FALSE, axes = FALSE, xlab = NA, ylab = NA)
        abline(h = y_lims, v = x_lims, lty = 2, col = grDevices::adjustcolor(col = "gray70", alpha.f = 0.5), lwd = 0.75)

        points(surv.dat[Group %in% "Mutant", Time], y = surv.dat[Group %in% "Mutant", survProb], pch = 8, col = col [1])
        points(surv.dat[Group %in% "WT", Time], y = surv.dat[Group %in% "WT", survProb], pch = 8, col = col [2])

        lines(surv.km[1], col = col[1], lty = 1, lwd = 2, conf.int=FALSE)
        lines(surv.km[2], col = col[2], lty = 1, lwd = 2, conf.int=FALSE)

        axis(side = 1, at = x_lims)
        axis(side = 2, at = y_lims, las = 2)
        mtext(text = "Survival probability", side = 2, line = 2.5)
        #mtext(text = "Time", side = 1, line = 2)

        legend(x = "topright", legend = c(paste0("Geneset [N= ", nrow(cd[Group == "Mutant"]),"]"),
                                          paste0("WT [N= ", nrow(cd[Group == "WT"]),"]")), col = col, bty = "n", lwd = 2, pch = 8)

        title(main = NA,
              sub = paste0("P-value: ", surv.diff.pval, "; ", "HR: ", hr), cex.main = 1, font.main= 4, col.main= "black",
              cex.sub = 1, font.sub = 3, col.sub = ifelse(test = surv.diff.pval < 0.05, yes = "red", no = "black"),
              line = 2.5, adj = 0)

        title(main = geneSet, adj = 0, font.main = 3, cex.main = 1)
  }

  #surv.dat = data.table::data.table(Group = res$strata, Time = res$time, survProb = res$surv, survUp = res$upper, survLower = res$lower)
  #surv.dat$Group = gsub(pattern = 'Group=', replacement = '', x = surv.dat$Group)
  surv.dat = data.table::data.table(P_value = surv.diff.pval, hr = hr, WT = nrow(cd[Group == "WT"]), Mutant = nrow(cd[Group == "Mutant"]))
}
PoisonAlien/maftools documentation built on April 7, 2024, 2:49 a.m.