R/RunSignatureAnalyzerAttribution.R

Defines functions RunSignatureAnalyzerAttribution

Documented in RunSignatureAnalyzerAttribution

#' Run SignatureAnalyzer attribution on a catalog file and output by RunSignatureAnalyzerOnFile().
#'
#' Normally, please call \code{\link{SignatureAnalyzerOneRun}}
#' instead of this function.
#'
#' @param input.catalog File containing input catalog.  Columns are
#' samples (tumors), rows are signatures.  SignatureAnalyzer does
#' not care about the row names (I think) TODO(Steve): check this.
#'
#' @param read.catalog.function Function taking a file path as
#' its only argument and returning a catalog as a numeric matrix.
#'
#' @param extracted.signature.file A .csv file containing extracted
#' signatures. Normally, this file is named \code{"sa.output.sigs.csv"}
#' and is generated by function \code{RunSignatureAnalyzerOnFile()}.
#' It expects to have the same format as the \code{input.catalog},
#' thus it will be read by \code{read.catalog.function} too.
#'
#' @param raw.exposures.file A .csv file containing raw attributions
#' of exposures. Normally, this file is named \code{"sa.output.raw.exp.csv"}
#' and is generated by function \code{RunSignatureAnalyzerOnFile()}.
#'
#' @param out.dir Directory that will be created for the output;
#' abort if it already exits.  Log files will be in
#' \code{paste0(out.dir, "/tmp")}.
#'
#' @param write.signature.function Function with first argument the
#' signatures generated by SignatureAnalyzer and second argument
#' the file to write to.
#'
#' @param input.exposures A file with the synthetic exposures used to generate
#' \code{input.catalog}; if provided here,
#' this is copied over to the output directory
#' for downstream analysis.
#'
#' @param test.only If TRUE, only analyze the first 10 columns
#' read in from \code{input.catalog}.
#'
#' @param delete.tmp.files If TRUE delete the many temporary
#'  files generated by SignatureAnalyzer.
#'
#' @param verbose If \code{TRUE} cat a message regarding progress.
#'
#' @param overwrite If TRUE, overwrite existing output.
#'
#' @return The final attribution matrix.
#' (i.e. \code{exp.fine.tuned})
#'
#' @details Save the final attribution of a catalog matrix into
#' a file named \code{"sa.output.fine.exp.csv"} under the
#' folder \code{out.dir}.
#'
#' @export
#'
#' @importFrom utils capture.output

RunSignatureAnalyzerAttribution <-
  function(input.catalog,
           read.catalog.function,
           extracted.signature.file,
           raw.exposures.file,
           write.signature.function,
           out.dir,
           test.only = FALSE,
           input.exposures = NULL,
           delete.tmp.files = TRUE,
           overwrite = FALSE,
           verbose = FALSE) {
    # TEMPORARY is a global required by SignatureAnalyzer
    assign("TEMPORARY",paste0(out.dir, "/tmp/"),envir = envSA)
    if (dir.exists(envSA$TEMPORARY)) {
      if (!overwrite) stop("Directory ", envSA$TEMPORARY, " already exists")
    } else {
      dir.create(envSA$TEMPORARY)
    }

    # Load files required by fine-attribution step.

    # Load catalog matrix file
    # same as the file used in RunSignatureAnalyzerOnFile()
    syn.data <- read.catalog.function(input.catalog,
                                      strict = FALSE)

    if (test.only) syn.data <- syn.data[ , 1:10]

    if (dir.exists(out.dir)) {
      if (!overwrite) stop(out.dir, " already exits")
    } else {
      dir.create(out.dir)
    }

    # Load extracted signatures.
    sigs <- read.catalog.function(extracted.signature.file,
                                  strict = FALSE)
    # If signatures are un-normalized, Normalize them
    sigs <- apply(sigs, 2, function(x) x/sum(x))

    # Load RAW-inferred exposures required by fine-attribution step.
    exp.raw <- SynSigGen::ReadExposure(raw.exposures.file)
    if(FALSE){
    exp.raw <- exp.raw[sigs.to.use, , drop = FALSE]
    rownames(exp.raw) <- new.names
    }

    # 2 more steps of fine-tuned attribution.
    # INPUT: extracted signatures (sigs),
    # input catalog matrix (syn.data),
    # and RAW-inferred exposures (exp.raw)
    W0 <- sigs # Extracted signatures
    H0 <- exp.raw # initially inferred exposures

    V0 <- syn.data     # Catalog matrix for all tumors
    K0 <- ncol(W0)     # Number of signatures

    # Z0 is a signature allowance matrix for the whole dataset.
    # rows refer to signatures, columns refer to tumor samples.
    # If you specify an entry as 0,
    # then you prohibit a signature to be active in this tumor sample.
    # Here, we simply select Z0 as all-1 matrix
    # and therefore any signature is allowed to be present in any tumor.
    Z0 <- array(1,dim=c(K0,ncol(V0))) # signature indicator matrix Z (0 = not allowed, 1 = allowed); all elements initialized by one.
    colnames(Z0) <- colnames(H0)
    rownames(Z0) <- rownames(H0)

    # The original SignatureAnalyzer code attributes tumors of different tumor types separately
    # TODO(Wuyang): Add separation of tumor types later.
    ttype <- rep("SBS1.SBS5.correlated",ncol(V0))
    ttype.unique <- unique(ttype)
    n.ttype.unique <- length(ttype.unique)
    a0 <- 10 ### default parameter
    phi <- 1.5 ### default parameter
    for (i in 1:n.ttype.unique) {
      #### First step: determine a set of optimal signatures best explaining the observed mutations in each tumor type ("cohort").
      #### The automatic relevance determination technique was applied to the H matrix only, while keeping signatures (W0) frozen.
      cohort <- ttype.unique[i] ## each cohort is composed of tumor catalogs with the SAME tumor type.
      W1 <- W0
      V1 <- as.matrix(V0[,ttype==cohort])
      Z1 <- Z0[,match(colnames(V1),colnames(Z0),nomatch=0)] ## If a signature is not active in a tumor type in initial attribution,
      ## it is no more allowed in the fine-tuned attribution step.
      H1 <- Z1*H0[,match(colnames(V1),colnames(Z0),nomatch=0)]
      lambda <- rep(1+(sqrt((a0-1)*(a0-2)*mean(V1,na.rm=T)/K0))/(nrow(V1) + ncol(V1) + a0 + 1)*2,K0)
      res0 <- envSA$BayesNMF.L1.KL.fixed_W.Z(as.matrix(V1),as.matrix(W1),as.matrix(H1),as.matrix(Z1),lambda,2000000,a0,1.e-07,1/phi)
      H2 <- res0[[2]]  ## Attribution from second step
      colnames(H2) <- colnames(V1) ## The colnames of H2 (exposure attribution matrix) are the names of tumor samples,
      ## it should be the same as the colnames of V1 (catalog matrix for tumors whose tumor type is cohort=ttype.unique[i])
      rownames(H2) <- colnames(W0) ## The rownames of H2 (exposure attribution matrix) are the names of signatures,
      ## it should be the same as the colnames of W0 (signature matrix extracted in the previous section)

      #### Second step: determine a sample-level exposure attribution using selected sigantures in the first step.
      index.H2 <- rowSums(H2)>1 ### identify only active signatures in the cohort
      Z2 <- Z1
      Z2[!index.H2,] <- 0 ### only selected signatures in the first step are allowed + the original contraints on the signature availability from Z1.
      for (j in 1:ncol(H2)) {
        tmpH <- rep(0,ncol(W0))
        if (sum(V1[,j])>=5) {
          lambda <- 1+(sqrt((a0-1)*(a0-2)*mean(V1,na.rm=T)/K0))/(nrow(V1) + ncol(V1) + a0 + 1)*2
          res <- envSA$BayesNMF.L1.KL.fixed_W.Z.sample(as.matrix(V1[,j]),W0,as.matrix(H2[,j]),as.matrix(Z2[,j]),lambda,1000000,a0,1.e-07,1) ## Precise attribution
          tmpH <- res[[2]]
        }
        if (j==1) {
          H3 <- tmpH
        } else {
          H3 <- cbind(H3,tmpH)
          if (verbose) cat(j,'\n')
        }
      }
      colnames(H3) <- colnames(V1)
      rownames(H3) <- colnames(W0)
      if (i==1) {
        H2.all <- H2
        H3.all <- H3
      } else {
        H2.all <- cbind(H2.all,H2)  ## Attribution result from step 2
        H3.all <- cbind(H3.all,H3)  ## Attribution result from step 3
      }
    }
    exp.fine.tuned <- H3.all ### fine-tuned attributions of signature exposures


    # Output fine-tuned attributions
    SynSigGen::WriteExposure(
      exp.fine.tuned,
      file = paste0(out.dir, "/sa.output.fine.exp.csv"))

    # Copy the input exposure into "out.dir",
    # if filepath "input.exposures" is provided.
    if (!is.null(input.exposures)) {
      SynSigGen::WriteExposure(
        SynSigGen::ReadExposure(input.exposures),
        file = paste0(out.dir, "/input.syn.exp.csv"))
    }

    # Delete temporary files if flag delete.tmp.files == TRUE
    if (delete.tmp.files) unlink(TEMPORARY, recursive = TRUE)

    invisible(exp.fine.tuned)
  }
WuyangFF95/SynSigRun documentation built on Oct. 7, 2022, 1:16 p.m.