data-raw/old.code.and.notes/ExtendIterationAndPosterior.R

#' Run hdp extraction and attribution on a spectra catalog file
#' This repeats what Nicola do in her thesis. It starts four independent initial chains with post.burnin iterations,
#' then pick up from the end of each initial chain,
#' started another num.posterior MCMC chains for another post.burnin iterations
#' and then collected post.n posterior samples at intervals of post.space iterations.
#' In total, this collects 4 times num.posterior times post.n posterior samples from 4 times y separate chains.
#'
#'
#' @param input.catalog Input spectra catalog as a matrix or
#' in \code{\link[ICAMS]{ICAMS}} format.
#'
#' @param CPU.cores Number of CPUs to use in running
#'    \code{\link[hdpx]{hdp_posterior}}; this is used to parallelize
#'    running the posterior sampling chains, so there is no
#'    point in making this larger than \code{num.posterior}.
#'
#' @param seedNumber An integer that is used to generate separate
#'   random seeds for each call to \code{\link[hdpx]{dp_activate}},
#'   and each call of \code{\link[hdpx]{hdp_posterior}}; please see the code
#'   on how this is done. But repeated calls with same value of
#'   \code{seedNumber} and other inputs should produce the same results.
#'
#' @param K.guess Suggested initial value of the number of
#' signatures, passed to \code{\link[hdpx]{dp_activate}} as
#' \code{initcc}.
#'
#' @param multi.types A logical scalar or
#' a character vector.
#' If \code{FALSE}, hdp will regard all input spectra as one tumor type.
#'
#' If \code{TRUE}, hdp will infer tumor types based on the string before "::" in their names.
#' e.g. tumor type for "SA.Syn.Ovary-AdenoCA::S.500" would be "SA.Syn.Ovary-AdenoCA"
#'
#' If \code{multi.types} is a character vector, then it should be of the same length
#' as the number of columns in \code{input.catalog}, and each value is the
#' name of the tumor type of the corresponding column in \code{input.catalog},
#' e.g. \code{c("SA.Syn.Ovary-AdenoCA", "SA.Syn.Ovary-AdenoCA", "SA.Syn.Kidney-RCC")}.
#'
#' @param verbose If \code{TRUE} then \code{message} progress information.
#'
#' @param num.posterior Number of posterior sampling chains; can set to
#'   1 for testing.
#'
#' @param post.burnin Pass to \code{\link[hdpx]{hdp_posterior}}
#'      \code{burnin}.
#'
#' @param post.n Pass to \code{\link[hdpx]{hdp_posterior}}
#'      \code{n}.
#'
#' @param post.space Pass to \code{\link[hdpx]{hdp_posterior}}
#'      \code{space}.
#'
#' @param post.cpiter Pass to \code{\link[hdpx]{hdp_posterior}}
#'      \code{cpiter}.
#'
#' @param post.verbosity Pass to \code{\link[hdpx]{hdp_posterior}}
#'      \code{verbosity}.
#'
#' @param cos.merge The cosine similarity threshold for merging raw clusters
#'      from the posterior sampling chains into "components" i.e. signatures;
#'      passed to \code{\link[hdpx]{hdp_extract_components}}.
#'
#' @param min.sample A "component" (i.e. signature) must have at least
#'      this many samples; passed to \code{\link[hdpx]{hdp_extract_components}}.
#'
#' @param checkpoint.aft.post If non-\code{NULL}, a file path to checkpoint
#'      the list of values returned from the calls to \code{\link[hdpx]{hdp_posterior}}
#'      as a .Rdata file.
#'
#' @return A list with the following elements:\describe{
#' \item{signature}{The extracted signature profiles as a matrix;
#'             rows are mutation types, columns are
#'             samples (e.g. tumors).}
#' \item{exposure}{The inferred exposures as a matrix of mutation counts;
#'            rows are signatures, columns are samples (e.g. tumors).}
#' \item{exposure.p}{\code{exposure} converted to proportions.}
#'
#' \item{multi.chains}{A \code{\link[hdpx]{hdpSampleMulti-class}} object.
#'     This object has the method \code{\link[hdpx]{chains}} which returns
#'     a list of \code{\link[hdpx]{hdpSampleChain-class}} objects. Each of these
#'     sample chains objects has a method \code{\link[hdpx]{final_hdpState}}
#'     (actually the methods seems to be just \code{hdp})
#'     that returns the \code{hdpState} from which it was generated.}
#' }
#'
#' @export
#'
#'

ExtendIterationAndPosterior <-
  function(input.catalog,
           CPU.cores           = 1,
           seedNumber          = 1,
           K.guess,
           multi.types         = FALSE,
           verbose             = TRUE,
           num.posterior       = 4,
           post.burnin         = 4000,
           post.n              = 50,
           post.space          = 50,
           post.cpiter         = 3,
           post.verbosity      = 0,
           chlist.path         = NULL,
           gamma.alpha         = 1,
           gamma.beta          = 1

  ) { # 13 arguments


    hdp.state <- SetupAndActivate(input.catalog = input.catalog,
                                  seedNumber    = seedNumber,
                                  K.guess       = K.guess,
                                  multi.types   = multi.types,
                                  verbose       = verbose,
                                  gamma.alpha   = gamma.alpha,
                                  gamma.beta    = gamma.beta)

    hdplist <- hdpx::as.list(hdp.state)
    iterate <- utils::getFromNamespace(x = "iterate", ns = "hdpx")
    output <- iterate(hdplist, post.burnin, post.cpiter, post.verbosity)##burn-in first, then return the hdplist after burnt in.
    hdplist <- output[[1]]
    as.hdpState <- utils::getFromNamespace(x = "as.hdpState", ns = "hdpx")
    hdp.state.burned <- as.hdpState(hdplist)

    # Run num.posterior independent sampling chains for burned-in hdp
    hdp_posterior_sample <- function(my.seed) {

      if (verbose) message("calling hdp_posterior ", Sys.time())
      print(my.seed)
      sample.chain <- hdpx::hdp_posterior(
        hdp       = hdp.state.burned,
        verbosity = post.verbosity,
        burnin    = post.burnin,
        n         = post.n,
        space     = post.space,
        cpiter    = post.cpiter,
        seed      = my.seed)

      return(sample.chain)
    }


    parallel.time <- system.time(
      chlist <- c(chlist,parallel::mclapply(
        # Must choose a different seed for each of the chains
        X = (seed + 1:num.posterior * 10^6) ,
        FUN = hdp_posterior_sample,
        mc.cores = CPU.cores))

    )

    if (verbose) {
      message("compute chlist time: ")
      for (xn in names(parallel.time)) {
        message(" ", xn, " ", parallel.time[[xn]])
      }
    }

    # Generate the original multi_chain for the sample
    if (verbose) message("calling hdp_multi_chain ", Sys.time())
    # If a child dies the corresponding element of chlist has
    # class try-error.
    #
    # We filter these and generate a warning. This is a bit
    # tricky and I am not sure I have anticipated all possible
    # returns, so I do this in a loop.

    clean.chlist <- CleanChlist(chlist)

    if (length(clean.chlist) == 0) {
      fname <- "chlist.from.aborted.run.of.RunhdpInternal4.Rdata"
      save(clean.chlist, file = fname)
      stop("No usable result in chlist, look in ", fname)
    }else{
      save(clean.chlist, file = chlist.path)
    }


  }
steverozen/mSigHdp documentation built on Feb. 6, 2023, 1:36 a.m.