R/importFilterHypermutators.R

# importFilterHypermutators.R

#' Filters out hypermutators
#'
#' \code{importFilterHypermutators} identifies the number of SNV and CNA
#' mutations for each sample and removes the samples from rSNV and rCNA
#' datasets. Default threshold for removal is 400 mutations per sample.
#'
#' @param rSNVFileIn A vector of local file names of rSNV
#' @param rCNAFileIn A vector of local file names of rCNA
#' @param dOut Directory to store output, defaults to getwd().
#' @param xS Mutation threshold. 400 by default.
#' @param silent Controls whether output to console should be suppressed. FALSE
#'   by default.
#' @param writeLog Controls whether writing the result to the global logfile is
#'   enabled. TRUE by default.
#' @param writeDetailedLog Flag for extra details about log. TRUE by default.
#'
#' @examples
#' \dontrun{
#'     importFilterHypermutators(rSNVFileIn, rCNAFileIn, dOut, xS)
#' }
#'
#' @export
importFilterHypermutators <- function(rSNVFileIn = c(),
                                      rCNAFileIn = c(),
                                      dOut = getwd(),
                                      xS = 400,
                                      silent = FALSE,
                                      writeLog = TRUE,
                                      writeDetailedLog = TRUE) {

    ## VALIDATE PARAMS ########
    # make sure that files actually exist
    for (file in rSNVFileIn) {
        r <- .checkArgs(file, like = "FILE_E", checkSize = TRUE)
        if (length(r) > 0) {
            stop(r)
        }
    }

    for (file in rCNAFileIn) {
        r <- .checkArgs(file, like = "FILE_E", checkSize = TRUE)
        if (length(r) > 0) {
            stop(r)
        }
    }

    # make sure that output directory is valid
    r <- .checkArgs(dOut, like = "DIR", checkSize = TRUE)
    if(length(r) > 0) {
        stop(r)
    }

    r <- c(r, .checkArgs(xS, like = 1, checkSize = TRUE))
    if (length(r) > 0) {
        stop(r)
    }

    # set up hashes
    hashTable <- new.env(hash = TRUE)
    totalSamples <- 0

    ## COUNT MUTATIONS ########
    # for each file in rCNAFileIn
    for (file in rCNAFileIn) {
        # readRDS, count gene CNAs and add to hash for sample @ CNA and nMUT
        rCNA <- readRDS(file)
        for (sample in colnames(rCNA)[4:length(colnames(rCNA))]) {
            totalSamples <- totalSamples + 1
            # if key is in hash, increment CNA and nMUT for every gene if there is an abberation
            for (copyNumberValue in rCNA[[sample]]) {
                copyNumberValue <- as.double(copyNumberValue)
                if (copyNumberValue != 0) {
                    if (!is.null(hashTable[[sample]])) {
                        prevCNACount <- hashTable[[sample]]$CNA
                        prevTotalCount <- hashTable[[sample]]$total
                        assign(sample, list(CNA = prevCNACount + 1, SNV = 0, total = prevTotalCount + 1), envir = hashTable)
                    } else {
                        assign(sample, list(CNA = 1, SNV = 0, total = 1), envir = hashTable)
                    }
                }
            }
        }
    }

    # for each file in rSNVFileIn
    for (file in rSNVFileIn) {
        # readRDS, increment counter for SNV and nMUT
        rSNV <- readRDS(file)
        for (i in 1:length(rSNV$Tumor_Sample_Barcode)) {
            # need to substitute dashes with period for consistency in files
            sample <-  gsub("-", ".", rSNV$Tumor_Sample_Barcode[i])
            totalSamples <- totalSamples + 1

            # Present a progress bar.
            if (!silent) {
                .pBar(i, length(rSNV$Tumor_Sample_Barcode))
            }

            # if key is in hash, increment SNV and nMUT
            if (!is.null(hashTable[[sample]])) {
                prevCNACount <- hashTable[[sample]]$CNA
                prevSNVCount <- hashTable[[sample]]$SNV
                prevTotalCount <- hashTable[[sample]]$total
                assign(sample, list(CNA = prevCNACount, SNV = prevSNVCount + 1, total = prevTotalCount + 1), envir = hashTable)
            } else {
                assign(sample, list(CNA = 0, SNV = 1, total = 1), envir = hashTable)
            }
        }
    }

    ## ASSESS MUTATIONS AND LOG STATISTICS #######

    # log global statistics:
    numSamplesBothSNVAndCNA <- 0
    numSamplesOnlyCNA <- 0
    numSamplesOnlySNV <- 0
    numSamplesNoChange <- 0
    numSamplesExceedThresh <- 0
    numRemovedSamples <- 0
    removedSamples <- c()

    for (sample in ls(hashTable)) {
        # num samples with only CNA
        if (hashTable[[sample]]$SNV == 0 && hashTable[[sample]]$CNA > 0) {
            numSamplesOnlyCNA <- numSamplesOnlyCNA + 1
        }
        # num samples with only SNV
        if (hashTable[[sample]]$CNA == 0 && hashTable[[sample]]$SNV > 0) {
            numSamplesOnlySNV <- numSamplesOnlySNV + 1
        }
        # num samples with both SNV and CNA
        if (hashTable[[sample]]$CNA > 0 && hashTable[[sample]]$SNV > 0) {
            numSamplesBothSNVAndCNA <- numSamplesBothSNVAndCNA + 1
        }
        # num samples that exceeded threshold and need to be removed
        if (hashTable[[sample]]$total > xS) {
            numRemovedSamples <- numRemovedSamples + 1
            removedSamples <- c(removedSamples, sample)
        }
        # num samples with no change
        if (hashTable[[sample]]$total <= xS) {
            numSamplesNoChange <- numSamplesNoChange + 1
        }
    }

    ## PROCESS FILES AND UPDATE METADATA ##########

    # for each rCNAFileIn
    for (file in rCNAFileIn) {
        rCNA <- readRDS(file)

        # remove sample if sample in `removedSamples`
        newRCNA <- rCNA[, !(names(rCNA) %in% removedSamples)]

        # update metadata
        getUUID("newRCNA", overwrite = TRUE)
        # ToDo: record updated UUID in log-file
        # ToDo: test that UUID is _actually_ being updated.


        # save new CNAFile with "filtered" prepended to filename
        saveRDS(newRCNA, file = paste(dOut, "/filteredHypermutators_", basename(file), sep = ""))
    }

    # for each rSNVFileIn
    for (file in rSNVFileIn) {
        rSNV <- readRDS(file)

        # for each sample (need to gsub("-", ".", sample)), if sample in `removedSamples` remove it
        newRSNV <- rSNV[, !(gsub("-", ".", rSNV$Tumor_Sample_Barcode) %in% removedSamples)]

        # update metadata
        getUUID("newRSNV", overwrite = TRUE)
        # ToDo: record updated UUID in log-file
        # ToDo: test that UUID is _actually_ being updated.

        # save new SNVFile
        saveRDS(newRSNV, file = paste(dOut, "/filteredHypermutators_", basename(file), sep = ""))
    }

    ## LOGGING ###############

    if(writeLog) {

        logTitle <- "importFilterHypermutators"

        # Compile function call record
        logCall <- character()
        logCall[1] <- "importFilterHypermutators("
        logCall[2] <- sprintf("rSNVFileIn = (%s)", paste(rSNVFileIn, collapse = ", "))
        logCall[3] <- sprintf("rCNAFileIn = (%s)", paste(rCNAFileIn, collapse = ", "))
        logCall[4] <- sprintf("dOut = \"%s\", ", dOut)
        logCall[5] <- sprintf("xS = \"%s\", ", as.character(xS))
        logCall[6] <- sprintf("silent = %s, ", as.character(silent))
        logCall[7] <- sprintf("writeLog = %s)", as.character(writeLog))
        logCall <- paste0(logCall, collapse = "")

        # Record progress information
        logNotes <- character()
        logNotes <- c(logNotes, sprintf("Removed %s of %s samples", numRemovedSamples, totalSamples))

        if (writeDetailedLog) {
            logNotes <- c(logNotes, sprintf("%s of samples had both SNV and CNA", numSamplesBothSNVAndCNA/totalSamples))
            logNotes <- c(logNotes, sprintf("%s of samples had only CNA", numSamplesOnlyCNA/totalSamples))
            logNotes <- c(logNotes, sprintf("%s of samples had only SNV", numSamplesOnlySNV/totalSamples))
            logNotes <- c(logNotes, sprintf("%s of samples had no change", numSamplesNoChange/totalSamples))
            logNotes <- c(logNotes, sprintf("%s of samples exceeded threshold of %s", numSamplesExceedThresh/totalSamples, xS))

            # Accumulate all removed samples
            for (sample in removedSamples) {
                logNotes <- c(logNotes, sprintf("%s was removed", sample))
            }
        }

        # # send info to log file
        logEvent(eventTitle = logTitle,
                 eventCall = logCall,
                 notes = logNotes)
    }

}

# [END]
hyginn/rete documentation built on May 17, 2019, 9:16 p.m.