R/EMu_prepare.R

Defines functions readfast readmut mut_count mut_process35

Documented in mut_count mut_process35 readfast readmut

#' @useDynLib convSig
#' @importFrom Rcpp sourceCpp
NULL

#' Read in the reference genome fasta file
#'
#' @section Details:
#' This is an auxiliary function.
#'
#' @importFrom data.table fread data.table as.data.table
readfast <- function(datapath) {
  tryCatch(
    {
      #if (!file.exists(datapath)) {
        #stop(datapath, ": File doesn't exist.")
      #}
      if ("character" %in% class(datapath)) {
        x <- fread(datapath, header = FALSE)
      } else if ("data.frame" %in% class(datapath)) {
        x <- as.data.table(datapath)
        #data.loaded = TRUE
      }
    },
    error=function(cnd) {
      message(paste("There seems to be a problem with the filepath you supplied."))
      message("Here's the original error message:")
      message(cnd)
      stop("Reading in assembly file input aborted.")
    }
  )
  invisible(x)

  # Handle factors if we are dealing with data in env.
}

#' Read in the mutation input file
#'
#' @section Details:
#' This is an auxiliary function.
#'
#' @importFrom data.table fread data.table as.data.table
readmut <- function(datapath) {
  tryCatch(
    {
      if ("character" %in% class(datapath)) {
        x <- fread(datapath)
      } else {
        x <- as.data.table(datapath)
        #data.loaded = TRUE
        x <- x[, icgc_sample_id := as.character(icgc_sample_id)]
        x <- x[, chromosome := as.numeric(as.character(chromosome))]
        x <- x[, chromosome_start := as.numeric(as.character(chromosome_start))]
        x <- x[, mutated_from_allele := as.character(mutated_from_allele)]
        x <- x[, mutated_to_allele := as.character(mutated_to_allele)]
      }
    },
    error=function(cnd) {
      message(paste("There seems to be a problem with the filepath you supplied."))
      message("Here's the original error message:")
      message(cnd)
      stop("Reading in mutation file input aborted.")
    }
  )
  invisible(x)
}

#' Computes the frequency of each possible trinucleotide or 5-nucleotide mutation signature
#'
#' @param reference The path leading to your assembly file (.fa or .fa.gz).
#'
#' @param mut_file The path leading to your mutation input file (tsv or csv only). Alternatively,
#' this is your mutation data if you supplied either a \code{data.frame} or a \code{matrix}.
#' Please make sure that your input data is not malformed; It should contain
#' the following columns: icgc_sample_id (i.e. the ICGC sample ID), chromosome
#' (i.e. the Chromosome ID), chromosome_start (i.e. the chromosome start position),
#' mutated_from_allele (i.e. the reference allele), and mutated_to_allele
#' (i.e. alternate allele).
#'
#' @param five A boolean variable. A value of \code{TRUE} will lead to the function
#' scanning the input files for 5 bases mutation signatures as opposed to 3 bases
#' signatures. a value of \code{FALSE} causes the function to scan for 3 bases signatures.
#' 
#' @param slice A boolean variable. A value of \code{TRUE} will slice the assembly
#' provided and only keep the chromosomes that appear in your mutation input file.
#'
#' @return A background mutation signatures vector (\code{wt}), which provides
#' the frequency of each possible signature given an assembly file. A matrix (\code{mut_mat})
#' containing the mutational rate of each signature for each sample in your supplied mutation
#' input file.
#'
#' @examples
#' assembly <- "Homo_sapiens.GRCh37.dna.primary_assembly.fa"
#' mut_file <- "mutation_file_input.tsv"
#'
#' mut_sign <- mut_count(assembly, mut_file)
#' 
#' @importFrom data.table like
#' 
#' @export
mut_count <- function(reference, mut_file, five = FALSE, slice = FALSE) {
  # Make sure mutation input is a data.table

  cat("Loading the assembly\n")
  # Call readfast
  reference_gen <- readfast(reference)

  cat("Loading the mutation input file\n")
  # Read in the mutation input file
  datapath <- readmut(mut_file)

  if (slice == TRUE) {

    cat("Counting the number of chromosomes\n")
    uniq_chrom <- unique(datapath$chromosome)
    max_chrom <- max(uniq_chrom)
    min_chrom <- min(uniq_chrom)
    if (min_chrom > 1) {
      low_cut <-
        reference_gen[like(V1, paste0(">", min_chrom, " ")), which = T]
    } else {
      low_cut <- 2
    }
    high_cut <-
      reference_gen[like(V1, paste0(">", max_chrom + 1, " ")), which = T]
    reference_gen <- reference_gen[(low_cut - 1):(high_cut - 1),]

  }

  cat("Miscellaneous processing...\n")
  nb_uniq = length(unique(as.character(datapath$icgc_sample_id)))
  # This is the row names of init_mut_mat
  uniq_sample <- unique(as.character(datapath$icgc_sample_id))

  if (five == FALSE) {
    init_mut_mat <- matrix(0L, nrow = nb_uniq, ncol = 96)
    init_wt <- rep(0, 96)
  } else if (five == TRUE) {
    init_mut_mat <- matrix(0L, nrow = nb_uniq, ncol = 1536)
    init_wt <- rep(0, 1536)
  }

  # Call mut_process35 here and store result in variable called treated_mut
  cat("Processing and encoding the mutation input file\n")
  # Order of this file's column is super important <= Here
  treated_mut <- mut_process35(datapath)

  cat("Counting the frequency of mutation fragment types. This could take a few minutes...\n")
  shallowres <- new("Shallowres", mut_mat = init_mut_mat, wt = init_wt)

  if (five == FALSE) {
    shallowres = shallow_loop3(shallowres, reference_gen, treated_mut, uniq_sample)
  } else if (five == TRUE) {
    shallowres = shallow_loop5(shallowres, reference_gen, treated_mut, uniq_sample)
  }

  invisible(shallowres)
}

#' A function that treats the mutation input file and reorders its columns
#'
#' @section Details:
#' This is an auxiliary function.
#'
#' @importFrom data.table data.table as.data.table
#' @importFrom purrr map
mut_process35 <- function(datapath) {

  tryCatch(
    {
      datapath <- datapath[!chromosome == "X"]
      datapath <- datapath[!chromosome == "Y"]
    },
    error=function(cnd) {
      message(paste("Input column 'chromosome' for chromosome ID doesn't seem
                    to exist."))
      message("Here's the original error message:")
      message(cnd)
      stop("###### aborted.")
    }
  )

  tryCatch(
    {
      from_allele <- map(datapath[, mutated_from_allele], Base2Num)
    },
    error=function(cnd) {
      message(paste("Input column 'mutated_from_allele' doesn't seem to exist."))
      message("Here's the original error message:")
      message(cnd)
      stop("####### aborted.")
    }
  )
  tryCatch(
    {
      if ("numeric" %in% class(datapath[, mutated_to_allele])) {
        stop("Either your input mutation file has already been processed
             or the values in your 'mutated_to_allele' column are not letter
             bases (i.e. wrong format)")
      } else {
        to_allele <- map(datapath[, mutated_to_allele], Base2Num)
      }
    },
    error=function(cnd) {
      message(paste("There seems to be a problem with your
                    'mutated_to_allele' input column."))
      message("Here's the original error message:")
      message(cnd)
      stop("######## aborted.")
    }
  )

  datapath <- datapath[, mutated_to_allele := as.numeric(to_allele)]

  datapath <- datapath[from_allele < 4 &
                        mutated_to_allele < 4, .(icgc_sample_id, chromosome,
                                                  chromosome_start,
                                                  mutated_from_allele,
                                                  mutated_to_allele)]

  both_alleles <- data.table(from_allele = from_allele, to_allele = to_allele)
  both_alleles <- both_alleles[from_allele < 4 & to_allele < 4, .(from_allele,
                                                                  to_allele)]


  fromto_allele <- alt_reverse(datapath, as.numeric(both_alleles[, from_allele]))
  datapath <- datapath[, mutated_to_allele := as.numeric(fromto_allele$to_allele)]

  invisible(datapath)
}

#' @section Details:
#' This is an auxiliary function.
#'
#' @importFrom data.table data.table as.data.table
alt_reverse <- function(datapath, from_allele) {
  alleles = data.table(from_allele = from_allele, to_allele =
                         datapath[,mutated_to_allele])
  tryCatch(
    {
      alleles <- reverse_transform(alleles)
    },
    error=function(cnd) {
      message(paste("Input column 'mutated_from_allele' doesn't seem to exist."))
      message("Here's the original error message:")
      message(cnd)
      stop("###### aborted.")
    }
  )

  invisible(alleles)
}
antoine186/convSig documentation built on Jan. 17, 2020, 4:09 a.m.