R/load.R

Defines functions correct_coding predict_coding annotate_isomut format_indels remove_features read_isomut

Documented in annotate_isomut correct_coding format_indels predict_coding read_isomut remove_features

#' Read and preprocess an isomut file
#'
#' `read_isomut` reads an isomut file, adds a `read` column, optionally
#' filters according to given criteria and adds extra columns.
#'
#' @param file the name of the file which the data are to be read from.
#' @param minReads integer indicating the minimum number of reads
#' (default = 0L).
#' @param minCoverage double indicating the minimum coverage
#' (default = 0).
#' @param minMutFreq double indicating the minimum mutation frequency
#' (default = 0).
#' @param minScore double indicating the minimum isomut score
#' (default = 0).
#' @param minCleanliness double indicating the minimum isomut cleanliness
#' (default = 0).
#' @param removePatterns character vector providing patterns in
#' sample_names that are to be removed.
#' @param extraColumns named list to generate extra columns. The list names are
#' the names of the new columns. The list elements can either be a vector of
#' length one (each row gets the same value for this column), a named vector
#' with names being a subset of the `sample_name` column in the original isomut
#' file, or a function that accepts those names as arguments.
#' @param removeFeatures whether certain genomic features should be removed.
#' @param featureData `GRanges` that contains features to be removed.
#' @param removeMito whether variants in mitochondrial DNA should be removed.
#' @param asDataFrame whether to return a `data.frame`. If `FALSE`, a
#' `data.table` is returned.
#'
#' @return a `data.frame` or `data.table` containing the filtered and preprocessed data.
#'
#' @importFrom data.table fread setnames setorder
#' @export
read_isomut <- function(file, minReads = 0L, minCoverage = 0, minMutFreq = 0,
                        minScore = 0, minCleanliness = 0, removePatterns = NULL,
                        extraColumns = NULL, removeFeatures = TRUE,
                        featureData = SGD_features, removeMito = TRUE,
                        asDataFrame = TRUE) {
  ## read data
  if(is.null(removePatterns)) {
    isomut <- fread(file = file, header = TRUE, stringsAsFactors = TRUE)
  } else {
    cmd <- paste("egrep -v", ' "', paste(removePatterns, collapse = "|"), '" ', file, sep = "")
    isomut <- fread(cmd = cmd, header = TRUE, stringsAsFactors = TRUE)
  }
  ## rename `sample_name` column
  names(isomut)[1] = "file_name"
  ## rename `cov` to `coverage` if necessary
  names(isomut)[names(isomut) == "cov"] <- "coverage"
  # ## remove samples that match certain patterns
  # if(!is.null(removePatterns)) {
  #   for(pattern in removePatterns) {
  #     isomut <- isomut[!grepl(pattern, file_name)]
  #   }
  # }
  ## remove variants in mito
  if(removeMito) isomut <- isomut[chr != "Mito"]
  ## remove features
  if(removeFeatures) {
    isomut <- remove_features(isomut, featureData = featureData)
  }
  ## add `read` column
  isomut[,read := round(coverage * mut_freq)]
  ## remove rows with too few reads
  if(minReads > 0) {
    isomut <- isomut[read >= minReads]
  }
  ## remove rows with low coverage
  if(minCoverage > 0) {
    isomut <- isomut[coverage >= minCoverage]
  }
  ## remove rows with low mutation frequency
  if(minMutFreq > 0) {
    isomut <- isomut[mut_freq >= minMutFreq]
  }
  ## remove rows with low isomut score
  if(minScore > 0) {
    isomut <- isomut[score >= minScore]
  }
  ## replace cleanliness 42 with NA
  isomut[cleanliness==42, cleanliness := NA]
  if(minCleanliness > 0) {
    isomut <- isomut[cleanliness >= minCleanliness]
  }
  ## add extra columns
  if(!is.null(extraColumns)) {
    stopifnot(is.list(extraColumns), !is.null(names(extraColumns)))
    for(col_name in names(extraColumns)) {
      col_value <- extraColumns[[col_name]]
      stopifnot(is.vector(col_value) || is.function(col_value))
      if(is.function(col_value)) {
        isomut[, (col_name) := factor(vapply(isomut$file_name, col_value, character(1)))]
      } else {
        if(length(col_value) == 1) {
          isomut[,(col_name) := factor(col_value)]
        } else {
          stopifnot(isomut$file_name %in% names(col_value))
          isomut[,(col_name) := factor(col_value[as.character(isomut$file_name)])]
        }
      }
    }
  }
  ## order and remove possible duplicates
  isomut <- unique(isomut)
  setorder(isomut, file_name, chr, pos)
  if(asDataFrame) isomut <- as.data.frame(isomut)
  return(isomut)
}

#' Remove features
#'
#' Remove variants from isomut data if they match given genomic features
#'
#' @param isomut a `data.frame` or `data.table` of isomut data as generated by
#' `read_isomut`.
#' @param featureData a `GRanges` object that contains the features to be removed.
#'
#' @return a `data.frame` or `data.table` containing the filtered data.
#'
#' @importFrom GenomicRanges GRanges findOverlaps
#' @importFrom IRanges IRanges
#'
#' @export
remove_features <- function(isomut, featureData = SGD_features) {
  if(!is(featureData, "GRanges")) stop("`featureData` must be GRanges")
  mut_ranges <- GRanges(isomut$chr,
                        IRanges(isomut$pos, width = nchar(as.character(isomut$ref))))
  suppressWarnings(
    hits <- findOverlaps(mut_ranges, SGD_features)
  )
  toKeep <- setdiff(seq_len(nrow(isomut)), hits@from)
  return(isomut[toKeep,])
}

#' Format indels
#'
#' Converts the sequence information about indels to a format that can be used
#' to predict the coding consequences.
#'
#' @param isomut a `data.frame` or `data.table` of isomut data as generated by
#' `read_isomut`.
#' @param ref either the name of a fasta file containing the reference genome or
#' an object of class `FaFile`.
#'
#' @return A data.frame with formatted sequence information.
#'
#' @importFrom Rsamtools FaFile
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom Biostrings getSeq
#' @importFrom data.table as.data.table
#'
#' @export
format_indels <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR")) {
  if(with(isomut, all(ref != "-") && all(mut != "-"))) {
    return(isomut)
  }
  if(!is(isomut, "data.table")) {
    retDF <- TRUE
    isomut <- as.data.table(isomut)
  } else {
    retDF <- FALSE
  }
  stopifnot(is(ref, "FaFile") || is.character(ref))
  if(is.character(ref)) ref <- FaFile(ref)
  mut_ranges <- GRanges(isomut$chr, IRanges(isomut$pos, width = 1))
  mut_seq <- getSeq(ref, mut_ranges)
  ## format insertions
  mut_seq_ins <- as.character(mut_seq[isomut$type == "INS"])
  isomut[type == "INS", `:=`(ref=mut_seq_ins, mut=paste0(mut_seq_ins, mut))]
  ## format deletions
  mut_seq_del <- as.character(mut_seq[isomut$type == "DEL"])
  isomut[type == "DEL", `:=`(ref=paste0(mut_seq_del, ref), mut=mut_seq_del)]
  if(retDF) as.data.frame(isomut)
  return(isomut)
}

#' Annotate isomut data
#'
#' `annotate_isomut` adds information about the location of variants (coding or
#' intergenic) and, for coding variants, about the affected gene(s) and the
#' position of the variant in the gene.
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut`
#' (minimally needs to contain `chr` and `pos` columns.
#' @param  annotation path to annotation file in `gtf` or `gff` format or `GRanges`
#' object containing the annotations.
#' @param txdb optionally provide `TxDb` object (should match `anno`).
#' @param noDups don't keep multiple annotations (e.g. when variant is inside
#' two genes on opposite strands). Default `FALSE`.
#' @param predictCoding predict consequences of variants in coding regions.
#' @param ... further arguments for `predict_coding`.
#'
#' @return a data.frame with additional columns regarding annotation.
#'
#' @importFrom GenomicFeatures makeTxDbFromGFF makeTxDbFromGRanges
#' @importFrom rtracklayer import
#' @importFrom VariantAnnotation locateVariants CodingVariants IntergenicVariants
#' @importFrom data.table as.data.table rbindlist
#' @export
annotate_isomut <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR"), annotation = gtf_scer,
                            txdb = NULL, noDups = FALSE, predictCoding = TRUE,
                            ...) {
  if(nrow(isomut)==0) return(isomut)
  if(!is(isomut, "data.table")) {
    retDF <- TRUE
    isomut <- as.data.table(isomut)
  } else {
    retDF <- FALSE
  }
  ## load annotation
  stopifnot(is.character(annotation) || is(annotation, "GRanges"))
  if(is.character(annotation)) {
    if(is.null(txdb)) {
      suppressMessages(
        txdb <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
      )
    }
    gr <- import(annotation)
  } else {
    if(is.null(txdb)) {
      suppressMessages(
        txdb <- suppressWarnings(makeTxDbFromGRanges(annotation))
      )
    }
    gr <- annotation
  }
  gr_genes <- gr[gr$type == "gene"]
  id2names <- setNames(gr_genes$gene_name, gr_genes$gene_id)
  ## convert isomut to GRanges
  isomut_list <- split(isomut, isomut$file_name)
  variants <- lapply(isomut_list, function(isomut_i) {
    mut_ranges_i <- GRanges(isomut_i$chr, IRanges(isomut_i$pos, width = 1))
    mut_ranges_i <- unique(mut_ranges_i)
    ## locate variants
    variants_coding_i <- suppressMessages(suppressWarnings(
      locateVariants(mut_ranges_i, txdb, CodingVariants())
      ))
    variants_intergenic_i <- suppressMessages(suppressWarnings(
      locateVariants(mut_ranges_i, txdb,
                     IntergenicVariants(upstream = 0 , downstream = 0))
      ))
    variants_i <- rbind(as.data.table(variants_coding_i), as.data.table(variants_intergenic_i))
    ## get rid of additional annotations for the same variant
    if(noDups) {
      variants_i <- variants_i[!duplicated(QUERYID)]
    }
    return(variants_i)
  })
  ## merge variants with isomut data
  variants <- rbindlist(variants, idcol = "file_name")
  variants <- variants[, c("seqnames", "start", "strand", "LOCATION", "LOCSTART", "GENEID", "file_name")]
  names(variants) <- c("chr", "pos", "strand", "location", "nt_start", "gene_id", "file_name")
  variants[, gene_name := id2names[gene_id]]
  ## get rid of previous annotation if present
  cols <- c("chr", "pos", "file_name", setdiff(names(isomut), names(variants)))
  isomut <- unique(isomut[, cols, with = FALSE])
  isomut <- merge(isomut, as.data.table(variants), by = c("chr", "pos", "file_name"), all.x = TRUE)
  isomut[location=="intergenic", `:=`(gene_id = "intergenic")]
  isomut[is.na(gene_name), gene_name:=gene_id]
  setorder(isomut, file_name, chr, pos)
  if(predictCoding) isomut <- predict_coding(isomut, ref = ref,
                                             annotation = annotation, ...)
  if(retDF) isomut <- as.data.frame(isomut)
  return(isomut)
}

#' Predict coding from variants
#'
#' predicts the consequences of variants in coding regions.
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut`.
#' @param ref either the name of a fasta file containing the reference genome or
#' an object of class `FaFile`.
#' @param annotation path to annotation file in `gtf` or `gff` format or `GRanges`
#' object containing the annotations.
#' @param txdb optionally provide `TxDb` object (should match `annotation`).
#' @param format_indels whether indels should be formatted (see `format_indels`
#' for details).
#' @param correct_coding whether coding be corrected if two variants hit the same
#' codon.
#' @param ... further arguments for `correct_coding`.
#'
#' @return a data.frame with additional columns regarding coding consequences.
#'
#' @importFrom Biostrings DNAStringSet translate reverseComplement
#' @importFrom VariantAnnotation predictCoding
#' @importFrom magrittr `%>%`
#' @importFrom dplyr mutate group_by filter n case_when
#'
#' @export
predict_coding <- function(isomut, ref = system.file("extdata", "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa", package = "isomutR"),
                           annotation = gtf_scer, txdb = NULL,
                           format_indels = TRUE, correct_coding = FALSE, ...) {
  if(!"gene_id" %in% names(isomut)) {
    stop("Please annotate data first!")
  }
  isomut_cols <- names(isomut)
  if(nrow(isomut)==0) return(isomut)
  if(!is(isomut, "data.table")) {
    retDF <- TRUE
    isomut <- as.data.table(isomut)
  } else {
    retDF <- FALSE
  }
  ## load genome
  stopifnot(is(ref, "FaFile") || is.character(ref))
  if(is.character(ref)) ref <- FaFile(ref)
  ## load annotation
  if(is.null(txdb)) {
    stopifnot(is.character(annotation) || is(annotation, "GRanges"))
    if(is.character(annotation)) {
      suppressMessages(
        txdb <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
      )
    } else {
      suppressMessages(
        txdb <- suppressWarnings(makeTxDbFromGRanges(annotation))
      )
    }
  }
  ## format indels
  if(format_indels) isomut <- format_indels(isomut, ref)
  ## generate GRanges
  mut_ranges <- with(isomut, GRanges(chr, ranges = IRanges(start = pos,
                                                           width = nchar(as.character(ref)))))
  varAllele <- DNAStringSet(isomut$mut)
  coding <- suppressWarnings(predictCoding(mut_ranges, txdb, ref, varAllele))
  if(length(coding) > 0) {
    coding <- as.data.table(coding)
    coding$AA_start <- vapply(coding$PROTEINLOC, function(l) l[[1]], numeric(1))
    coding$mut <- ifelse(coding$strand=="+", coding$varAllele, reverseComplement(DNAStringSet(coding$varAllele)))
    coding <- coding[, c("seqnames", "start", "mut", "GENEID", "CONSEQUENCE", "REFCODON",
                         "VARCODON", "REFAA", "VARAA", "AA_start")]
    names(coding) <- c("chr", "pos", "mut", "gene_id", "consequence", "ref_codon",
                       "var_codon", "ref_AA", "var_AA", "AA_start")
    coding <- unique(coding)
    ## get rid of previous coding if present
    cols <- c("chr", "pos", "gene_id", "mut", setdiff(names(isomut), names(coding)))
    isomut <- unique(isomut[, cols, with = FALSE])
    isomut <- merge(isomut, coding, by = c("chr", "pos", "mut", "gene_id"),
                      all.x = TRUE, all.y = FALSE)
    isomut <- unique(isomut)
    if(correct_coding) isomut <- correct_coding(isomut, ...)
  }
  ## rearrange columns
  isomut <- isomut[,c(isomut_cols, setdiff(names(isomut), isomut_cols)), with = FALSE]
  ## adjust return format
  if(retDF) isomut <- as.data.frame(isomut)
  return(isomut)
}


#' Correct coding
#'
#' corrects the predicted AA if multiple variants are found in the same codon
#'
#' @param isomut a data.frame of isomut data as generated by `read_isomut` and
#' annotated by `annotate_isomut`.
#' @param keep_uncorrected boolean. Adds a column for the uncorrected coding.
#' @param highlight boolean. Adds a column that indicates which variants have
#' corrected condons.
#' @return a data.frame of the same format with corrected coding.
#'
#' @importFrom Biostrings DNAStringSet translate reverseComplement
#' @importFrom VariantAnnotation predictCoding
#' @importFrom dplyr case_when
#'
#' @export
correct_coding <- function(isomut, keep_uncorrected = FALSE, highlight = FALSE) {
  if(!all(c("gene_id", "nt_start", "AA_start") %in% names(isomut))) {
    stop("Please annotate data first!")
  }
  if(!is(isomut, "data.table")) {
    retDF <- TRUE
    isomut <- as.data.table(isomut)
  } else {
    retDF <- FALSE
  }
  isomut[,var_AA_uncorrected:=var_AA]
  isomut[,var_codon_uncorrected:=var_codon]
  isomut[,consequence_uncorrected:=consequence]
  ind_coding <- which(!is.na(isomut$gene_id) & isomut$type=="SNV")
  isomut_coding <- isomut[ind_coding]
  isomut_coding[, ind:=ind_coding]
  isomut_coding[, N:=.N , by=.(file_name, gene_id, AA_start)]
  isomut_coding <- isomut_coding[N %in% 2:3]
  isomut_coding[, codon_pos:=ifelse(nt_start %% 3 %in% 1:2, nt_start %% 3, 3)]
  isomut_coding[, var_codon:=getVarCodon(
    codon_pos, mut, ref_codon, var_codon, strand, type),
    by = .(file_name, gene_id, AA_start)]
  isomut_coding[, `:=`(
    var_AA = case_when(
      !is.na(var_codon) && nt_start>3 ~
        as.character(translate(DNAStringSet(var_codon), no.init.codon = TRUE)),
      !is.na(var_codon) ~
        as.character(translate(DNAStringSet(var_codon), no.init.codon = FALSE)),
    TRUE ~ var_AA))]
  isomut_coding[, `:=`(
    consequence = case_when(
      var_AA == "*" ~ "nonsense",
      var_AA == ref_AA ~ "synonymous",
      var_AA != ref_AA ~ "nonsynonymous",
      TRUE ~ as.character(consequence)))]


  isomut[isomut_coding$ind, c("var_codon", "var_AA", "consequence")] <- isomut_coding[, c("var_codon", "var_AA", "consequence")]
  if(highlight) isomut[,corrected_codon:= var_AA!=var_AA_uncorrected]
  if(!keep_uncorrected) {
    isomut[,var_AA_uncorrected:=NULL]
    isomut[,var_codon_uncorrected:=NULL]
    isomut[,consequence_uncorrected:=NULL]
  }
  if(retDF) isomut <- as.data.frame(isomut)
  return(isomut)
}

getVarCodon <- function(codon_pos, MUT, REFCODON, VARCODON, strand, type) {
  codon_pos <- codon_pos[!is.na(codon_pos)]
  MUT <- MUT[!is.na(codon_pos)]
  if(length(codon_pos)==0) return(NA)
  MUT <- ifelse(strand=="+", as.character(MUT), reverseComplement(DNAStringSet(MUT)))
  RC <- strsplit(REFCODON[[1]], "")[[1]]
  RC[codon_pos] <- MUT
  return(case_when(
    type=="SNV" & !is.na(codon_pos) ~ paste(RC, collapse = ""),
    TRUE ~ VARCODON
  ))
}

#' Read isomut data from directory
#'
#' @param dir path to the directory containing isomut data.
#' @param minReads integer indicating the minimum number of reads
#' (default = 0L).
#' @param minCoverage double indicating the minimum coverage
#' (default = 0).
#' @param minMutFreq double indicating the minimum mutation frequency
#' (default = 0).
#' @param minScore double indicating the minimum isomut score
#' (default = 0).
#' @param minCleanliness double indicating the minimum isomut cleanliness
#' (default = 0).
#' @param removePatterns character vector providing patterns in
#' sample_names that are to be removed.
#' @param extraColumns named list to generate extra columns. The list names are
#' the names of the new columns. The list elements can either be a vector of
#' length one (each row gets the same value for this column), a named vector
#' with names being a subset of the `sample_name` column in the original isomut
#' file, or a function that accepts those names as arguments.
#' @param removeFeatures whether certain genomic features should be removed.
#' @param featureData `GRanges` that contains features to be removed.
#' @param removeMito whether variants in mitochondrial DNA should be removed.
#' @param verbose monitor progress.
#' @param asDataFrame whether to return a `data.frame`. If `FALSE`, a
#' `data.table` is returned.
#' @param annotate whether data should be annotated. For details see
#' `annotate_isomut`.
#' @param ... further arguments for `annotate_isomut`.
#'
#' @return a `data.frame` or `data.table` containing the filtered and
#' preprocessed data.
#'
#' @importFrom data.table rbindlist
#' @importFrom GenomicFeatures makeTxDbFromGFF makeTxDbFromGRanges
#' @importFrom rtracklayer import
#' @export
read_isomut_from_dir <- function(dir, minReads = 0L,
                                 minCoverage = 0, minMutFreq = 0, minScore = 0,
                                 minCleanliness = 0, removePatterns = NULL,
                                 extraColumns = NULL, removeFeatures = TRUE,
                                 featureData = SGD_features, removeMito = TRUE,
                                 verbose = TRUE,
                                 asDataFrame = TRUE,
                                 annotate = TRUE,
                                 ...
                                 ) {
  isomut_files <- list.files(dir, pattern = "*.isomut$", recursive = TRUE,
                             full.names = TRUE)
  if(length(isomut_files) == 0) stop("No isomut files found.")
  if(verbose) {
    message("Reading files...")
    pb <- txtProgressBar(min = 0, max = length(isomut_files), style = 3)
    i <- 1
  }
  isomut_list <- lapply(isomut_files,
                        function(file) {
                          isomut <- read_isomut(file, minReads = minReads,
                                      minCoverage = minCoverage,
                                      minMutFreq = minMutFreq,
                                      minScore = minScore,
                                      minCleanliness = minCleanliness,
                                      removePatterns = removePatterns,
                                      extraColumns = extraColumns,
                                      removeFeatures = removeFeatures,
                                      featureData = featureData,
                                      removeMito = removeMito,
                                      asDataFrame = FALSE
                                      )
                          if(verbose) {
                            setTxtProgressBar(pb, i)
                            i <<- i+1
                          }
                          return(isomut)
                        })
  if(annotate) {
    args <- list(...)
    ## load annotation
    if("annotation" %in% names(args)) {
      annotation <- args[["annotation"]]
      stopifnot(is.character(annotation) || is(annotation, "GRanges"))
      if(is.character(annotation)) {
        if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
          suppressMessages(
            args[["txdb"]] <- suppressWarnings(makeTxDbFromGFF(annotation, metadata = "gene_id"))
          )
        }
        args[["annotation"]] <- import(annotation)
      } else {
        if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
          suppressMessages(
            args[["txdb"]] <- suppressWarnings(makeTxDbFromGRanges(annotation))
          )
        }
        args[["annotation"]] <- annotation
      }
    } else {
      if(!"txdb" %in% names(args) || is.null(args[["txdb"]])) {
        suppressMessages(
          args[["txdb"]] <- suppressWarnings(makeTxDbFromGRanges(gtf_scer))
        )
      }
      args[["annotation"]] <- gtf_scer
    }
    if(verbose) {
      message("\nAnnotating...")
      pb2 <- txtProgressBar(min = 0, max = length(isomut_files), style = 3)
      i <- 1
    }
    isomut_list <- lapply(isomut_list,
                          function(isomut) {
                            isomut <- do.call(annotate_isomut, c(list(isomut = isomut), args))
                            if(verbose) {
                              setTxtProgressBar(pb2, i)
                              i <<- i+1
                            }
                            return(isomut)
                            }
                          )
  }
  isomut <- rbindlist(isomut_list, fill = TRUE)
  if(asDataFrame) isomut <- as.data.frame(isomut)
  message("...done!\n")
  return(isomut)
}
fridoling/isomutR documentation built on Jan. 21, 2022, 4:45 a.m.