R/load_maf.R

Defines functions load_maf

Documented in load_maf

#' Load MAF somatic mutation data
#' 
#' Load MAF data from a text file or data table into your CESAnalysis. Column names are
#' expected to match MAF format specifications (Chromosome, Start_Position, etc.). It's
#' recommended to use preload_maf() to prep the input (including, optionally, liftOver
#' conversion of genomic coordinates), but if you have clean MAF data, you can run this
#' function directly. By default, data is assumed to be derived from whole-exome
#' sequencing. Whole-genome data and targeted sequencing data are also supported when the
#' \code{coverage} option is specified.
#' 
#' @param cesa CESAnalysis.
#' @param maf Path of tab-delimited text file in MAF format, or an MAF in data.table or
#'   data.frame format.
#' @param maf_name Optionally, a name to identify samples coming from the current MAF. Used to
#'   populate the maf_source field of the CESAnalysis samples table.
#' @param sample_data_cols MAF columns containing sample-level data (e.g., tumor grade)
#'   that you would like to have copied into the CESAnalysis samples table.
#' @param coverage exome, genome, or targeted (default exome).
#' @param covered_regions optional for exome, required for targeted: a GRanges object or a
#'   BED file of covered intervals matching the CESAnalysis genome.
#' @param covered_regions_name a name describing the covered regions (e.g.,
#'   "my_custom_targeted_regions"); required when covered_regions are supplied.
#' @param covered_regions_padding How many bases (default 0) to expand start and end of
#'   each covered_regions interval, to include variants called just outside of targeted
#'   regions. Consider setting from 0-100bp, or up to the sequencing read length. If the
#'   input data has been trimmed to the targeted regions, leave set to 0.
#' @param enforce_default_exome_coverage When loading default exome data, exclude records
#'   that aren't covered in the default exome capture intervals included with CES genome
#'   reference data (default FALSE).
#' @return CESAnalysis with the specified MAF data loaded. The MAF data table includes
#'   CES-generated variant IDs, a list of all genes overlapping the site, and top_gene and
#'   top_consequence columns that give the most significant annotated coding changes for
#'   each mutation record. Annotation precedence is determined by MAF prevalence (usually
#'   equal), essential splice status, premature stop codon, nonsilent status, MAF mutation
#'   prevalence across the transcript (often favors longer transcripts), and finally
#'   alphabetical order. The columns are recalculated when more data is loaded, so changes
#'   in MAF prevalence can change which variants are highlighted. Note that
#'   \code{[CESAnalysis]$variants} contains more information about all top_consequence
#'   variants and all noncoding variants from the MAF.
#' @export
load_maf = function(cesa = NULL, maf = NULL, maf_name = character(), coverage = "exome", covered_regions = NULL,
                    covered_regions_name = NULL, covered_regions_padding = 0,
                    sample_data_cols = character(), enforce_default_exome_coverage = FALSE) {
  
  if (! is(cesa, "CESAnalysis")) {
    stop("cesa should be a CESAnalysis")
  }
  
  cesa = copy_cesa(cesa)
  cesa = update_cesa_history(cesa, match.call())
  bsg = get_cesa_bsg(cesa)
  
  if (is.null(maf)) {
    stop("Supply MAF data via maf=[file path or data.table/data.frame].")
  }
  
  if(! is.character(sample_data_cols)) {
    stop("sample_data_cols should be character if used (a vector of column names)")
  }
  
  # Validate sample_data_cols
  if (length(sample_data_cols) > 0) {
    sample_data_cols = unique(na.omit(sample_data_cols))
  } 
  
  
  # give a warning if interval padding is really high
  if (covered_regions_padding > 1000) {
    warning(sprintf("covered_regions_padding=%d is awfully high!", covered_regions_padding), call. = F)
  }
  
  # Validate covered_regions
  previous_covered_regions_names = c(names(cesa@coverage$exome), names(cesa@coverage$targeted)) # may be NULL
  if (! is.character(coverage) || length(coverage) > 1 || ! coverage %in% c("exome", "genome", "targeted")) {
    stop("Argument coverage must be \"exome\", \"genome\", or \"targeted\"")
  }
  
  if (! coverage %in% names(cesa@coverage)) {
    cesa@coverage[[coverage]] = list()
  }
  
  
  # validate covered_regions_name (supplied if and only if covered_regions is)
  if (! is.null(covered_regions) & is.null(covered_regions_name)) {
    stop("You must supply a name for your covered_regions using covered_regions_name = ...")
  }
  if (is.null(covered_regions) && ! is.null(covered_regions_name)) {
    stop("covered_regions_name was supplied, but covered_regions wasn't.")
  }
  if (! is.null(covered_regions_name)) {
    if (! is(covered_regions_name, "character") || length(covered_regions_name) > 1) {
      stop("covered_regions_name should be a 1-length character vector, just a name to use for your covered_regions")
    }
  }
  
  if(is.null(covered_regions) && (length(covered_regions_padding) != 1 || covered_regions_padding != 0)) {
    stop("You can't use covered_regions_padding without supplying covered_regions.", call. = F)
  }
  
  # validate covered_regions (required for targeted, optional for exome/genome)
  if (coverage == "genome" & is.null(covered_regions)) {
    covered_regions_name = "genome"
  }
  
  if (coverage == "targeted" & is.null(covered_regions) ) {
    stop("can't load targeted data without covered_regions (see docs)")
  }
  
  if (! is(enforce_default_exome_coverage, "logical") || length(enforce_default_exome_coverage) != 1) {
    stop("enforce_default_exome_coverage should be T/F")
  }
  
  if (coverage == "exome" & is.null(covered_regions)) {
    covered_regions_name = "exome"
    if(! check_for_ref_data(cesa, "default_exome_gr")) {
      stop("This genome has no default exome intervals, so to load exome data you must supply covered_regions (see docs)")
    }
  }
  
  # Determine whether we're using "exome" or "exome+" for default exome data
  # Usually, the lenient exome+ option is used, but the choice must be consistent throughout a CESAnalysis
  if(covered_regions_name == "exome") {
    # Whether or not CESAnalysis uses "exome+", once the first default exome data has been loaded,
    # there will always be default exome intervals under "exome" in the coverage list
    if (! "exome" %in% names(cesa@coverage[["exome"]])) {
      cesa@advanced$using_exome_plus = ! enforce_default_exome_coverage
      covered_regions = get_ref_data(cesa, "default_exome_gr")
      cesa@coverage$exome[["exome"]] = covered_regions
      
    } else if (enforce_default_exome_coverage == TRUE & cesa@advanced$using_exome_plus) {
      stop("You can't have enforce_default_exome_coverage = TRUE because you previously loaded default exome data\n",
           "with enforce_default_exome_coverage = FALSE")
    } else if (enforce_default_exome_coverage == FALSE & ! cesa@advanced$using_exome_plus) {
      stop("You can't have enforce_default_exome_coverage == FALSE because you previously loaded default exome data\n",
           "with enforce_default_exome_coverage = TRUE.")
    } else {
      covered_regions = cesa@coverage$exome[["exome"]]
    }
    
    # For exome+, use GRanges if available
    # Otherwise, for exome or exome+, load the default exome intervals
    if (! enforce_default_exome_coverage) {
      covered_regions_name = "exome+"
      if("exome+" %in% names(cesa@coverage$exome)) {
        covered_regions = cesa@coverage$exome[["exome+"]]
      } else {
        cesa@coverage[["exome"]][["exome+"]] = covered_regions # that is, the default exome regions
      }
    } else {
      if("exome" %in% names(cesa@coverage$exome)) {
        covered_regions = cesa@coverage$exome[["exome"]]
      } else {
        covered_regions = get_ref_data(cesa, "default_exome_gr")
        cesa@coverage$exome[["exome"]] = covered_regions
      }
    }
    pretty_message("Assuming this data has default exome coverage....")
  } else if (! is.null(covered_regions)) {
    cesa = .add_covered_regions(cesa = cesa, covered_regions = covered_regions, covered_regions_padding = covered_regions_padding, 
                               coverage_type = coverage, covered_regions_name = covered_regions_name)
  }
  
  refset = .ces_ref_data[[cesa@ref_key]]
  
  read_args = list(maf = maf, refset_env = refset,
                   sample_col = 'Unique_Patient_Identifier', chr_col = 'Chromosome', start_col = 'Start_Position',
                   ref_col = 'Reference_Allele', tumor_allele_col = 'guess', separate_old_problems = TRUE)

  if(length(sample_data_cols) > 0) {
    read_args = c(read_args, list(more_cols = sample_data_cols))
  }
  
  reserved_cols = c("Unique_Patient_Identifier", "Chromosome", "Start_Position", "Reference_Allele", 
                    "Tumor_Seq_Allele1", "Tumor_Seq_Allele2", "Tumor_Allele", "Tumor_Sample_Barcode")
  illegal_sample_cols = intersect(reserved_cols, read_args[["more_cols"]])
  if(length(illegal_sample_cols) > 0) {
    stop("Can't use these column(s) as sample-level data columns: ", paste(illegal_sample_cols, collapse = ", "), '.')
  }

  if(! is.character(maf_name) || length(maf_name) > 1) {
    stop("maf_name should be 1-length character.")
  }
  if(length(maf_name) == 0) {
    maf_name = as.character(uniqueN(cesa@samples$maf_source) + 1)
  } else {
    if(! maf_name %ilike% '^[a-z0-9][-0-9a-z\\_\\.,\\+]*$') {
      stop('Invalid maf_name. You can use alphanumerics and ",", "+", "_", "-", ".".)')
    }
  }
  
  maf = do.call(read_in_maf, args = read_args)
  old_problems = data.table()
  if('old_problem' %in% names(maf)) {
    old_problems = maf[! is.na(old_problem), -"problem"]
    setnames(old_problems, 'old_problem', 'reason') # to put in exclusion table
    if(old_problems[, .N] > 0) {
      pretty_message("Found a \"problem\" column from preload_maf() and excluded records with problems.")
    }
    maf = maf[is.na(old_problem), -"old_problem"]
  }
  
  sample_data = NULL
  if(length(sample_data_cols) > 0) {
    missing_sample_data = setdiff(sample_data_cols, names(maf))
    if(length(missing_sample_data) > 0) {
      stop("Some sample_data_cols not present in MAF: ", paste(missing_sample_data, collapse = ', '), '.')
    }
    
    bad_cols = character()
    for (col in sample_data_cols) {
      col_good = maf[, .(good = uniqueN(.SD[[1]]) == 1), by = "Unique_Patient_Identifier", .SDcols = col][, all(good)]
      if (! col_good) {
        bad_cols = c(bad_cols, col)
      }
    }
    if(length(bad_cols) > 0) {
      stop("Sample data column(s) do not have consistent values within each sample: ", paste(bad_cols, collapse = ", "), '.')
    }
    
    # save sample data for later (only need 1 row per sample, since it's all sample-level data)
    sample_data = unique(maf[, .SD, .SDcols = c("Unique_Patient_Identifier", sample_data_cols)], by = "Unique_Patient_Identifier")
    maf = maf[, .SD, .SDcols = setdiff(names(maf), sample_data_cols)]
  }
  
  
  # Set aside records with problems and notify user
  initial_num_records = maf[, .N]
  excluded = maf[! is.na(problem), .(Unique_Patient_Identifier, Chromosome, Start_Position, 
                                     Reference_Allele, Tumor_Allele, variant_id, variant_type, problem)]
  maf = maf[is.na(problem), -"problem"]

  num_excluded = excluded[, .N]
  if(num_excluded > 0) {
    msg = paste0(num_excluded, " of ", initial_num_records, " MAF records (", 
                 sprintf("%.1f", 100 * num_excluded / initial_num_records), '%) ',
                 "had problems and were excluded: ")
    problem_summary = excluded[, .(num_records = .N), by = "problem"]
    if(Sys.getenv("RSTUDIO") == "1" && rstudioapi::isAvailable() &&rstudioapi::getThemeInfo()$dark) {
      message(crayon::white(paste0(utils::capture.output(print(problem_summary, row.names = F)), collapse = "\n")))
    } else {
      message(crayon::black(paste0(utils::capture.output(print(problem_summary, row.names = F)), collapse = "\n")))
    }
    if(num_excluded / initial_num_records > .05) {
      warning("More than 5% of input records had problems.")
    }
    nt = c("A", "C", "G", "T")
    snv_mismatch = excluded[problem == "reference_mismatch" & Reference_Allele %in% nt & Tumor_Allele %in% nt, .N]
    if(snv_mismatch / initial_num_records > .01) {
      msg = paste0(snv_mismatch, " SNV variants were excluded for having reference alleles that do not match the reference genome. You should probably figure out why",
                          " and make sure that the rest of your data set is okay to use before continuing.")
      warning(pretty_message(msg, emit = F))
    }
  }
  setnames(excluded, "problem", "reason")
  
  if(old_problems[, .N] > 0) {
    excluded_cols = names(excluded)
    excluded = rbind(excluded, old_problems[, ..excluded_cols])
  }
  
  new_samples = data.table(Unique_Patient_Identifier = unique(maf$Unique_Patient_Identifier))
  new_samples[, coverage := coverage]
  new_samples[, covered_regions := covered_regions_name]
  
  # see if any sample appears more than once in sample table (happens when one sample has multiple listed groups)
  repeated_samples = new_samples[duplicated(Unique_Patient_Identifier), unique(Unique_Patient_Identifier)]
  if(length(repeated_samples) > 0) {
    stop(paste0("The following samples are associated with multiple groups in the input data:\n", paste(repeated_samples, collapse=", ")))
  }
  
  # make sure no new samples were already in the previously loaded MAF data
  if(cesa@samples[, .N] > 0) {
    repeat_samples = intersect(cesa@samples[, Unique_Patient_Identifier], new_samples[, Unique_Patient_Identifier])
    if (length(repeat_samples) > 0) {
      stop(paste0("Error: Can't load MAF data because some sample IDs already appear in previously loaded data.\n",
                  "Either merge these data sets manually or remove duplicated samples: ",
                  paste(repeat_samples, collapse = ", ")))
    }
  }

  # remove any MAF records that are not in the coverage, unless default exome with enforce_default_exome_coverage = FALSE
  if (covered_regions_name == "genome") {
    num_uncovered = 0
  } else {
    maf_grange = GenomicRanges::makeGRangesFromDataFrame(maf, seqnames.field = "Chromosome", start.field = "Start_Position", 
                                                         end.field = "Start_Position")
    
    # In an "exome+" data set, compare coverage to default exome instead of whatever the current exome+ intervals are
    if (covered_regions_name == "exome+") {
      # equivalent to %within%, but avoids importing
      is_uncovered = ! IRanges::overlapsAny(maf_grange, cesa@coverage[["exome"]][["exome"]], type = "within")
    } else {
      is_uncovered = ! IRanges::overlapsAny(maf_grange, cesa@coverage[[coverage]][[covered_regions_name]], type = "within")
    }
    num_uncovered = sum(is_uncovered)
  }

  if (num_uncovered > 0) {
    total = nrow(maf)
    percent = round((num_uncovered / total) * 100, 1)
    if (covered_regions_name == "exome+") {
      # merge previous exome+ covered_regions with the coverage of the new data
      covered_regions =  GenomicRanges::reduce(GenomicRanges::union(covered_regions, maf_grange[is_uncovered]))
      cesa@coverage[[coverage]][["exome+"]] = covered_regions

      # warn if a lot of records are in uncovered areas; may indicate whole-genome data or low quality exome data
      if (percent > 20) {
        warning(paste0("More than 20% of MAF records are not within the CES genome's default exome intervals.\n",
                       "Could this be whole-genome data? Or if you know the true covered regions, supply them\n",
                       "with the covered_regions argument."))
      }
      msg = paste0("Note: ", format(num_uncovered, big.mark = ','), " MAF records (", percent, 
                   "%) are outside the CESAnalysis's default exome definitions; expanded exome intervals to include them.")
      pretty_message(msg)
    } else {
      uncovered = maf[is_uncovered]
      uncovered$reason = paste0("uncovered_in_", covered_regions_name)
      maf = maf[!is_uncovered]
      excluded = rbind(excluded, uncovered[, names(excluded), with = F])
      pretty_message(paste0("Note: ", num_uncovered, " MAF records out of ", total, " (", percent, 
                     "%) are at loci not covered in the input covered_regions, so they have been excluded."))
    }
  }
  
  # merge in any sample-level data
  if (! is.null(sample_data)) {
    new_samples = merge.data.table(new_samples, sample_data, by = "Unique_Patient_Identifier")
  }
  new_samples[, maf_source := maf_name]
  
  cesa@samples = rbind(cesa@samples, new_samples, fill = TRUE)
  setcolorder(cesa@samples, c("Unique_Patient_Identifier", "coverage", "covered_regions"))
  setkey(cesa@samples, "Unique_Patient_Identifier")
  
  if (excluded[, .N] > 0) {
    cesa@excluded = rbind(cesa@excluded, excluded, fill = T) 
  }
  
  # Set aside new variants for annotation (notably, before MNV prediction; we'll still annotate those as SNVs)
  # To-do: also leave out indels that have previously been annotated (okay to re-annotate for now)
  to_annotate = maf[! cesa@mutations$snv$snv_id, on = 'variant_id']
  pretty_message("Annotating variants...")
  annotations = annotate_variants(refset = refset, variants = to_annotate)
  
  # Check annotations for any SNVs with ambiguous trinuc context; these must be set aside
  snv_table = annotations$snv
  aac_table = annotations$amino_acid_change
  bad_trinuc_context = which(is.na(snv_table$trinuc_mut))
  num_bad = length(bad_trinuc_context)
  if (num_bad > 0) {
    bad_ids = snv_table[bad_trinuc_context, snv_id]
    bad_trinuc_context_maf = maf[bad_ids, .(Unique_Patient_Identifier, Chromosome, Start_Position, Reference_Allele, Tumor_Allele, variant_id, variant_type), on = 'variant_id']
    maf = maf[! bad_ids, on = 'variant_id']
    msg = paste0("Note: ", num_bad, " MAF records excluded due to ambiguous trinucleotide context ",
                  "(likely N's in the reference genome).")
    pretty_message(msg)
    bad_trinuc_context_maf$reason = "ambiguous_trinuc_context"
    cesa@excluded = rbind(cesa@excluded, bad_trinuc_context_maf)
    
    # For simplicity, remove the bad record from SNV and AAC tables
    bad_aa = annotations$aac_snv_key[bad_ids, aac_id, on = 'snv_id', nomatch = NULL]
    snv_table = snv_table[! bad_trinuc_context]
    if(aac_table[, .N] > 0 & length(bad_aa) > 0) {
      aac_table = aac_table[! bad_aa]
    }
  }
  cesa@mutations[["amino_acid_change"]] = unique(rbind(cesa@mutations$amino_acid_change, aac_table, fill = T), by = "aac_id")
  setkey(cesa@mutations$amino_acid_change, "aac_id")
  cesa@mutations[["snv"]] = unique(rbind(cesa@mutations$snv, snv_table, fill = T), by = "snv_id")
  setkey(cesa@mutations$snv, "snv_id")
  cesa@mutations[["aac_snv_key"]] = unique(rbind(cesa@mutations$aac_snv_key, annotations$aac_snv_key))
  setkey(cesa@mutations$aac_snv_key, 'aac_id')
  
  # add genes list to MAF records (may stop including in near future)
  # use of _tmp names required as of data.table 1.13.2 to keep join from failing
  column_order = copy(names(maf))
  maf[, genes_tmp := list(list(NA_character_))]
  maf[cesa@mutations$snv,  genes_tmp := list(genes), on = c(variant_id = "snv_id")]
  
  # No gene for intergenic SNVs (SNV annotation table gives nearest gene, but we won't show that in MAF table)
  intergenic_snv = cesa@mutations$snv[intergenic == T, snv_id]
  maf[intergenic_snv, genes_tmp := list(NA_character_), on = "variant_id"]
  
  
  # temporary way of annotating non-SNVs
  non_snv = maf[variant_type != 'snv']
  if (non_snv[, .N] > 0) {
    grt = as.data.table(refset$gr_genes)
    if ("gene" %in% names(grt)) {
      grt[, names := gene] # use gene field instead of names field (applies when CDS gr has multiple CDS per gene)
      grt[, gene := NULL]
    }
    setnames(grt, c("seqnames", "start", "end", "names"), 
             c("Chromosome", "Start_Position", "End_Position", "gene"))
    setkey(grt, Chromosome, Start_Position, End_Position)
    non_snv[, End_Position := Start_Position] # okay for now
    non_snv_genes = foverlaps(non_snv, grt)
    # use Start_Position from MAF, not gr
    non_snv_genes[, Start_Position := i.Start_Position]
    non_snv_genes = non_snv_genes[, .(genes = list(unique(gene))), by = c("Chromosome", "Start_Position")]
    non_snv_genes[, is_snv := F]
    maf[, is_snv := variant_type == 'snv']
    maf[non_snv_genes, genes_tmp := genes,
        on = c("is_snv", "Chromosome", "Start_Position")]
    maf[, is_snv := NULL]
  }
  
  setnames(maf, "genes_tmp", "genes")
  setcolorder(maf, column_order) # put original columns back in the front
  
  
  # Same-sample variants with 2bp of other variants get set aside as likely MNVs
  # MNVs are only possible in sample/chromosome combinations with more than one MAF record
  hidden_mnv = detect_mnv(maf)
  num_mnv = uniqueN(hidden_mnv$mnv_group)
  if(num_mnv > 0) {
    msg = paste0("There are ", num_mnv, " groups of same-sample variants within 2 bp of each other. ",
                 "These may not have resulted from independent events. Consider using preload_maf() ",
                 "to reclassify these variants as doublet substitutions or other multinucleotide variants.")
    warning(pretty_message(msg, emit = F))
  }

  cesa@maf = rbind(cesa@maf, maf, fill = T)
  
  # Update coverage fields of annotation tables
  # Internal note: Confusingly, update_covered_in also has a side effect of updating
  # cached output of select_variants; this should change in the future.
  cesa = update_covered_in(cesa)
  
  # Cached variants is a table of non-overlapping mutations (in terms of genomic position),
  # with only the "top" (tiebreaker-winning) variant at each site.
  # For sites that have coding effects, we will add this effect to the MAF table.
  # (Edge case: No passing variants in MAF table means nothing to do.)
  if(cesa@maf[, .N] > 0) {
    consequences = cesa@advanced$cached_variants[variant_type != 'snv', .(snv_id = unlist(constituent_snvs), variant_name, gene), by = 'variant_id']
    cesa@maf[consequences, c("top_gene", "top_consequence") := list(gene, variant_name), on = c(variant_id = 'snv_id')]
  }

  msg = paste0("Loaded ", format(maf[, .N], big.mark = ','), " variant records from ", 
                 format(new_samples[, .N], big.mark = ','), " samples into CESAnalysis.")
  pretty_message(msg)
  
  return(cesa)
}
Townsend-Lab-Yale/cancereffectsizeR documentation built on April 28, 2024, 6:14 p.m.