R/generate_allele_matrix.R

Defines functions keep_only_variant_sites remove_invariant_sites identify_variant_sites replace_non_ATGC_with_N

#' In allele matrix replace non-standard alleles with N
#'
#' @param mat Matrix.
#'
#' @return mat: Matrix.
#' @noRd
replace_non_ATGC_with_N <- function(mat){
  check_is_this_class(mat, "matrix")

  # Make all nucleotides upper case
  mat <- apply(mat, 2, toupper)

  # Replace NAs with "N"
  mat[is.na(mat)] <- "N"


  # For single nucleotide variants, replace non-A/T/G/C with N (ignore long
  # strings like insertions)
  mat[nchar(mat) == 1 & !(mat %in% c("A", "T", "G", "C"))] <- "N"
  return(mat)
}

#' Find all rows in allele matrix with SNPs.
#'
#' Give an error if there are no variant sites in the dataset.
#'
#' @param mat Matrix.
#'
#' @return rows_to_keep: Logical vector.
#' @noRd
identify_variant_sites <- function(mat){
  check_is_this_class(mat, "matrix")
  rows_to_keep <- apply(mat, 1, function(row) {
    sum(unique(row) != "N")
  })
  rows_to_keep <- rows_to_keep > 1
  rows_to_keep <- unname(rows_to_keep)

  if (sum(rows_to_keep) == 0) {
    stop("There are no valid variant sites in this dataset")
  }
  return(rows_to_keep)
}

#' Remove any invariant sites from allele matrix.
#'
#' @param mat Matrix.
#' @param rows_to_keep Logical. Vector of TRUE/FALSE. FALSE corresponds to rows
#'   that should be dropped (invariant loci). TRUE corresponds to rows to keep
#'   (variant loci).
#' @param o_ref Character vector. Original reference alleles. Length = Number of
#'   genotypes.
#' @param o_alt Character vector. Original alternative alleles. Length = Number
#'   of genotypes.
#' @param snpeff NULL or list of character vectors. If list, then length(list) =
#'   Number of genotypes. Each list entry can have one or more SnpEff
#'   annotations.
#'
#' @return A list of four objects:
#'   \describe{
#'     \item{mat}{Matrix. Only variant sites are retained. Rows are variants.
#'      Columns are samples.}
#'     \item{o_ref}{Character vector. Original reference alleles. Length =
#'     Number of genotypes with variant sites.}
#'     \item{o_alt}{Character vector. Original alternative alleles. Length =
#'      Number of genotype after remove genotypes with variant sites.}
#'     \item{snpeff}{NULL or list of character vectors. If list, then
#'     length(list) = Number of genotypes with variant sites. Each list
#'     entry can have one or more SnpEff annotations.}
#'   }
#' @noRd
remove_invariant_sites <- function(mat, o_ref, o_alt, snpeff, rows_to_keep){
  check_is_this_class(mat, "matrix")
  check_is_this_class(rows_to_keep, "logical")
  if (length(rows_to_keep) != nrow(mat)) {
    stop("Logical must have length == nrow(mat)")
  }
  if (sum(rows_to_keep) == 0) {
    stop("There are no valid variant sites in this dataset")
  }
  if (length(o_ref) != length(o_alt)) {
    stop("o_ref and o_alt need to have same length")
  }
  if (length(o_ref) != nrow(mat)) {
    stop("o_ref & o_alt need to have an entry for every mat row")
  }
  if (!is.null(snpeff)) {
    check_is_this_class(snpeff, "list")
    check_is_this_class(snpeff[[1]], "character")
    if (length(snpeff) != length(o_ref)) {
      stop("If not null snpeff needs to have an entry for every row in mat")
    }
  }

  mat <- mat[rows_to_keep, , drop = FALSE]
  o_ref <- o_ref[rows_to_keep]
  o_alt <- o_alt[rows_to_keep]
  snpeff <- snpeff[rows_to_keep]

  return(list("mat" = mat,
              "o_ref" = o_ref,
              "o_alt" = o_alt,
              "snpeff" = snpeff))
}

#' Keep genotype information for variant sites
#'
#' @description Given a DNA matrix, its reference and alternative alleles, and
#'   SnpEff annotations keep only those the genotypes with variants.
#' @param dna_mat Matrix.
#' @param o_ref Character vector. Original reference alleles. Length = Number of
#'   genotypes.
#' @param o_alt Character vector. Original alternative alleles. Length = Number
#'   of genotypes.
#' @param snpeff NULL or list of character vectors. If list, then length(list) =
#'   Number of genotypes. Each list entry can have one or more SnpEff
#'   annotations.
#'
#' @return A list of four objects:
#'   \describe{
#'     \item{variant_only_dna_mat}{Matrix. Only variant sites are retained. Rows
#'      are variants. Columns are samples.}
#'     \item{o_ref_var_pos}{Character vector. Original reference alleles.
#'     Length = Number of genotypes with variant sites.}
#'     \item{o_alt_var_pos}{Character vector. Original alternative alleles.
#'     Length = Number of genotype after remove genotypes with variant sites.}
#'     \item{snpeff_var_pos}{NULL or list of character vectors. If list, then
#'     length(list) = Number of genotypes with variant sites. Each list
#'     entry can have one or more SnpEff annotations.}
#'   }
#' @noRd
keep_only_variant_sites <- function(dna_mat, o_ref, o_alt, snpeff){
  check_is_this_class(dna_mat, "matrix")
  if (length(o_ref) != length(o_alt)) {
    stop("Reference and alternative allele vectors must be same length")
  }
  if (nrow(dna_mat) != length(o_ref)) {
    stop("Length of reference and alternative allele vectors must equal nrow(dna_mat)")
  }
  if (!is.null(snpeff)) {
    check_is_this_class(snpeff, "list")
    check_is_this_class(snpeff[[1]], "character")
    if (length(snpeff) != length(o_ref)) {
      stop("If not null snpeff needs to have an entry for every row in mat")
    }
  }

  dna_mat <- replace_non_ATGC_with_N(dna_mat)
  variant_site_log <- identify_variant_sites(dna_mat)
  invariant_sites_removed <- remove_invariant_sites(dna_mat,
                                                    o_ref,
                                                    o_alt,
                                                    snpeff,
                                                    variant_site_log)
  variant_only_dna_mat <- invariant_sites_removed$mat
  o_ref_var_pos <- invariant_sites_removed$o_ref
  o_alt_var_pos <- invariant_sites_removed$o_alt
  snpeff_var_pos <- invariant_sites_removed$snpeff

  return(list("variant_only_dna_mat" = variant_only_dna_mat,
              "o_ref_var_pos" = o_ref_var_pos,
              "o_alt_var_pos" = o_alt_var_pos,
              "snpeff_var_pos" = snpeff_var_pos))
}

Try the prewas package in your browser

Any scripts or data that you put into this service are public.

prewas documentation built on April 2, 2021, 5:06 p.m.