R/max_depth.R

Defines functions max_depth

Documented in max_depth

#' Vizualise and filter based on mean depth across all called SNPs
#'
#' This function can be run in two ways: 1) specify vcfR object only. This will
#' visualize the distribution of mean depth per sample across all SNPs in your vcf file,
#' and will not alter your vcf file. 2) specify vcfR object, and set 'maxdepth' = 'integer value'.
#' This option will show you where your specified cutoff falls in the distribution of SNP depth,
#' and remove all SNPs with a mean depth above the specified threshold from the vcf.
#' Super high depth loci are likely multiple loci stuck together into a single paralogous locus.
#' Note: This function filters on a 'per SNP' basis rather than a 'per genotype' basis,
#' otherwise it would disproportionately remove genotypes from our deepest sequenced samples
#' (because sequencing depth is so variable between samples).
#'
#' @param vcfR a vcfR object
#' @param maxdepth an integer specifying the maximum mean depth for a SNP to be retained
#' @return The vcfR object input, with SNPs above the 'maxdepth' cutoff removed
#' @examples
#' max_depth(vcfR = SNPfiltR::vcfR.example)
#' max_depth(vcfR = SNPfiltR::vcfR.example, maxdepth = 100)
#' @export
max_depth <- function(vcfR,
                      maxdepth=NULL){

  #if specified vcfR is not class 'vcfR', fail gracefully
  if (!inherits(vcfR, what = "vcfR")){
    stop("specified vcfR object must be of class 'vcfR'")
  }

  #if maxdepth cutoff is not specified, start here
  if (is.null(maxdepth)){

    #print user message
    message("cutoff is not specified, exploratory visualization will be generated.")

    #extract depth from the vcf
    dp.matrix<- vcfR::extract.gt(vcfR, element='DP', as.numeric=TRUE)

    #write a test to catch if the variable of interest has not been specified
    if (sum(!is.na(dp.matrix)) < .5){
      stop("genotype depth not specified in input vcf")
    }

    #calculate vector of depth per SNP
    snpdepth<-rowSums(dp.matrix, na.rm = TRUE)/rowSums(is.na(dp.matrix) == FALSE)

    #plot histogram of depth
    graphics::hist(snpdepth,
         xlab = "mean genotype depth at a given SNP",
         main="depth of all called SNPs")
    graphics::abline(v=mean(snpdepth, na.rm = TRUE),
           col="red",
           lty="dashed")

    #zoomed in histogram
    graphics::hist(snpdepth[snpdepth < 200],
         xlab = "mean genotype depth at a given SNP",
         main ="depth of SNPs < 200")
    graphics::abline(v=mean(snpdepth, na.rm = TRUE),
           col="red",
           lty="dashed")

    #print
    message("dashed line indicates a mean depth across all SNPs of ",round(mean(snpdepth, na.rm = TRUE),1))

  }

  #if the maxdepth cutoff has been specified, start here
  else {

    if (is.numeric(maxdepth) != "TRUE"){
      stop("specified max depth cutoff must be numeric")
    }

    #print user message
    message("maxdepth cutoff is specified, filtered vcfR object will be returned")

    #extract depth from the vcf
    dp.matrix<- vcfR::extract.gt(vcfR, element='DP', as.numeric=TRUE)

    #write a test to catch if the variable of interest has not been specified
    if (sum(!is.na(dp.matrix)) < .5){
      stop("genotype depth not specified in input vcf")
    }

    #calculate vector of depth per SNP
    snpdepth<-rowSums(dp.matrix, na.rm = TRUE)/rowSums(is.na(dp.matrix) == FALSE)

    #plot the maxdepth cutoff
    graphics::hist(snpdepth,
         xlab = "mean genotype depth at a given SNP",
         main ="max depth cutoff")
    graphics::abline(v=maxdepth,
           col="red")

    #calculate % of snps that fail the max depth filter
    i<-round(sum(snpdepth > maxdepth, na.rm = TRUE)/length(snpdepth)*100, 2)

    message(i, "% of SNPs were above a mean depth of ", maxdepth, " and were removed from the vcf")

    #filter vcf
    vcfR<-vcfR[snpdepth < maxdepth,]

    #return vcfR
    return(vcfR)
  #close else statement
  }
#close function
}

Try the SNPfiltR package in your browser

Any scripts or data that you put into this service are public.

SNPfiltR documentation built on March 31, 2023, 8:57 p.m.