R/extract_gt.R

Defines functions extract.info extract.indels is.indel extract.haps extract.gt

Documented in extract.gt extract.haps extract.indels extract.info is.indel

#' @title Extract elements from vcfR objects
#'
#' @rdname extract_gt
#'
#' @description
#' Extract elements from the 'gt' slot, convert extracted genotypes to their allelic state, extract indels from the data structure or extract elements from the INFO column of the 'fix' slot.
#'
#' @param x An object of class chromR or vcfR
#' @param element element to extract from vcf genotype data. Common options include "DP", "GT" and "GQ"
#' @param mask a logical indicating whether to apply the mask (TRUE) or return all variants (FALSE). Alternatively, a vector of logicals may be provided.
# @param as.matrix attempt to recast as a numeric matrix
#' @param verbose should verbose output be generated
#' @param as.numeric logical, should the matrix be converted to numerics
#' @param return.alleles logical indicating whether to return the genotypes (0/1) or alleles (A/T)
#' @param IDtoRowNames logical specifying whether to use the ID column from the FIX region as rownames
# @param allele.sep character which delimits the alleles in a genotype (/ or |), here this is not used for a regex (as it is in other functions)
#' @param extract logical indicating whether to return the extracted element or the remaining string
#' @param convertNA logical indicating whether to convert "." to NA.
#'
#' @details
#'
#' The function \strong{extract.gt} isolates elements from the 'gt' portion of vcf data.
#' Fields available for extraction are listed in the FORMAT column of the 'gt' slot.
#' Because different vcf producing software produce different fields the options will vary by software.
#' The mask parameter allows the mask to be implemented when using a chromR object.
#' The 'as.numeric' option will convert the results from a character to a numeric.
#' Note that if the data is not actually numeric, it will result in a numeric result which may not be interpretable.
#' The 'return.alleles' option allows the default behavior of numerically encoded genotypes (e.g., 0/1) to be converted to their nucleic acid representation (e.g., A/T).
# The allele.sep parameter allows the genotype delimiter to be specified.
#' Note that this is not used for a regular expression as similar parameters are used in other functions.
#' Extract allows the user to extract just the specified element (TRUE) or every element except the one specified.
#'
#' Note that when 'as.numeric' is set to 'TRUE' but the data are not actually numeric, unexpected results will likely occur.
#' For example, the genotype field will typically be populated with values such as "0/1" or "1|0".
#' Although these may appear numeric, they contain a delimiter (the forward slash or the pipe) that is non-numeric.
#' This means that there is no straight forward conversion to a numeric and unexpected values should be expected.
#'
#'
#' @seealso
#' \code{is.polymorphic}
#'
#'
#' @examples
#' data(vcfR_test)
#' gt <- extract.gt(vcfR_test)
#' gt <- extract.gt(vcfR_test, return.alleles = TRUE)
#'
#' @export
extract.gt <- function(x, element="GT",
                       mask=FALSE,
                       as.numeric=FALSE,
                       return.alleles=FALSE,
                       IDtoRowNames = TRUE,
#                       allele.sep="/",
                       extract = TRUE,
                       convertNA = TRUE ){

  # Validate that we have an expected data structure
  #if( class(x) != "chromR" & class(x) != "vcfR" ){
  if( !inherits(x, "chromR") & !inherits(x, "vcfR") ){
    stop( "Expected an object of class chromR or vcfR" )
  }
  
  # Catch unreasonable mask specification.
  #if(class(x) == "vcfR"){
  if( inherits(x, "vcfR") ){
    if(length(mask) == 1 && mask == TRUE){
      # This condition does not appear to make
      # sense and should be overridden.
      mask <- FALSE
    }
  }
  
  # If of class chromR, extract the vcf
  #if(class(x) == "chromR"){
  if( inherits(x, "chromR") ){
    tmpMask <- x@var.info$mask
    x <- x@vcf
  }

  # If a mask was specified in the call,
  # override the one from var.info
  if(length(mask) > 1){
    tmpMask <- mask
    mask <- TRUE
  }
  
  # Validate that the gt slot is a matrix
#  if( class(x@gt) != "matrix" ){
  if( !inherits(x@gt, "matrix") ){
    stop( paste("gt slot expected to be of class matrix. Instead found class", class(x@gt)) )
  }

  if(as.numeric == TRUE & return.alleles == TRUE ){
    stop("Invalid parameter choice, as.numeric and return.alleles can't both be true, alleles are characters!")
  }
  
  # If of class vcfR, call compiled code to extract field.
  #if(class(x) == "vcfR"){
  if( inherits(x, "vcfR") ){
    if(colnames(x@gt)[1] != "FORMAT"){
      stop("First column is not named 'FORMAT', this is essential information.")
    }

    outM <- .extract_GT_to_CM(x@fix,
                              x@gt,
                              element,
                              return.alleles,
                              as.integer(extract),
                              convertNA = as.numeric(convertNA) )
  }

  # If as.numeric is true, convert to a numeric matrix.
  if(as.numeric == TRUE){
    outM <- .CM_to_NM(outM)
  }
  
  #
  if( IDtoRowNames == TRUE ){
    if( sum(is.na(x@fix[,'ID'])) > 0 ){
      x <- addID(x)
    }
    if( length(unique(x@fix[,'ID'])) != nrow(x@fix) ){
      stop('ID column contains non-unique names')
    }
    rownames(outM) <- x@fix[,'ID']
  }

  # Apply mask.
  if(mask == TRUE){
    outM <- outM[tmpMask,]
  }

  return(outM)
}



#' @rdname extract_gt
#' @aliases extract.haps
# @param gt.split character which delimits alleles in genotypes
#' @param unphased_as_NA logical specifying how to handle unphased genotypes
#'
#' @details
#' The function \strong{extract.haps} uses extract.gt to isolate genotypes.
#' It then uses the information in the REF and ALT columns as well as an allele delimiter (gt_split) to split genotypes into their allelic state.
#' Ploidy is determined by the first non-NA genotype in the first sample.
#'
#' The VCF specification allows for genotypes to be delimited with a '|' when they are phased and a '/' when unphased.
#' This becomes important when dividing a genotype into two haplotypes.
#' When the alleels are phased this is straight forward.
#' When the alleles are unphased it presents a decision.
#' The default is to handle unphased data by converting them to NAs.
#' When unphased_as_NA is set to TRUE the alleles will be returned in the order they appear in the genotype.
#' This does not assign each allele to it's correct chromosome.
#' It becomes the user's responsibility to make informed decisions at this point.
#'
#'
#' @export
#extract.haps <- function(x, mask=FALSE, gt.split="|",verbose=TRUE){
extract.haps <- function(x,
                         mask=FALSE,
                         unphased_as_NA = TRUE,
                         verbose=TRUE ){
  #if(class(x) == "chromR"){
  if( inherits(x, "chromR") ){
    if(length(mask) == 1 && mask==TRUE){
      x <- chromR2vcfR(x, use.mask = TRUE)
    } else {
      #      x <- chrom_to_vcfR(x)
      x <- x@vcf
    }
  }
  
  if(length(mask) > 1){
    x <- x[mask,]
  }

  # Determine ploidy
  first.gt <- unlist(strsplit(x@gt[,-1][!is.na(x@gt[,-1])][1], ":"))[1]
#  ploidy <- length(unlist(strsplit(first.gt, split = gt.split, fixed = TRUE )))
  ploidy <- length(unlist(strsplit(first.gt, split = "[\\|/]" )))


  if( nrow( x@fix ) == 0 ){
    # No variants, return empty matrix.
    haps <- x@gt[ 0, -1 ]
  } else if ( ploidy == 1 ){
    haps <- extract.gt( x, return.alleles = TRUE )
  } else if ( ploidy > 1 ) {
    gt <- extract.gt( x )
    haps <- .extract_haps(x@fix[,'REF'], x@fix[,'ALT'],
                          gt, as.numeric(unphased_as_NA), as.numeric(verbose))
  } else {
    stop('Oops, we should never arrive here!')
  }

  haps
}


#' @rdname extract_gt
#'
#' @aliases is.indel
#'
#'
#' @details
#' The function \strong{is.indel} returns a logical vector indicating which variants are indels (variants where an allele is greater than one character).
#'
#'
#' @examples
#' data(vcfR_test)
#' is.indel(vcfR_test)
#'
#'
#' @export
is.indel <- function(x){
  #if(class(x) == 'chromR'){
  if( inherits(x, 'chromR') ){
    x <- x@vcf
  }
  #if(class(x) != "vcfR"){
  if( !inherits(x, "vcfR") ){
    stop("Unexpected class! Expecting an object of class vcfR or chromR.")
  }
  
  # Create an evaluation matrix
  isIndel <- matrix(FALSE, nrow=nrow(x), ncol = 2)
  colnames(isIndel) <- c('REF','ALT')

  # Check reference for indels
  isIndel[,'REF'] <- nchar(x@fix[,'REF']) > 1

  # Check alternate for indels
  checkALT <- function(x){
    x <- stats::na.omit(x)
    x <- x[ x != "<NON_REF>" ]
    if( length(x) > 0 ){
      max(nchar(x)) > 1
    } else {
      FALSE
    }
  }
  isIndel[,'ALT'] <- unlist( lapply( strsplit(x@fix[,'ALT'], split=","), checkALT) )

  mask <- rowSums(isIndel)
  mask <- mask > 0

  return(mask)
}




#' @rdname extract_gt
#'
#' @aliases extract.indels
#'
#' @param return.indels logical indicating whether to return indels or not
#'
#' @details
#' The function \strong{extract.indels} is used to remove indels from SNPs.
#' The function queries the 'REF' and 'ALT' columns of the 'fix' slot to see if any alleles are greater than one character in length.
#' When the parameter return_indels is FALSE only SNPs will be returned.
#' When the parameter return_indels is TRUE only indels will be returned.
#'
#'
#' @examples
#' data(vcfR_test)
#' getFIX(vcfR_test)
#' vcf <- extract.indels(vcfR_test)
#' getFIX(vcf)
#' vcf@fix[nrow(vcf@fix),'ALT'] <- ".,A"
#' vcf <- extract.indels(vcf)
#' getFIX(vcf)
#'
#' data(vcfR_test)
#' vcfR_test@fix[1,'ALT'] <- "<NON_REF>"
#' vcf <- extract.indels(vcfR_test)
#' getFIX(vcf)
#'
#' data(vcfR_test)
#' extract.haps(vcfR_test, unphased_as_NA = FALSE)
#' extract.haps(vcfR_test)
#'
#'
#' @export
extract.indels <- function(x, return.indels=FALSE){
  #if(class(x) == 'chromR'){
  if( inherits(x, 'chromR') ){
    x <- x@vcf
  }
  #if(class(x) != "vcfR"){
  if( !inherits(x, "vcfR") ){
    stop("Unexpected class! Expecting an object of class vcfR or chromR.")
  }
  
  mask <- is.indel(x)

  if(return.indels == FALSE){
    x <- x[ !mask, , drop = FALSE ]
  } else {
    x <- x[ mask, , drop = FALSE ]
  }
  
  return(x)
}



#' @rdname extract_gt
#' @aliases extract.info
#'
#' @details
#' The function \strong{extract.info} is used to isolate elements from the INFO column of vcf data.
#'
#' @export
extract.info <- function(x, element, as.numeric=FALSE, mask=FALSE){
  
  #if( class(x) == 'chromR' ){
  if( inherits(x, 'chromR') ){
    mask <- x@var.info$mask
    x <- x@vcf
  }
  #if( class(x) != 'vcfR' ){
  if( !inherits(x, 'vcfR') ){
    stop("Expecting an object of class vcfR or chromR.")
  }
  
#  values <- unlist(
#    lapply(strsplit(unlist(
#      lapply(strsplit(x@fix[,'INFO'], split=";"),
#             function(x){grep(paste("^", element, "=", sep=""), x, value=TRUE)})),
#      split="="), function(x){x[2]})
#  )
  
  values <- strsplit(x@fix[,'INFO'], split=";")
  values <- lapply(values, function(x){grep(paste("^", element, "=", sep=""), x, value=TRUE)})
  values <- lapply(values, function(x){ unlist( strsplit(x, split="=") ) })
  values <- lapply(values, function(x){x[2]})
  values <- lapply(values, function(x){ if(is.null(x)){NA}else{x} })
  values <- unlist(values)
  

  if(as.numeric == TRUE){
    values <- as.numeric(values)
  }
#  if( mask != FALSE & !is.null(mask) ){
#    values <- values[x@var.info$mask]
#    values <- values[mask]
#  }
  values
}





##### ##### ##### ##### #####
# EOF.

Try the vcfR package in your browser

Any scripts or data that you put into this service are public.

vcfR documentation built on May 29, 2024, 10:57 a.m.