R/ReadVCF.R

Defines functions ReadVCF

Documented in ReadVCF

#' Read VCF file
#'
#' @param File
#' Path to VCF file
#' @param AlleleDepthField
#' Allele read depth FORMAT field in the VCF to get allele depth proportions.
#' If NULL (default), dosage is obtained from GT field
#' @param AlleleDepthType
#' Allele read depth type: either the depths associated with the REF and ALT
#' alleles (alleles, by default), 
#' or the depths associated with each haplotypes reported in the GT field (haplotypes). 
#' The latter can be used for a phased VCF generated by 
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#' @param MaxMarkerMissing
#' Maximum proportion of missing values for marker above which the marker is
#' discarded
#' @param MaxIndMissing
#' Maximum proportion of missing values for individual above which the
#' individual is discarded (applied after marker filtering)
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Verbose
#' A boolean describing if detailed information should be printed
#'
#' @return
#' A list of three items: a filtered list of genotying matrices (`Geno`),
#' a dataframe with variant information (`MarkerInfo`),
#' and a dataframe to be used as a genetic map proxy for local admixture 
#' inference (`GeneticMap`). 
#' The genetic map proxy is based on physical positions assuming all 
#' chromosomes are 100cM long. 
#'
#' @description
#' This function reads and format VCF files
#' 
#' @details
#' The function `ReadVCF()` reads Variant Call Format (VCF) files.
#' 
#' By default, the dosages are obtained from the GT field of the VCF. However,
#' genotypic data are commonly "diploidized" in polyploid species where all 
#' heterozygous classes are aggregated into one class. In this case, it is 
#' recommended to specify the allele depth field of the VCF to work with allele
#' read depth ratios instead, e.g. `AlleleDepthField=AD`.
#' 
#' If the VCF has been generated by the workflow 
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer} where 
#' alleles are phased within blocks, the read depths are not associated to alleles
#' but to haplotypes. In this case, read depth ratios associated to each allele 
#' can be obtained by specifying `AlleleDepthType="haplotypes"`.
#' 
#' Filtering of missing data can be applied to individuals and markers by
#' setting the maximum percentages `MaxMarkerMissing` and `MaxIndMissing`,
#' respectively.
#' 
#' By default, all available threads/CPU cores are used but the number
#' can be chosen using `NbThreads`.
#' 
#'
#' @importFrom data.table fread
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @import dplyr
#'
#' @seealso [ReadHPA()] to read Haplotype Presence-Absence files generated by
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#'
#' @export
#'
#' @examples
#' ## Read test VCF
#' vcf_path <- system.file("extdata", "Test.vcf", package = "AdmixPoly")
#' DataVCF <- AdmixPoly::ReadVCF(File = vcf_path, NbThreads=1)
ReadVCF <- function(File, AlleleDepthField=NULL, AlleleDepthType="alleles",
                    MaxMarkerMissing=0.2,MaxIndMissing=0.2,
                    NbThreads=0L,Verbose=TRUE){
  ## Initial checks
  stopifnot("No file was found" =
              file.exists(File))
  stopifnot("MaxMarkerMissing must be a proportion between 0 (excluded) and 1" =
              MaxMarkerMissing>0 && MaxMarkerMissing<=1)
  stopifnot("MaxIndMissing must be a proportion between 0 (excluded) and 1" =
              MaxIndMissing>0 && MaxIndMissing<=1)

  ## Define global variable for tidy operations
  MARKER <- NULL
  CHROM <- NULL
  POS <- NULL
  ID <- NULL
  REF <- NULL
  ALT <- NULL
  QUAL <- NULL
  FILTER <- NULL
  INFO <- NULL
  Chromosome <- NULL
  Marker <- NULL
  Distance <- NULL
  
  ## Load hpa using data.table
  if(Verbose) cat("Load VCF using data.table:\n")
  RawVCF <- data.table::fread(file = File, skip = "CHROM")

  ## Variant information
  MarkerInfo <- RawVCF[,1:9] %>%
    as_tibble %>% 
    rename(CHROM = "#CHROM") %>% 
    unite(MARKER,CHROM,POS,remove = F)
  
  ## Additional checks
  stopifnot("Absence of GT in FORMAT field" =
              grepl("GT",RawVCF$FORMAT[1]))
  if(!is.null(AlleleDepthField)){
    stopifnot("AlleleDepthField was not found in FORMAT field" =
                grepl(AlleleDepthField,RawVCF$FORMAT[1]))
    stopifnot("AlleleDepthType must be informed along with AlleleDepthField" =
                !is.null(AlleleDepthType) && AlleleDepthType%in%c("alleles","haplotypes"))
  }
  
  ## Format markers
  Geno <- .FormatVariant(RawVCF, MarkerInfo, AlleleDepthField, AlleleDepthType, 
                         NbThreads, Verbose)

  ## Test if filtering needed
  # if(all(MaxMarkerMissing==1 && MaxIndMissing==1)){
  #   if(Verbose) cat("No filtering for missing values.\n")
  #   Geno_filt <- Geno
  # }else{
    Geno_filt <- .FilterMissing(Geno, MaxMarkerMissing, MaxIndMissing, 
                                NbThreads, Verbose)
    if(Verbose) cat("Filtering for missing values: updating MarkerInfo\r")
    MarkerInfo <- MarkerInfo %>% 
      filter(MARKER%in%names(Geno_filt)) %>% 
      select(MARKER,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO)
    if(Verbose) {
      cat("                                                 \r")
      cat("Filtering for missing values:\n")
    }
  # }
  
  ## Print information
  if(Verbose) {
    cat(" - Number of individuals: ",nrow(Geno_filt[[1]]),"\n",sep="")
    cat(" - Number of markers: ",length(Geno_filt),"\n",sep="")
  }
  
  ## Build genetic map
  if(Verbose) cat('Building genetic map proxy\n')
  GeneticMap <- MarkerInfo %>% 
    select(MARKER,CHROM,POS) %>% 
    mutate(POS = as.numeric(POS)) %>%
    group_by(CHROM) %>%
    mutate(Distance = if (n() == 1) 0 else (POS - min(POS)) / (max(POS) - min(POS)) * 100) %>% 
    ungroup %>% 
    rename(Marker=MARKER) %>%
    rename(Chromosome=CHROM) %>%
    select(c(Chromosome,Marker,Distance))

  ## Return outputs
  return(list(Geno=Geno_filt, MarkerInfo=MarkerInfo, GeneticMap=GeneticMap))
}

Try the AdmixPoly package in your browser

Any scripts or data that you put into this service are public.

AdmixPoly documentation built on June 18, 2026, 1:06 a.m.