Nothing
#' Read VCF file
#'
#' @param File
#' Path to VCF file
#' @param AlleleDepthField
#' Allele read depth FORMAT field in the VCF to get allele depth proportions.
#' If NULL (default), dosage is obtained from GT field
#' @param AlleleDepthType
#' Allele read depth type: either the depths associated with the REF and ALT
#' alleles (alleles, by default),
#' or the depths associated with each haplotypes reported in the GT field (haplotypes).
#' The latter can be used for a phased VCF generated by
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#' @param MaxMarkerMissing
#' Maximum proportion of missing values for marker above which the marker is
#' discarded
#' @param MaxIndMissing
#' Maximum proportion of missing values for individual above which the
#' individual is discarded (applied after marker filtering)
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Verbose
#' A boolean describing if detailed information should be printed
#'
#' @return
#' A list of three items: a filtered list of genotying matrices (`Geno`),
#' a dataframe with variant information (`MarkerInfo`),
#' and a dataframe to be used as a genetic map proxy for local admixture
#' inference (`GeneticMap`).
#' The genetic map proxy is based on physical positions assuming all
#' chromosomes are 100cM long.
#'
#' @description
#' This function reads and format VCF files
#'
#' @details
#' The function `ReadVCF()` reads Variant Call Format (VCF) files.
#'
#' By default, the dosages are obtained from the GT field of the VCF. However,
#' genotypic data are commonly "diploidized" in polyploid species where all
#' heterozygous classes are aggregated into one class. In this case, it is
#' recommended to specify the allele depth field of the VCF to work with allele
#' read depth ratios instead, e.g. `AlleleDepthField=AD`.
#'
#' If the VCF has been generated by the workflow
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer} where
#' alleles are phased within blocks, the read depths are not associated to alleles
#' but to haplotypes. In this case, read depth ratios associated to each allele
#' can be obtained by specifying `AlleleDepthType="haplotypes"`.
#'
#' Filtering of missing data can be applied to individuals and markers by
#' setting the maximum percentages `MaxMarkerMissing` and `MaxIndMissing`,
#' respectively.
#'
#' By default, all available threads/CPU cores are used but the number
#' can be chosen using `NbThreads`.
#'
#'
#' @importFrom data.table fread
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @import dplyr
#'
#' @seealso [ReadHPA()] to read Haplotype Presence-Absence files generated by
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#'
#' @export
#'
#' @examples
#' ## Read test VCF
#' vcf_path <- system.file("extdata", "Test.vcf", package = "AdmixPoly")
#' DataVCF <- AdmixPoly::ReadVCF(File = vcf_path, NbThreads=1)
ReadVCF <- function(File, AlleleDepthField=NULL, AlleleDepthType="alleles",
MaxMarkerMissing=0.2,MaxIndMissing=0.2,
NbThreads=0L,Verbose=TRUE){
## Initial checks
stopifnot("No file was found" =
file.exists(File))
stopifnot("MaxMarkerMissing must be a proportion between 0 (excluded) and 1" =
MaxMarkerMissing>0 && MaxMarkerMissing<=1)
stopifnot("MaxIndMissing must be a proportion between 0 (excluded) and 1" =
MaxIndMissing>0 && MaxIndMissing<=1)
## Define global variable for tidy operations
MARKER <- NULL
CHROM <- NULL
POS <- NULL
ID <- NULL
REF <- NULL
ALT <- NULL
QUAL <- NULL
FILTER <- NULL
INFO <- NULL
Chromosome <- NULL
Marker <- NULL
Distance <- NULL
## Load hpa using data.table
if(Verbose) cat("Load VCF using data.table:\n")
RawVCF <- data.table::fread(file = File, skip = "CHROM")
## Variant information
MarkerInfo <- RawVCF[,1:9] %>%
as_tibble %>%
rename(CHROM = "#CHROM") %>%
unite(MARKER,CHROM,POS,remove = F)
## Additional checks
stopifnot("Absence of GT in FORMAT field" =
grepl("GT",RawVCF$FORMAT[1]))
if(!is.null(AlleleDepthField)){
stopifnot("AlleleDepthField was not found in FORMAT field" =
grepl(AlleleDepthField,RawVCF$FORMAT[1]))
stopifnot("AlleleDepthType must be informed along with AlleleDepthField" =
!is.null(AlleleDepthType) && AlleleDepthType%in%c("alleles","haplotypes"))
}
## Format markers
Geno <- .FormatVariant(RawVCF, MarkerInfo, AlleleDepthField, AlleleDepthType,
NbThreads, Verbose)
## Test if filtering needed
# if(all(MaxMarkerMissing==1 && MaxIndMissing==1)){
# if(Verbose) cat("No filtering for missing values.\n")
# Geno_filt <- Geno
# }else{
Geno_filt <- .FilterMissing(Geno, MaxMarkerMissing, MaxIndMissing,
NbThreads, Verbose)
if(Verbose) cat("Filtering for missing values: updating MarkerInfo\r")
MarkerInfo <- MarkerInfo %>%
filter(MARKER%in%names(Geno_filt)) %>%
select(MARKER,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO)
if(Verbose) {
cat(" \r")
cat("Filtering for missing values:\n")
}
# }
## Print information
if(Verbose) {
cat(" - Number of individuals: ",nrow(Geno_filt[[1]]),"\n",sep="")
cat(" - Number of markers: ",length(Geno_filt),"\n",sep="")
}
## Build genetic map
if(Verbose) cat('Building genetic map proxy\n')
GeneticMap <- MarkerInfo %>%
select(MARKER,CHROM,POS) %>%
mutate(POS = as.numeric(POS)) %>%
group_by(CHROM) %>%
mutate(Distance = if (n() == 1) 0 else (POS - min(POS)) / (max(POS) - min(POS)) * 100) %>%
ungroup %>%
rename(Marker=MARKER) %>%
rename(Chromosome=CHROM) %>%
select(c(Chromosome,Marker,Distance))
## Return outputs
return(list(Geno=Geno_filt, MarkerInfo=MarkerInfo, GeneticMap=GeneticMap))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.