Nothing
#' Read Haplotype Presence-Absence (HPA) file
#'
#' @param File
#' Path to HPA file
#' @param TotalDepthField
#' Total read depth FORMAT field in the HPA file to get allele
#' depth proportions
#' @param AlleleDepthField
#' Alternative allele read depth FORMAT field in the HPA file to get allele
#' depth proportions
#' @param MaxMarkerMissing
#' Maximum proportion of missing values for marker above which the marker is
#' discarded
#' @param MaxIndMissing
#' Maximum proportion of missing values for individual above which the
#' individual is discarded (applied after marker filtering)
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Verbose
#' A boolean describing if detailed information should be printed
#'
#' @return
#' A list of three items: a filtered list of genotying matrices (`Geno`),
#' a dataframe with variant information (`MarkerInfo`),
#' and a dataframe to be used as a genetic map proxy for local admixture
#' inference (`GeneticMap`).
#' The genetic map proxy is based on physical positions assuming all
#' chromosomes are 100cM long.
#'
#' @description
#' This function reads and format HPA files generated by
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#'
#' @details
#' The function `ReadHPA()` reads Hapotype Presence-Absence (HPA) files generated
#' by \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#'
#' This function generates genotypic data coded as haplotype read depth ratios
#' using the read depths from the haplotype (`AlleleDepthField`) and total
#' read depth (`TotalDepthField`) fields of the HPA file.
#'
#' Filtering of missing data can be applied to individuals and markers by
#' setting the maximum percentages `MaxMarkerMissing` and `MaxIndMissing`,
#' respectively.
#'
#' By default, all available threads/CPU cores are used but the number
#' can be chosen using `NbThreads`.
#'
#' @importFrom data.table fread
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @import dplyr
#'
#' @seealso [ReadHPA()] to read VCF files
#'
#' @export
#'
#' @examples
#' ## Read HPA
#' hpa_path <- system.file("extdata", "Test.hpa", package = "AdmixPoly")
#' DataHPA <- AdmixPoly::ReadHPA(File = hpa_path, NbThreads=1)
ReadHPA <- function(File, MaxMarkerMissing=0.2,MaxIndMissing=0.2,
TotalDepthField="DP", AlleleDepthField="AD",
NbThreads=0L, Verbose=TRUE){
## Initial checks
stopifnot("No file was found" =
file.exists(File))
stopifnot("MaxMarkerMissing must be a proportion between 0 (excluded) and 1" =
MaxMarkerMissing>0 && MaxMarkerMissing<=1)
stopifnot("MaxIndMissing must be a proportion between 0 (excluded) and 1" =
MaxIndMissing>0 && MaxIndMissing<=1)
## Define global variable for tidy operations
HAPLOTYPE <- NULL
PS <- NULL
CHROM <- NULL
START <- NULL
END <- NULL
SEQUENCE <- NULL
POS <- NULL
Chromosome <- NULL
Marker <- NULL
Distance <- NULL
## Load hpa using data.table
if(Verbose) cat("Load HPA using data.table:\n")
RawHPA <- data.table::fread(file = File, skip = "CHROM")
## Variant information
MarkerInfo <- RawHPA[,1:6] %>%
as_tibble %>%
rename(HAPLOTYPE = "#HAPLOTYPE") %>%
unite(PS,CHROM,START,END,remove = FALSE) %>%
separate(HAPLOTYPE, into = c(NA,NA,NA,"HAPLOTYPE"))
## Get unique PS
PS_unique <- unique(MarkerInfo$PS)
if(Verbose) cat("Number of phase sets identified:",length(PS_unique),"\n")
## Additional checks
stopifnot("TotalDepthField was not found in FORMAT field" =
grepl(TotalDepthField,RawHPA$FORMAT[1]))
stopifnot("AlleleDepthField was not found in FORMAT field" =
grepl(AlleleDepthField,RawHPA$FORMAT[1]))
## Format phase sets
Geno <- .FormatHaplotype(RawHPA, MarkerInfo, PS_unique, NbThreads, Verbose)
## Test if filtering needed
# if(all(MaxMarkerMissing==1 && MaxIndMissing==1)){
# Geno_filt <- Geno
# if(Verbose) cat("No filtering for missing values.\n")
# }else{
Geno_filt <- .FilterMissing(Geno, MaxMarkerMissing, MaxIndMissing,
NbThreads, Verbose)
if(Verbose) cat("Filtering for missing values: updating MarkerInfo\r")
Haplotype_filt <- sapply(1:length(Geno_filt),function(i)
paste(names(Geno_filt)[i],colnames(Geno_filt[[i]]),sep="_")) %>%
unlist
MarkerInfo <- MarkerInfo %>%
filter(paste(PS,HAPLOTYPE,sep="_")%in%Haplotype_filt) %>%
select(HAPLOTYPE,PS,CHROM,START,END,SEQUENCE)
if(Verbose){
cat(" \r")
cat("Filtering for missing values:\n")
}
# }
## Print information
if(Verbose){
cat(" - Number of individuals: ",nrow(Geno_filt[[1]]),"\n",sep="")
cat(" - Number of markers: ",length(Geno_filt),"\n",sep="")
}
## Build genetic map
if(Verbose) cat('Building genetic map proxy\n')
GeneticMap <- MarkerInfo %>%
select(PS,CHROM,START) %>%
filter(!duplicated(PS)) %>%
mutate(POS = as.numeric(START)) %>%
group_by(CHROM) %>%
mutate(Distance = if (n() == 1) 0 else (POS - min(POS)) / (max(POS) - min(POS)) * 100) %>%
ungroup %>%
rename(Marker=PS) %>%
rename(Chromosome=CHROM) %>%
select(c(Chromosome,Marker,Distance))
## Return outputs
return(list(Geno=Geno_filt, MarkerInfo=MarkerInfo, GeneticMap=GeneticMap))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.