R/ReadHPA.R

Defines functions ReadHPA

Documented in ReadHPA

#' Read Haplotype Presence-Absence (HPA) file 
#'
#' @param File
#' Path to HPA file
#' @param TotalDepthField
#' Total read depth FORMAT field in the HPA file to get allele
#' depth proportions
#' @param AlleleDepthField
#' Alternative allele read depth FORMAT field in the HPA file to get allele
#' depth proportions
#' @param MaxMarkerMissing
#' Maximum proportion of missing values for marker above which the marker is
#' discarded
#' @param MaxIndMissing
#' Maximum proportion of missing values for individual above which the
#' individual is discarded (applied after marker filtering)
#' @param NbThreads
#' Number of threads to be used (positive integer) with a default value of 0
#' setting automatically all threads available
#' @param Verbose
#' A boolean describing if detailed information should be printed
#'
#' @return
#' A list of three items: a filtered list of genotying matrices (`Geno`),
#' a dataframe with variant information (`MarkerInfo`),
#' and a dataframe to be used as a genetic map proxy for local admixture 
#' inference (`GeneticMap`). 
#' The genetic map proxy is based on physical positions assuming all 
#' chromosomes are 100cM long. 
#' 
#' @description
#' This function reads and format HPA files generated by 
#' \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#'
#' @details
#' The function `ReadHPA()` reads Hapotype Presence-Absence (HPA) files generated 
#' by \href{https://gitlab.cirad.fr/agap/seg/haplocharmer}{HaploCharmer}.
#' 
#' This function generates genotypic data coded as haplotype read depth ratios 
#' using the read depths from the haplotype (`AlleleDepthField`) and total 
#' read depth (`TotalDepthField`) fields of the HPA file.
#' 
#' Filtering of missing data can be applied to individuals and markers by
#' setting the maximum percentages `MaxMarkerMissing` and `MaxIndMissing`,
#' respectively.
#' 
#' By default, all available threads/CPU cores are used but the number
#' can be chosen using `NbThreads`.
#'
#' @importFrom data.table fread
#' @importFrom magrittr %>%
#' @importFrom magrittr set_names
#' @import dplyr
#' 
#' @seealso [ReadHPA()] to read VCF files
#'
#' @export
#'
#' @examples
#' ## Read HPA
#' hpa_path <- system.file("extdata", "Test.hpa", package = "AdmixPoly")
#' DataHPA <- AdmixPoly::ReadHPA(File = hpa_path, NbThreads=1)
ReadHPA <- function(File, MaxMarkerMissing=0.2,MaxIndMissing=0.2,
                    TotalDepthField="DP", AlleleDepthField="AD",
                    NbThreads=0L, Verbose=TRUE){
  ## Initial checks
  stopifnot("No file was found" =
              file.exists(File))
  stopifnot("MaxMarkerMissing must be a proportion between 0 (excluded) and 1" =
              MaxMarkerMissing>0 && MaxMarkerMissing<=1)
  stopifnot("MaxIndMissing must be a proportion between 0 (excluded) and 1" =
              MaxIndMissing>0 && MaxIndMissing<=1)
  
  ## Define global variable for tidy operations
  HAPLOTYPE <- NULL
  PS <- NULL
  CHROM <- NULL
  START <- NULL
  END <- NULL
  SEQUENCE <- NULL
  POS <- NULL
  Chromosome <- NULL
  Marker <- NULL
  Distance <- NULL
  
  ## Load hpa using data.table
  if(Verbose) cat("Load HPA using data.table:\n")
  RawHPA <- data.table::fread(file = File, skip = "CHROM")
  
  ## Variant information
  MarkerInfo <- RawHPA[,1:6] %>%
    as_tibble %>% 
    rename(HAPLOTYPE = "#HAPLOTYPE") %>% 
    unite(PS,CHROM,START,END,remove = FALSE) %>% 
    separate(HAPLOTYPE, into = c(NA,NA,NA,"HAPLOTYPE"))
  
  ## Get unique PS
  PS_unique <- unique(MarkerInfo$PS)
  if(Verbose) cat("Number of phase sets identified:",length(PS_unique),"\n")

  ## Additional checks
  stopifnot("TotalDepthField was not found in FORMAT field" =
              grepl(TotalDepthField,RawHPA$FORMAT[1]))
  stopifnot("AlleleDepthField was not found in FORMAT field" =
              grepl(AlleleDepthField,RawHPA$FORMAT[1]))

  ## Format phase sets
  Geno <- .FormatHaplotype(RawHPA, MarkerInfo, PS_unique, NbThreads, Verbose)

  ## Test if filtering needed
  # if(all(MaxMarkerMissing==1 && MaxIndMissing==1)){ 
  #   Geno_filt <- Geno
  #   if(Verbose) cat("No filtering for missing values.\n")
  # }else{
    Geno_filt <- .FilterMissing(Geno, MaxMarkerMissing, MaxIndMissing, 
                                NbThreads, Verbose)
    if(Verbose) cat("Filtering for missing values: updating MarkerInfo\r")
    Haplotype_filt <- sapply(1:length(Geno_filt),function(i)
      paste(names(Geno_filt)[i],colnames(Geno_filt[[i]]),sep="_")) %>% 
      unlist
    MarkerInfo <- MarkerInfo %>% 
      filter(paste(PS,HAPLOTYPE,sep="_")%in%Haplotype_filt) %>% 
      select(HAPLOTYPE,PS,CHROM,START,END,SEQUENCE)
    if(Verbose){
      cat("                                                 \r")
      cat("Filtering for missing values:\n")
    }
  # }
  
  ## Print information
  if(Verbose){
    cat(" - Number of individuals: ",nrow(Geno_filt[[1]]),"\n",sep="")
    cat(" - Number of markers: ",length(Geno_filt),"\n",sep="")
  }
  
  ## Build genetic map
  if(Verbose) cat('Building genetic map proxy\n')
  GeneticMap <- MarkerInfo %>% 
    select(PS,CHROM,START) %>% 
    filter(!duplicated(PS)) %>% 
    mutate(POS = as.numeric(START)) %>%
    group_by(CHROM) %>%
    mutate(Distance = if (n() == 1) 0 else (POS - min(POS)) / (max(POS) - min(POS)) * 100) %>% 
    ungroup %>% 
    rename(Marker=PS) %>%
    rename(Chromosome=CHROM) %>%
    select(c(Chromosome,Marker,Distance))

  ## Return outputs
  return(list(Geno=Geno_filt, MarkerInfo=MarkerInfo, GeneticMap=GeneticMap))
}

Try the AdmixPoly package in your browser

Any scripts or data that you put into this service are public.

AdmixPoly documentation built on June 18, 2026, 1:06 a.m.