R/genotype_dose_matrix.R

Defines functions extractGenotypeMatrixFromDosage

Documented in extractGenotypeMatrixFromDosage

#' Extract genotype matrix from a dosage file generated by bcftools
#'
#' @param chr chromosome of the genomic region
#' @param start start coordinate of the genomic region
#' @param end end coordinate of the genomic region
#' @param dosage_file Path to the tabix-indexed dosage file (first four columns need to be CHROM, POS, REF, ALT)
#'
#' @return genotype matrix
#' @export
extractGenotypeMatrixFromDosage <- function(chr, start, end, dosage_file){
  #Make a GRanges object
  granges = GenomicRanges::GRanges(seqnames = chr, ranges = IRanges::IRanges(start = start, end = end))
  
  #Extract header
  header = as.character(read.table(dosage_file, nrows = 1, stringsAsFactors = F))
  coltypes = paste0("cccc", paste(rep("d",length(header)-4), collapse = ""))
  
  #Extract genotypes
  gt_df = eQTLUtils::scanTabixDataFrame(dosage_file, granges, col_names = header, col_types = coltypes)[[1]]
  
  #Construct variant ids
  id_df = dplyr::select(gt_df, CHROM, POS, REF, ALT) %>%
    dplyr::mutate(variant_id = paste(paste0("chr", CHROM), POS, REF, ALT, sep = "_"))

  #Make genotype matrix
  gt_matrix = gt_df[,-c(1:4)] %>% as.matrix()
  row.names(gt_matrix) = id_df$variant_id

  return(gt_matrix)
}
kauralasoo/eQTLUtils documentation built on March 12, 2023, 8:50 p.m.