R/gen_tibble_packedancestry.R

Defines functions gen_tibble_packedancestry

# A function to read geno packedancestrymap files
gen_tibble_packedancestry <- function(
    x,
    ...,
    valid_alleles = c("A", "T", "C", "G"),
    missing_alleles = c("0", "."),
    backingfile = NULL,
    quiet = FALSE) {
  # Substitute .geno with .snp
  map_file <- sub("\\.geno$", ".snp", x)
  if (!file.exists(map_file)) {
    stop("snp file ", map_file, " does not exist")
  }
  ind_file <- sub("\\.geno$", ".ind", x)
  if (!file.exists(ind_file)) {
    stop("ind file ", ind_file, " does not exist")
  }

  if (is.null(backingfile)) {
    backingfile <- sub("\\.geno$", "", x)
  }
  if (file_ext(backingfile) == "bk") {
    backingfile <- bigstatsr::sub_bk(backingfile, "")
  }

  # read the packed ancestry in chunks
  # count the individuals
  indiv_table <- utils::read.table(ind_file, header = FALSE)
  names(indiv_table) <- c("id", "sex", "population")
  no_individuals <- nrow(indiv_table)

  # count the loci
  loci_table <- utils::read.table(map_file, header = FALSE)
  names(loci_table) <- c(
    "name",
    "chromosome",
    "genetic_dist",
    "position",
    "allele_ref",
    "allele_alt"
  )
  loci_table$allele_alt[loci_table$allele_alt == "X"] <- NA
  no_variants <- nrow(loci_table)

  # verify that these numbers are compatible with the geno file
  conn <- file(x, "rb")
  hd <- strsplit(readBin(conn, "character", n = 1), " +")[[1]]
  close(conn)
  if (no_individuals != as.numeric(hd[2])) {
    stop(
      "Number of individuals in geno file does not match the number of ",
      "individuals in the ind file"
    )
  }
  if (no_variants != as.numeric(hd[3])) {
    stop(
      "Number of variants in geno file does not match the number of ",
      "variants in the snp file"
    )
  }


  # create a matrix to store the data
  file_backed_matrix <- bigstatsr::FBM.code256(
    nrow = no_individuals,
    ncol = no_variants,
    code = bigsnpr::CODE_012,
    backingfile = backingfile
  )

  # Fill the FBM from bedfile
  reach_eof <- read_packedancestry(x, file_backed_matrix,
    tab = get_packedancestry_code()
  )

  if (!reach_eof) warning("EOF of bedfile has not been reached.")

  # save the fbm
  file_backed_matrix$save()

  # convert the info from the indiv and loci table
  fam <- tibble(
    family.ID = indiv_table$population,
    sample.ID = indiv_table$id,
    paternal.ID = 0,
    maternal.ID = 0,
    sex = dplyr::case_when(
      indiv_table$sex %in% c("M", "male", 1) ~ 1L,
      indiv_table$sex %in% c("F", "female", 2) ~ 2L,
      TRUE ~ 0L
    ),
    affection = 0,
    ploidy = 2
  )

  map <- tibble(
    chromosome = loci_table$chromosome,
    marker.ID = loci_table$name,
    genetic.dist = loci_table$genetic_dist,
    physical.pos = loci_table$position,
    # note the swap below due to the different conventions in plink vs
    # packed ancestry
    allele1 = loci_table$allele_alt,
    allele2 = loci_table$allele_ref
  )

  bigsnp_obj <- structure(
    list(
      genotypes = file_backed_matrix,
      fam = fam,
      map = map
    ),
    class = "bigSNP"
  )

  bigsnp_obj <- bigsnpr::snp_save(bigsnp_obj)

  gen_tibble(
    bigsnp_obj$genotypes$rds,
    valid_alleles = valid_alleles,
    missing_alleles = missing_alleles,
    backingfile = backingfile,
    quiet = TRUE
  ) # silence it as output will be generated by the parent function
}

Try the tidypopgen package in your browser

Any scripts or data that you put into this service are public.

tidypopgen documentation built on Aug. 28, 2025, 1:08 a.m.