#' @title Make input files for rehh package
#'
#' @description This function firstly extracts data according to user requirements, then convert .haps
#' in .thap files, updating their allele codification and creating a unique map (.inp) file for all
#' chromosomes and populations aimed by the user (files required as input in rehh package).
#'
#' @usage make.input.files(populations = "all", chrs = 1:22, haps.subset = F, output)
#'
#' @param populations Character vector with target population's file names (assumed to be in
#' 'populations' directory). If populations = "all" (default), then all files (populations)
#' will be included on analysis.
#' @param chrs Numeric vector with chromosome ID(s).
#' @param haps.subset Logical scalar. If TRUE, new .haps & .sample files will be generated
#' to the populations under analysis (default = FALSE).
#' @param output A character vector containing the desired population names as output.
#'
#' @details
#' R binary file acquired by make.binary.haplotypes.R() function is used at this function,
#' therefore essentially necessary ('haplotypes/haps-sample.RData').
#'
#' By default, all populations files in 'populations' directory are included in this function,
#' although users can also specify the populations in a vector with the exact file names
#' (without providing the path, which is supposed to be in 'populations' directory).
#'
#' The 'haps.subset' parameter enables the user to extract data from specific populations
#' as they are originally (.haps and .sample formats).
#'
#' Be careful when using 'output' parameter while using 'populations' parameter's default argument,
#' once you would not have control of the file names' order. If using the 'output' parameter,
#' you'd rather specify the populations' names as well. Default values to 'output' parameter
#' are automatically set according to acquired/loaded population's file names so that
#' there are no issues to worry about.
#'
#' @return Input files for rehh package: .thap, .sample, and .inp .
#' @seealso \code{\link{make.binary.haplotypes}}
#'
#' @examples
#' \dontrun{
#' make.input.files()
#' make.input.files(populations = c('Afr.txt', 'EUR.txt'), chrs = 1:10, haps.subset = T,
#' output = c('Africans','Europeans'))
#' }
#'
#' @export
make.input.files <- function(populations = "all", chrs = 1:22, haps.subset = F, output) {
# Population's files to import
if (length(populations) == 1 && populations == "all") {
populations <- list.files(path = "populations/", all.files = TRUE)
populations <- grep(pattern = "[[:alnum:]]", x = populations, value = TRUE)
}
populations.data <- lapply(X = populations, FUN = function(x) readLines(paste0("populations/", x)))
populations <- LD:::rm.txt(populations)
if (missing(output)) output <- populations
if (length(populations) != length(output)) {
stop("Output length must be equal to the number of populations analyzed.")
}
for(i in 1:length(populations.data)) comment(populations.data[[i]]) <- output[i]
names(populations.data) <- output
rm(populations)
cat(" Loading haps-sample.RData file... ")
load("./haplotypes/haps-sample.RData")
cat(" OK\n\n")
cat(" Extracting data for specified populations... \n\n")
populations.data <- lapply(populations.data, function(x) {
#### Extracting populations
phased.dataset <- mapply(function(haps, haps.info, sample, sample.info) {
# Assessing individuals' position in `.sample` and respective columns in `.haps` files for Amazonians
pos_sample <- which(sample$V2 %in% x)
pos_in_haps_a1 <- pos_sample * 2 - 1
pos_in_haps_a2 <- pos_sample * 2
pos_in_haps_both_alleles <- c(rbind(pos_in_haps_a1, pos_in_haps_a2))
# Writing filtered haps & sample files
new.haps <- cbind(haps.info, haps[, pos_in_haps_both_alleles])
new.sample <- rbind(sample.info, sample[pos_sample, ])
list(new.haps, new.sample)
}, haps = haps, haps.info = haps.info, sample = sample, sample.info = sample.info)
# splitting haps & sample files in two lists
new.haps <- phased.dataset[c(T, F)]
new.sample <- phased.dataset[c(F, T)]
new_haps_sample <- list(new.haps, new.sample)
cat(paste0("\tSuccessfully extracted data for \"", comment(x), "\"\n"))
return(new_haps_sample)
})
# Saving new haps & sample files
if (haps.subset == T) {
cat(paste("\n Saving new .haps & .sample files...\n\n"))
for (n in 1:length(populations.data)) {
for (i in 1:length(chrs)) {
data.table::fwrite(x = populations.data[[n]][[1]][[i]][, -c(1:5)],
file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".haps"), sep = " ", quote = F, row.names = F, col.names = F)
data.table::fwrite(x = populations.data[[n]][[2]][[i]],
file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".sample"), sep = " ", quote = F, row.names = F, col.names = F)
}
}
for (i in 1:length(populations.data)) cat(paste0("\tNew haps/sample files created for \"", output[i], "\" population\n"))
}
# Converting and saving new haps files to .thap
OS <- tolower(.Platform$OS.type)
if (OS == "unix") {
cat("\n Converting and saving to .thap & .sample files... \n\n")
for (n in 1:length(populations.data)) {
for (i in 1:length(chrs)) {
data.table::fwrite(populations.data[[n]][[1]][[i]][, -c(1:5)], file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".rehh.thap"),
sep = " ", quote = F, row.names = F, col.names = F)
data.table::fwrite(populations.data[[n]][[2]][[i]], file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".rehh.sample"),
sep = " ", quote = F, row.names = F, col.names = F)
}
cat(paste0("\tNew thap/sample files created for \"", output[n], "\" population\n"))
}
cat("\tUpdating code alleles from 0/1 to 1/2 as requested by rehh package...")
for (n in 1:length(populations.data)) {
for (i in 1:length(chrs)) {
system(paste0("tr 01 12 < rehh_in/chr", chrs[i], "_", output[n], ".rehh.thap > rehh_in/chr", chrs[i],
".thap && mv rehh_in/chr", chrs[i], ".thap rehh_in/chr", chrs[i], "_", output[n], ".rehh.thap"))
}
}
cat(" done!")
} else {
cat("\n Converting and saving to .thap & .sample files... \n\n")
for(n in 1:length(populations.data)) {
gt <- list()
for(i in 1:length(chrs)) {
gt[[i]] <- apply(populations.data[[n]][[1]][[i]][, -c(1:5)], 2, function(x) ifelse(x == 1, 2, 1))
data.table::fwrite(x = data.table::data.table(gt[[i]]),
file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".rehh.thap"), sep = " ", col.names = F)
data.table::fwrite(x = populations.data[[n]][[2]][[i]],
file = paste0("rehh_in/chr", chrs[i], "_", output[n], ".rehh.sample"), sep = " ", quote = F, row.names = F, col.names = F)
}
cat(paste0("\tNew thap/sample files created for \"", output[n], "\" population\n"))
}
}
cat(paste0("\n\n Conversion completed!\n"))
# Making the map file
inp <- do.call("rbind", haps.info)[, c(2, 1, 3:5)]
write.table(inp, file = "rehh_in/mapfileR.inp", quote = F, row.names = F, col.names = F)
cat(" Map file created.\n\n")
if(haps.subset == T) {
cat(paste0("\t", (length(populations.data) * (length(chrs) * 4)) + 1, " files have been written into \"input\" directory:\n"))
} else {
cat(paste0("\t", (length(populations.data) * (length(chrs) * 2)) + 1, " files have been written into \"input\" directory:\n"))
}
if(haps.subset == T) {
cat(paste0("\t - ", length(populations.data) * (length(chrs)), " files .haps\n"))
cat(paste0("\t - ", length(populations.data) * (length(chrs)), " files .sample\n"))
}
cat(paste0("\t - ", length(populations.data) * (length(chrs)), " files rehh.thap\n"))
cat(paste0("\t - ", length(populations.data) * (length(chrs)), " files rehh.sample\n"))
cat(paste0("\t - ", 1, " map file (.inp)\n"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.