#Function: ghap.haplotyping
#License: GPLv3 or later
#Modification date: 16 Dec 2022
#Written by: Yuri Tani Utsunomiya & Marco Milanesi
#Contact: ytutsunomiya@gmail.com, marco.milanesi.mm@gmail.com
#Description: Output haplotype genotype matrix for user-defined haplotype blocks
ghap.haplotyping <- function(
object,
blocks,
outfile,
freq=c(0,1),
drop.minor=FALSE,
only.active.samples=TRUE,
only.active.markers=TRUE,
batchsize=NULL,
binary=TRUE,
ncores=1,
verbose=TRUE
){
#Check if phase is a GHap.phase object
if(inherits(object, "GHap.phase") == FALSE){
stop("Argument phase must be a GHap.phase object.")
}
#Insert suffix to outfile
hapgenotypes <- paste(outfile,"hapgenotypes",sep=".")
hapalleles <- paste(outfile,"hapalleles",sep=".")
hapsamples <- paste(outfile,"hapsamples",sep=".")
if(binary == TRUE){
hapgenotypes <- paste(hapgenotypes,"b",sep="")
}
#Check if output will overwrite existing files before opening connection
if(file.exists(hapsamples) == TRUE){
stop(paste("File", hapsamples, "already exists!"))
}
if(file.exists(hapgenotypes) == TRUE){
stop(paste("File", hapgenotypes, "already exists!"))
}
if(file.exists(hapalleles) == TRUE){
stop(paste("File", hapalleles, "already exists!"))
}
#Check if inactive markers and samples should be reactived
if(only.active.markers == FALSE){
object$marker.in <- rep(TRUE,times=object$nmarkers)
object$nmarkers.in <- length(which(object$marker.in))
}
if(only.active.samples == FALSE){
object$id.in <- rep(TRUE,times=2*object$nsamples)
object$nsamples.in <- length(which(object$id.in))/2
}
#Identify activated samples
ids.in <- which(object$id.in)
id <- object$id[ids.in]
id <- id[1:length(id) %% 2 == 0]
pop <- object$pop[ids.in]
pop <- pop[1:length(pop) %% 2 == 0]
ids.n <- length(id)
#Output hapsamples file
fwrite(x = as.data.table(cbind(pop,id)), file = hapsamples, quote = FALSE,
sep=" ", row.names = FALSE, col.names=FALSE)
#Generate batch index
if(is.null(batchsize) == TRUE){
batchsize <- ceiling(nrow(blocks)/10)
}
if(batchsize > nrow(blocks)){
batchsize <- nrow(blocks)
}
id1<-seq(1,nrow(blocks),by=batchsize)
id2<-(id1+batchsize)-1
id1<-id1[id2<=nrow(blocks)]
id2<-id2[id2<=nrow(blocks)]
id1 <- c(id1,id2[length(id2)]+1)
id2 <- c(id2,nrow(blocks))
if(id1[length(id1)] > nrow(blocks)){
id1 <- id1[-length(id1)]; id2 <- id2[-length(id2)]
}
#Log message
if(verbose == TRUE){
cat("Processing ", nrow(blocks), " blocks in:\n", sep="")
batch <- table((id2-id1)+1)
for(i in 1:length(batch)){
cat(batch[i]," batches of ",names(batch[i]),"\n",sep="")
}
}
# Initialize lookup table for output
lookup2 <- rep(NA,times=256)
lookup2[1:2] <- c(0,1)
d <- 10
i <- 3
while(i <= 256){
b <- d + lookup2[1:(i-1)]
lookup2[i:(length(b)+i-1)] <- b
i <- i + length(b)
d <- d*10
}
lookup2 <- sprintf(fmt="%08d", lookup2)
lookup2 <- sapply(lookup2, function(i){intToUtf8(rev(utf8ToInt(i)))})
# Print magic number [01101100 00011011] and mode [00000001]
# For compatibility with PLINK
if(binary == TRUE){
hapgenotypes.con <- file(hapgenotypes, open = "ab")
writeBin(object = as.integer(c(108,27,1)), con = hapgenotypes.con, size = 1)
close(con = hapgenotypes.con)
}
# Define block function
block.iter.FUN<-function(i){
outline <- NULL
#Get block info
block.info <- blocks[i,c("BLOCK","CHR","BP1","BP2")]
#SNPs in the block
snps <- which(object$chr == block.info$CHR &
object$bp >= block.info$BP1 &
object$bp <= block.info$BP2 &
object$marker.in == TRUE)
phase.A0 <- object$A0[snps]
phase.A1 <- object$A1[snps]
if(length(snps) >= 1){
#Subset block
if(length(snps) == 1){
block.subset <- ghap.slice(object = object,
ids = unique(object$id[ids.in]),
variants = object$marker[snps])
haplotypes <- as.character(block.subset)
}else{
block.subset <- ghap.slice(object = object,
ids = unique(object$id[ids.in]),
variants = object$marker[snps])
haplotypes <- apply(block.subset,MARGIN = 2, paste, collapse="")
}
#Haplotype library
lib <- table(haplotypes)/(2*ids.n)
lib <- sort(lib)
if(drop.minor == TRUE & length(lib) > 1){
lib <- lib[-1]
}
lib <- lib[lib >= freq[1] & lib <= freq[2]]
#Output genotype matrix
if(length(lib) >= 1){
for(j in names(lib)){
phase.geno <- as.integer(haplotypes[1:length(haplotypes) %% 2 == 0] == j)
phase.geno <- phase.geno + as.integer(haplotypes[1:length(haplotypes) %% 2 == 1] == j)
j <- unlist(strsplit(j,""))
phase.allele <- rep(NA,times=length(j))
phase.allele[j == "0"] <- phase.A0[j == "0"]
phase.allele[j == "1"] <- phase.A1[j == "1"]
phase.allele <- paste(phase.allele,collapse="")
phase.info<-paste(paste(block.info,collapse=" "),phase.allele,sep=" ")
phase.geno<-paste(phase.geno,collapse=" ")
outline <- c(outline,phase.info,phase.geno)
}
}
}
return(outline)
}
#Compute bitloss
bitloss <- 8 - ((2*ids.n) %% 8)
if(bitloss == 8){
bitloss <- 0
}
#Define binary conversion function
toBitFUN <- function(i){
line <- scan(text=hapgenotypes.out[i], what = "character", sep=" ", quiet = TRUE)
line[which(line == "0")] <- "00"
line[which(line == "1")] <- "01"
line[which(line == "2")] <- "11"
line <- c(line,rep("0",times=bitloss))
line <- paste(line, collapse = "")
nc <- nchar(line)
n <- seq(1, nc, by = 8)
line <- substring(line, n, c(n[-1]-1, nc))
line <- strtoi(lookup2[line], base=2)
return(line)
}
#Iterate blocks
nblocks.done <- 0
ncores <- min(c(detectCores(), ncores))
for(i in 1:length(id1)){
#Compute blocks
if(Sys.info()["sysname"] == "Windows"){
cl <- makeCluster(ncores)
mylines <- unlist(parLapply(cl = cl, fun = block.iter.FUN, X = id1[i]:id2[i]))
stopCluster(cl)
}else{
mylines <- unlist(mclapply(FUN = block.iter.FUN, X = id1[i]:id2[i], mc.cores = ncores))
}
#Write batch to files
if(is.null(mylines) == F){
#Write hapalleles
hapalleles.out <- mylines[1:length(mylines) %% 2 == 1]
hapalleles.con <- file(hapalleles, open = "a")
writeLines(text = hapalleles.out, con=hapalleles.con)
close(con = hapalleles.con)
#Write hapgenotypes
hapgenotypes.out <- mylines[1:length(mylines) %% 2 == 0]
if(binary == TRUE){
if(Sys.info()["sysname"] == "Windows"){
cl <- makeCluster(ncores)
hapgenotypes.out <- unlist(parLapply(cl = cl, fun = toBitFUN, X = 1:length(hapgenotypes.out)))
stopCluster(cl)
}else{
hapgenotypes.out <- unlist(mclapply(FUN = toBitFUN, X = 1:length(hapgenotypes.out), mc.cores = ncores))
}
hapgenotypes.con <- file(hapgenotypes, open = "ab")
writeBin(object = hapgenotypes.out, con = hapgenotypes.con, size = 1)
close(con = hapgenotypes.con)
}else{
hapgenotypes.con <- file(hapgenotypes, open = "a")
writeLines(text = hapgenotypes.out, con=hapgenotypes.con)
close(con = hapgenotypes.con)
}
}
#Log message
if(verbose == TRUE){
nblocks.done <- nblocks.done + (id2[i]-id1[i]) + 1
cat(nblocks.done, "blocks written to file\r")
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.