# Contains supermassa_error, depth_prepare and supermassa_parallel
##' Runs SuperMassa for multiple markers using doParallel packages
##' Updates OneMap object in genotypes, error probabilitie and marker type
##' Also removes non-informative markers according to SuperMassa genotyping.
##' @param vcfR.object object output of the vcfR package
##' @param onemap.object object of class onemap
##' @param vcf.par Field of VCF that informs the depth of alleles
##' @param parent1 parent 1 identification in vcfR object
##' @param parent2 parent 2 identification in vcfR objetc
##' @param f1 f1 individual identification if f2 cross type
##' @param recovering logical defining if markers should be recovered from VCF
##' @param mean_phred the mean phred score of the sequencer technology
##' @param cores number of threads
##' @param depths list containing a matrix for ref and other for alt allele counts, samples ID in colum and markers ID in rows
##' @param global_error number from 0 to 1 defining the global error to be considered together
##' with the genotype errors or the genotype probabilities or NULL to not considered any global error
##' @param use_genotypes_errors if \code{TRUE} the error probability of each genotype will be considered in emission function of HMM
##' @param use_genotype_probs if \code{TRUE} the probability of each possible genotype will be considered in emission function of HMM
##'
##' @import doParallel parallel
##'
##' @importFrom matrixStats logSumExp
##'
##' @export
supermassa_genotype <- function(vcfR.object=NULL,
onemap.object= NULL,
vcf.par = c("AD", "DPR"),
parent1="P1",
parent2="P2",
f1=NULL,
recovering = FALSE,
mean_phred = 20,
cores = 2,
depths = NULL,
global_error = NULL,
use_genotypes_errors = TRUE,
use_genotypes_probs = FALSE,
rm_multiallelic = TRUE){
if(is.null(depths)){
extracted_depth <- extract_depth(vcfR.object=vcfR.object,
onemap.object=onemap.object,
vcf.par=vcf.par,
parent1=parent1,
parent2=parent2,
f1=f1,
recovering=recovering)
} else {
p1 <- which(colnames(depths[[1]]) == parent1)
p2 <- which(colnames(depths[[1]]) == parent2)
if(is.null(f1)){
palt <- depths[[2]][,c(p1,p2)]
pref <- depths[[1]][,c(p1,p2)]
oalt <- depths[[2]][,-c(p1,p2)]
oref <- depths[[1]][,-c(p1,p2)]
oalt <- oalt[,match(rownames(onemap.object$geno),colnames(oalt))]
oref <- oref[,match(rownames(onemap.object$geno),colnames(oref))]
psize <- palt + pref
osize <- oalt + oref
rownames(palt) <- rownames(pref) <- rownames(oalt)
} else {
f1i <- which(colnames(depths[[1]]) == f1)
palt <- as.numeric(depths[[2]][,f1i])
pref <- as.numeric(depths[[1]][,f1i])
oalt <- depths[[2]][,-c(p1,p2,f1i)]
oref <- depths[[1]][,-c(p1,p2,f1i)]
oalt <- oalt[,match(rownames(onemap.object$geno),colnames(oalt))]
oref <- oref[,match(rownames(onemap.object$geno),colnames(oref))]
psize <- palt + pref
osize <- oalt + oref
names(palt) <- names(pref) <- rownames(oalt)
}
if(recovering==FALSE){
palt <- palt[which(rownames(palt) %in% colnames(onemap.object$geno)),]
pref <- pref[which(rownames(pref) %in% colnames(onemap.object$geno)),]
psize <- psize[which(rownames(psize) %in% colnames(onemap.object$geno)),]
oalt <- oalt[which(rownames(oalt) %in% colnames(onemap.object$geno)),]
oref <- oref[which(rownames(oref) %in% colnames(onemap.object$geno)),]
osize <- osize[which(rownames(osize) %in% colnames(onemap.object$geno)),]
}
extracted_depth <- list("palt"=palt, "pref"=pref, "psize"=psize,
"oalt"=as.matrix(oalt), "oref"=as.matrix(oref), "osize"=as.matrix(osize),
n.mks = dim(oalt)[1], n.ind = dim(oalt)[2],
inds = colnames(oalt), mks = rownames(oalt),
CHROM = sapply(strsplit(rownames(oalt), split="_"), "[",1),
POS = sapply(strsplit(rownames(oalt), split="_"), "[",2),
onemap.object = onemap.object)
}
# removing the multiallelics from the vcf
multi <- grepl(",",vcfR.object@fix[,5])
multi.mks <- vcfR.object@fix[,3][multi]
if(any(multi)){
idx <- which(rownames(extracted_depth$palt) %in% multi.mks)
extracted_depth$palt <- extracted_depth$palt[-idx,]
extracted_depth$pref <- extracted_depth$pref[-idx,]
extracted_depth$psize <- extracted_depth$psize[-idx,]
extracted_depth$oalt <- extracted_depth$oalt[-idx,]
extracted_depth$oref <- extracted_depth$oref[-idx,]
extracted_depth$osize <- extracted_depth$osize[-idx,]
extracted_depth$n.mks = dim(extracted_depth$oalt)[1]
extracted_depth$mks = rownames(extracted_depth$oalt)
extracted_depth$CHROM = extracted_depth$CHROM[-idx]
extracted_depth$POS = extracted_depth$POS[-idx]
# removing the multiallelics from the onemap object
if(crosstype == "outcross"){
idx.multi <- which(!(onemap.object$segr.type %in% c("B3.7", "D1.10", "D2.15")))
mult.obj <- split_onemap(onemap.object, mks= idx.multi)
idx.bi <- which(onemap.object$segr.type %in% c("B3.7", "D1.10", "D2.15"))
onemap.object <- split_onemap(onemap.object, mks= idx.bi)
} else if(crosstype == "f2 intercross"){
idx.multi <- which(!(onemap.object$segr.type %in% c("A.H.B")))
mult.obj <- split_onemap(onemap.object, mks= idx.multi)
idx.bi <- which(onemap.object$segr.type %in% c("A.H.B"))
onemap.object <- split_onemap(onemap.object, mks= idx.bi)
}
extracted_depth$onemap.object <- onemap.object
}
prepared_depth <- depth_prepare(extracted_depth)
class <- class(extracted_depth$onemap.object)[2]
cl <- parallel::makeCluster(cores, type="FORK")
registerDoParallel(cl)
result <- parLapply(cl, prepared_depth, function(x) supermassa_parallel(supermassa_4parallel = x, class=class))
stopCluster(cl)
mks <- unlist(lapply(result, "[", 1))
rm.mk <- unlist(lapply(result, "[", 2))
mks.type <- unlist(lapply(result, "[", 3))
geno <- matrix(unlist(lapply(result, "[", 4)), ncol = extracted_depth$n.mks, byrow = FALSE)
error <- lapply(result, "[[", 5)
error <- do.call(rbind, error[!as.logical(rm.mk)])
geno[which(is.na(geno))] <- 0
colnames(geno) <- extracted_depth$mks
rownames(geno) <- extracted_depth$inds
n.mar <- length(rm.mk) - sum(rm.mk)
onemap_supermassa <- extracted_depth$onemap.object
error$ind <- factor(error$ind, levels = extracted_depth$inds)
error <- error[order(error$ind),]
onemap_supermassa$error <- as.matrix(error[,3:5])
colnames(onemap_supermassa$error) <- NULL
if (any(rm.mk == 1)){
onemap_supermassa$geno <- geno[,-which(rm.mk==1)]
onemap_supermassa$segr.type <- unname(mks.type[-which(rm.mk==1)])
onemap_supermassa$CHROM <- extracted_depth$CHROM[-which(rm.mk==1)]
onemap_supermassa$POS <- as.numeric(extracted_depth$POS[-which(rm.mk==1)])
} else{
onemap_supermassa$geno <- geno
onemap_supermassa$segr.type <- unname(mks.type)
onemap_supermassa$CHROM <- extracted_depth$CHROM
onemap_supermassa$POS <- as.numeric(extracted_depth$POS)
}
onemap_supermassa$n.mar <- n.mar
mks.type.num <- onemap_supermassa$segr.type
mks.type.num[which(onemap_supermassa$segr.type == "B3.7")] <- 4
mks.type.num[which(onemap_supermassa$segr.type == "D2.15")] <- 7
mks.type.num[which(onemap_supermassa$segr.type == "D1.10")] <- 6
mks.type.num[which(onemap_supermassa$segr.type == "A.H.B")] <- 1
onemap_supermassa$segr.type.num <- mks.type.num
# Genotypes percent changed
idx <- which(colnames(onemap_supermassa$geno) %in% colnames(extracted_depth$onemap.object$geno))
idx2 <- which(colnames(extracted_depth$onemap.object$geno) %in% colnames(onemap_supermassa$geno))
if(length(idx) > 0){
diffe <- sum(!extracted_depth$onemap.object$geno[,idx2] == onemap_supermassa$geno[,idx])
} else {diffe <- 0}
perc <- (diffe/length(extracted_depth$onemap.object$geno))*100
cat(perc, "% of the genotypes were changed by this approach\n")
# Recovery markers
cat(sum(!colnames(onemap_supermassa$geno) %in% colnames(extracted_depth$onemap.object$geno)),"were recovered from vcf\n")
# Removed markers
cat(sum(!colnames(extracted_depth$onemap.object$geno) %in% colnames(onemap_supermassa$geno)),
"markers from old onemap object were considered non-informative and removed of analysis\n")
maxpostprob <- unlist(apply(onemap_supermassa$error, 1, function(x) x[which.max(x)]))
maxpostprob <- matrix(maxpostprob, nrow = onemap_supermassa$n.ind, ncol = onemap_supermassa$n.mar, byrow = F)
colnames(maxpostprob) <- colnames(onemap_supermassa$geno)
rownames(maxpostprob) <- rownames(onemap_supermassa$geno)
if(use_genotypes_probs){
onemap_supermassa.new <- create_probs(onemap.obj = onemap_supermassa,
genotypes_probs = onemap_supermassa$error,
global_error = global_error)
} else if(use_genotypes_errors){
onemap_supermassa.new <- create_probs(onemap.obj = onemap_supermassa,
genotypes_errors = 1- maxpostprob,
global_error = global_error)
} else if(!is.null(global_error)){
onemap_supermassa.new <- create_probs(onemap.obj = onemap_supermassa,
global_error = global_error)
}
if(!rm_multiallelic){
if(length(multi.mks) > 0)
onemap_supermassa.new <- combine_onemap(onemap_supermassa.new, mult.obj)
}
return(onemap_supermassa.new)
}
##' Creates list with first level refering to markers and second level with
##' marker name, offspring names and alternative and reference allele counts.
##' This function was created to parallelize the SuperMassa process. Its output
##' is the input of supermassa_error
##' @export
depth_prepare <- function(extracted_depth){
mylist <- rep(list(NA),extracted_depth$n.mks)
for(i in 1:extracted_depth$n.mks){
mk.name <- extracted_depth$mks[i]
inds <- extracted_depth$inds
odepth <- data.frame(extracted_depth$oref[i,], extracted_depth$oalt[i,])
if(class(extracted_depth$onemap.object)[2]=="f2"){
pdepth <- data.frame(extracted_depth$pref[i], extracted_depth$palt[i])
} else {
pdepth <- data.frame(extracted_depth$pref[i,], extracted_depth$palt[i,])
}
mylist[[i]] <- list(mk.name, inds,odepth,pdepth)
}
return(mylist)
}
##' Use output list from depth_prepare and run Supermassa for each marker (first level of the list)
##' Extract from the SuperMassa output, the offspring genotype, the marker type, and the error
##' probabilities
##' @export
supermassa_parallel <- function(supermassa_4parallel, class=NULL){
n.ind <- length(supermassa_4parallel[[2]])
all.ind.names <- supermassa_4parallel[[2]]
if(length(which(supermassa_4parallel[[4]][,1] == 0 & supermassa_4parallel[[4]][,2]== 0)) > 0) {
# Marker with parents missing data
rm.mk <- 1 #index to markers to be removed
geno_one <- prob_error <- rep(NA, n.ind)
mk.type <- NA
probs <- NA
}else if(all(supermassa_4parallel[[3]]==0)){
# Marker with missing data in all progeny
rm.mk <- 1 #index to markers to be removed
geno_one <- prob_error <- rep(NA, n.ind)
mk.type <- NA
probs <- NA
} else {
rm.mk <- 0
write.table(supermassa_4parallel[[3]], file = paste0("odepth_temp", supermassa_4parallel[[1]],".txt"), quote = FALSE, col.names = FALSE)
if(class=="f2"){
write.table(rbind(supermassa_4parallel[[4]],supermassa_4parallel[[4]]), file = paste0("pdepth_temp", supermassa_4parallel[[1]],".txt"), quote = FALSE, col.names = FALSE)
} else {
write.table(supermassa_4parallel[[4]], file = paste0("pdepth_temp", supermassa_4parallel[[1]],".txt"), quote = FALSE, col.names = FALSE)
}
odepth_temp <- paste0("odepth_temp", supermassa_4parallel[[1]],".txt")
pdepth_temp <- paste0("pdepth_temp", supermassa_4parallel[[1]],".txt")
out_file <- paste0("out_prob",supermassa_4parallel[[1]],".txt")
command_mass <- paste("python", paste0(system.file(package = "genotyping4onemap"),"/python_scripts/","SuperMASSA.py"),
"--inference f1 --file", paste0(getwd(),"/",odepth_temp), "--ploidy_range 2 ",
"--f1_parent_data", paste0(getwd(),"/",pdepth_temp),
" --print_genotypes --naive_posterior_reporting_threshold 0",
"--save_geno_prob_dist", paste0(getwd(),"/",out_file))
SuperMASSA.output<-system(command_mass, intern=TRUE)
file.remove(odepth_temp,pdepth_temp)
############ Start code from Molli
## Assessing estimated ploidy level
est.ploidy<-as.numeric(strsplit(SuperMASSA.output[4], split = " |,")[[1]][6])
#cat(" (ploidy ", est.ploidy, ") ", sep = "")
## Assessing estimated parental dosage
dpdq <- as.numeric(strsplit(x = SuperMASSA.output[4], split = "\\(|,")[[1]][c(5,8)])
## Reading SuperMASSA output
d<-scan(file=out_file, what="character", nlines=1, quiet = TRUE)
P<-matrix(as.numeric(gsub("[^0-9]", "", unlist(d))), ncol=2, byrow=TRUE)
A<-read.table(file=out_file, skip=1)
file.remove(out_file)
M<-matrix(NA, nrow = length(all.ind.names), ncol = est.ploidy+1,
dimnames = list(all.ind.names, c(0:est.ploidy)))
M.temp<-apply(A[,2:(est.ploidy+2)], 2, parse.geno)
if(class(M.temp) == "numeric") M.temp <- t(as.matrix(M.temp))
M.temp<-M.temp[,order(P[,2], decreasing=TRUE)]
if(class(M.temp) == "numeric") M.temp <- t(as.matrix(M.temp))
dimnames(M.temp)<-list(as.character(A[,1]), c(0:est.ploidy))
M[rownames(M.temp),]<-M.temp
mrk1<-apply(M, 1, function(x) exp(x-matrixStats::logSumExp(x)))
## Filling NAs with mendelian expectations
## z<-which(apply(mrk1, 2, function(x) any(is.na(x))))
## if(length(z) > 0)
## {
## if((dpdq[1] == 1 & dpdq[2] == 2) |(dpdq[1] == 2 & dpdq[2] == 1)){
## a <- c(aa=0,Aa=0.5,AA=0.5)
## } else if((dpdq[1] == 1 & dpdq[2] == 0) | (dpdq[1] == 0 & dpdq[2] == 1)){
## a <- c(aa=0.5,Aa=0.5,AA=0)
## } else if((dpdq[1] == 1 & dpdq[2] == 1)){
## a <- c(aa=0.25, Aa=0.5, AA=0.25)
## } else if((dpdq[1] == 2 & dpdq[2] == 0) | (dpdq[1] == 0 & dpdq[2] == 2)){
## a <- c(aa=0, Aa=1, AA=0)
## } else if((dpdq[1] == 2 & dpdq[2] == 2)){
## a <- c(aa=0, Aa=0, AA=1)
## } else if((dpdq[1] == 0 & dpdq[2] == 0)){
## a <- c(aa=1, Aa=0, AA=0)
## }
## for(k in names(z))
## mrk1[,z]<-a
## rownames(mrk1)<-names(a)<-0:est.ploidy
## mrk1[names(which(a==0)),][]<-0
## for(k1 in 1:ncol(mrk1))
## {
## if(any(mrk1[names(which(a!=0)),k1] > 0.5))
## mrk1[,k1] <- mrk1[,k1]/sum(mrk1[,k1])
## else mrk1[,k1] <- a
## }
## }
probs <- t(mrk1)
probs <- data.frame(mrk = supermassa_4parallel[[1]],
ind = rownames(probs),
probs)
rownames(probs)<-NULL
colnames(probs)<-c("mrk", "ind", 0:est.ploidy)
pgeno_temp <- strsplit(SuperMASSA.output[grep("ploidy", SuperMASSA.output)], split = ":")
pgeno_temp <- unlist(strsplit(pgeno_temp[[1]][length(pgeno_temp[[1]])], split = ","))
# Code for OneMap
pgeno <- rep(NA, 4)
pgeno[grep("0", pgeno_temp)] <- 0
pgeno[grep("1", pgeno_temp)] <- 1
pgeno[grep("2", pgeno_temp)] <- 2
if(class=="outcross"){
# only biallelic markers and diploid
if((pgeno[1] == 0 | pgeno[1] == 2) & (pgeno[3] == 2 | pgeno[3] == 0)){
# non-informative markers
rm.mk <- 1
geno_one <- prob_error <- rep(NA, n.ind)
mk.type <- NA
} else {
rm.mk <- 0
ind_geno <- unlist(lapply(strsplit(SuperMASSA.output[grep("\t", SuperMASSA.output)], split = "\t"), "[", 1))
ind_geno <- gsub('.{1}$', '', ind_geno)
geno <- unlist(lapply(strsplit(SuperMASSA.output[grep("\t", SuperMASSA.output)], split = "\t"), "[", 2))
ind_geno_tot <- cbind(supermassa_4parallel[[2]], rep(NA, n.ind))
ind_geno_tot[,2][match(ind_geno,ind_geno_tot[,1])] <- geno
geno_one <- rep(NA, n.ind)
# Code for OneMap
idx <- which(ind_geno_tot[,2] == "(1, 1) <br>")
geno_one[idx] <- 2
mk.type <- vector()
if(pgeno[1] == 0){
mk.type <- c(mk.type,"D2.15")
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 0
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 1
} else if(pgeno[2] == 0){
mk.type <- c(mk.type,"D2.15")
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 1
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 0
} else if(pgeno[3] == 0){
mk.type <- c(mk.type,"D1.10")
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 0
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 1
} else if(pgeno[3] == 2){
mk.type <- c(mk.type,"D1.10")
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 1
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 0
} else{
mk.type <- c(mk.type,"B3.7")
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 1
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 3
}
}
} else {
# only biallelic markers and diploid
if(pgeno[1] != 1 | pgeno[3] != 1){
# non-informative markers
rm.mk <- 1
geno_one <- prob_error <- rep(NA, n.ind)
mk.type <- NA
} else {
rm.mk <- 0
ind_geno <- unlist(lapply(strsplit(SuperMASSA.output[grep("\t", SuperMASSA.output)], split = "\t"), "[", 1))
ind_geno <- gsub('.{1}$', '', ind_geno)
geno <- unlist(lapply(strsplit(SuperMASSA.output[grep("\t", SuperMASSA.output)], split = "\t"), "[", 2))
ind_geno_tot <- cbind(supermassa_4parallel[[2]], rep(NA, n.ind))
ind_geno_tot[,2][match(ind_geno,ind_geno_tot[,1])] <- geno
geno_one <- rep(NA, n.ind)
mk.type <- "A.H.B"
# Code for OneMap
idx <- which(ind_geno_tot[,2] == "(1, 1) <br>")
geno_one[idx] <- 2
idx <- which(ind_geno_tot[,2] == "(2, 0) <br>")
geno_one[idx] <- 1
idx <- which(ind_geno_tot[,2] == "(0, 2) <br>")
geno_one[idx] <- 3
}
}
}
return( list(mk= supermassa_4parallel[[1]],rm.mk, mk.type, geno_one, probs))
}
## function to parse the genotype probabilities
parse.geno<-function(x)
{
y<-strsplit(as.character(x), split="\\[|\\,|\\]")
sapply(y, function(x) as.numeric(x[which.max(nchar(x))]))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.