#' Impute missing data for bi-allelic markers
#'
#' Impute missing data for bi-allelic markers
#'
#' Assumes input file is sorted by position. Markers with no genetic variance are removed.
#'
#' \code{method="pop"} imputes with the population mean for \code{geno="DS"} and population mode for \code{geno="GT"}.
#'
#' \code{method="EM"} uses parameter "tol" (default is 0.02, see rrBLUP A.mat documentation). Imputed values are truncated if needed to fall in the interval [0,ploidy].
#'
#' \code{method="RF"} uses parameters "ntree" (default 100) for number of trees and "nflank" (default 100) for the number of flanking markers (on each side) to use as predictors. Because RF first uses EM to generate a complete dataset, parameter "tol" is also recognized.
#'
#'
#' @param in.file VCF input file
#' @param out.file VCF output file
#' @param ploidy ploidy
#' @param method One of the following: "pop","EM","RF"
#' @param geno One of the following: "GT","DS"
#' @param min.DP genotypes below this depth are set to missing (default=1)
#' @param max.missing remove markers above this threshold, as proportion of population
#' @param params list of method-specific parameters
#' @param n.core multicore processing
#'
#' @export
#' @import vcfR
#' @importFrom rrBLUP A.mat
#' @importFrom parallel makeCluster stopCluster parApply clusterExport
#' @importFrom randomForest randomForest
#' @importFrom stats sd
#' @importFrom utils read.csv write.csv
impute <- function(in.file,out.file,ploidy,method,geno,min.DP=1,
max.missing,params=NULL,n.core=1) {
stopifnot(geno %in% c("GT","DS"))
stopifnot(method %in% c("pop","EM","RF"))
stopifnot(min.DP >= 1)
if (!is.null(params) && !is.list(params))
stop("Use a list for argument params")
if (method=="EM")
params1 <- list(tol=0.02)
if (method=="RF")
params1 <- list(ntree=100,nflank=100,tol=0.02)
if (!is.null(params)) {
if (method=="EM")
stopifnot(names(params)=="tol")
if (method=="RF")
stopifnot(names(params) %in% c("ntree","nflank","tol"))
params1[names(params)] <- params
}
impute.mode <- function(x) {
miss <- which(is.na(x))
if (length(miss)>0) {
x[miss] <- as.integer(names(which.max(table(x))))
}
return(x)
}
impute.mean <- function(x) {
miss <- which(is.na(x))
if (length(miss)>0) {
x[miss] <- mean(x[-miss])
}
return(x)
}
convert <- function(x,field) {
if (field=="DS")
return(as.numeric(x))
if (field=="GT") {
z <- as.integer(strsplit(x,split="/",fixed=T)[[1]])
return(sum(z))
}
}
impute.RF <- function(y,x,ntree) {
miss <- which(is.na(y))
n.miss <- length(miss)
if (n.miss > 0) {
ans <- suppressWarnings(randomForest(x=x[-miss,],
y=y[-miss],
xtest=x[miss,,drop=FALSE],
ntree=ntree))
y[miss] <- ans$test$predicted
}
if (is.factor(y)) {
return(as.integer(as.character(y)))
} else {
return(y)
}
}
data <- read.vcfR(file=in.file,verbose=FALSE)
ixa <- grep(",",data@fix[,"ALT"])
fields <- vcf_field_names(data, tag = "FORMAT")$ID
geno.key <- geno
if (!(geno.key %in% fields))
stop(sub("Q",geno.key,"Q not present in VCF file"))
geno <- extract.gt(data,element=geno.key)
all.marks <- apply(data@fix[,c("CHROM","POS")],1,paste,collapse=":")
rownames(geno) <- all.marks
if (length(ixa) > 0) {
cat("Multi-allelic variants detected and will be removed.\n")
geno <- geno[-ixa,]
}
if (min.DP > 1) {
dnames <- dimnames(geno)
geno <- split(geno,f=1:nrow(geno))
if (!("DP" %in% fields))
stop("Sample DP not present in VCF file")
DP <- extract.gt(data,element="DP")
if (length(ixa) > 0)
DP <- DP[-ixa,]
DP <- lapply(split(DP,f=1:length(geno)),as.integer)
geno <- t(mapply(FUN=function(x,dp,min.DP){x[dp < min.DP] <- NA; x},geno,DP,min.DP=min.DP))
dimnames(geno) <- dnames
}
geno <- apply(geno,MARGIN=c(1,2),FUN=convert,field=geno.key)
n <- ncol(geno)
ix <- which(apply(geno,1,sd,na.rm=T) > 0)
nd <- nrow(geno) - length(ix)
if (nd > 0)
cat(sub("X",nd,"Removed X markers without genetic variance\n"))
iu <- which(apply(geno[ix,],1,function(x){sum(is.na(x))})/n <= max.missing)
nd2 <- length(ix) - length(iu)
if (nd2 > 0)
cat(sub("X",nd2,"Removed X markers due to missing data\n"))
geno <- geno[ix[iu],,drop=FALSE]
m <- nrow(geno)
marks <- rownames(geno)
ik <- match(marks,all.marks)
#begin imputation
#geno.imp is transposed vs geno
if (method=="pop") {
if (geno.key=="GT") {
geno.imp <- apply(geno,1,impute.mode)
} else {
geno.imp <- apply(geno,1,impute.mean)
}
}
if (method %in% c("EM","RF")) {
ans <- A.mat(t(geno)/(ploidy/2) - 1,
impute.method = "EM",n.core=n.core,
return.imputed=TRUE,min.MAF=0,tol=params1$tol)
geno.imp <- apply(ans$imputed,2,function(x){ifelse(abs(x)>1,1*sign(x),x)})
if (geno.key=="GT") {
digits <- 0
} else {
digits <- 2
}
geno.imp <- round((geno.imp+1)*ploidy/2,digits)
}
if (method=="RF") {
geno.imp.old <- geno.imp
if (n.core > 1) {
cl <- makeCluster(n.core)
clusterExport(cl=cl,varlist=NULL)
geno.imp <- parApply(cl,array(1:m),MARGIN=1,
function(k){
y <- geno[k,]
if (geno.key=="GT")
y <- factor(y)
impute.RF(y=y,
x=geno.imp.old[,setdiff(c(max(1,k-params1$nflank):min(m,k+params1$nflank)),k),drop=FALSE],
ntree=params1$ntree)})
stopCluster(cl)
} else {
geno.imp <- apply(array(1:m),MARGIN=1,
function(k){
y <- geno[k,]
if (geno.key=="GT")
y <- factor(y)
impute.RF(y=y,
x=geno.imp.old[,setdiff(c(max(1,k-params1$nflank):min(m,k+params1$nflank)),k),drop=FALSE],
ntree=params1$ntree)})
}
}
if (geno.key=="GT") {
geno.imp <- apply(t(geno.imp),c(1,2),make_GT,ploidy=ploidy)
} else {
geno.imp <- round(t(geno.imp),1)
}
dimnames(geno.imp) <- dimnames(geno)
if (geno.key=="GT")
fields <- setdiff(fields,"GQ")
nf <- length(fields)
geno <- vector("list",nf)
names(geno) <- fields
for (i in 1:nf) {
if (fields[i]==geno.key) {
geno[[i]] <- geno.imp
} else {
geno[[i]] <- extract.gt(data,element=fields[i])[ik,]
}
}
fixed <- data@fix[ik,]
fixed[,"INFO"] <- "."
fixed[is.na(fixed)] <- "."
write_vcf(filename=out.file, fixed=fixed, geno=geno)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.