#' Post imputation quality control
#'
#' @description
#' Perform quality control and data management after imputation.
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param inputPrefix the prefix of the final imputed PLINK files.
#' @param out1 the prefix of well imputed PLINK files with the index.
#' @param out2 the prefix of well imputed PLINK files after removing any
#' SNPs with the same positions (if any), the index is also appended.
#' @param out3 the prefix of well imputed PLINK files with the index
#' after adding previously identified monomorphic SNPs if any.
#' @param out4 the prefix of final well imputed PLINK files with the index.
#' @param infoScore the cutoff of filtering imputation quality score for
#' each variant. The default value is 0.6.
#' @param outputMonoSNPfile the output pure text file that stores
#' the removed monomorphic SNPs, one per line, if any.
#' @param outputInfoFile the output file of impute2 info scores consisting of
#' two columns: all imputed SNPs and their info scores.
#' @param prefixAlign2ref the prefix of the output PLINK binary
#' files after removing SNPs whose alleles are not in the imputation reference,
#' taking their genomic positions into account.
#' @param missCutoff the cutoff of the least number of instances for
#' a SNP that is not missing. The default is 20.
#' @param outRemovedSNPfile the output file of SNPs with pre-defined
#' missing values that are removed.
#' @param outRetainSNPfile the output file of SNPs that are retained.
#' @param referencePanel a string indicating the type of imputation
#' reference panels is used: c("1000Gphase1v3_macGT1", "1000Gphase3").
#' @return All imputed genotype data in PLINK format, well imputed data and
#' its variants, as well as a set of pure text files containing removed or
#' retained SNPs.
#' @export
#' @author Junfang Chen
postImpQC <- function(plink, inputPrefix, out1, out2, out3, out4,
outputInfoFile, infoScore=0.6, outputMonoSNPfile,
prefixAlign2ref, missCutoff=20,
outRemovedSNPfile, outRetainSNPfile, referencePanel){
## step 3
## Filtered imputed data set; Remove imputed SNPs with (info < 0.6),
## only retain "Good" SNPs.
.filterImputeData2(plink, outputInfoFile, infoScore,
inputPrefix, outputPrefix=out1)
## step 4
## Remove previously identified monomorphic SNPs in the imputed dataset.
## Note that snps with same genomic position but can have different snp name.
## if no monomorphic SNPs:
if (file.size(paste0(outputMonoSNPfile)) == 0 ){
renamePlinkBFile(inputPrefix=out1, outputPrefix=out2, action="copy")
} else {
## extract PLINK files contain only monomorphic SNPs from
## the original aligned (lifted and QC-ed) data set.
system(paste0(plink, " --bfile ", prefixAlign2ref,
" --extract ", outputMonoSNPfile, " --make-bed --out ",
prefixAlign2ref, "Tmp"))
bim1 <- read.table(paste0(prefixAlign2ref, "Tmp.bim"),
stringsAsFactors=FALSE)
system(paste0("awk '{print $1, $2, $4}' ",
out1, ".bim > tmpFilterImp.txt"))
bim2 <- read.table("tmpFilterImp.txt", stringsAsFactors=F)
colnames(bim1) <- c("chr", "rsID", "gd", "pos", "a0", "a1")
colnames(bim2) <- c("chr", "rsID", "pos")
outputFile <- "tmp.txt"
.snpSharedPos(inputFile1=bim1, inputFile2=bim2, outputFile, nCore=25)
system(paste0(plink, " --bfile ", out1,
" --exclude tmp.txt --make-bed --out ", out2))
system("rm tmpFilterImp.txt tmp.txt")
}
## step 5
## Add previously identified monomorphic SNPs in the imputed dataset.
## if no monomorphic SNPs:
if ( file.size(paste0(outputMonoSNPfile))==0 ){
renamePlinkBFile(inputPrefix=out1, outputPrefix=out3, action="copy")
} else {
## merge both datasets
system(paste0(plink, " --bfile ", out2, " --bmerge ",
prefixAlign2ref, ".bed ",
prefixAlign2ref, ".bim ",
prefixAlign2ref, ".fam ", "--make-bed --out ", out3))
}
## step 6
## Remove SNPs which have a non missing value for less then 20 instances.
removedSnpMissPostImp(plink, inputPrefix=out3, missCutoff,
outRemovedSNPfile, outRetainSNPfile, outputPrefix=out4)
## special case:
## 1.) if the input ref panel is from phase3.
## (2) Impute4 modified imputed output rsID if the original SNPs are not given.
## e.g. rs456706:11001104:C:T << rsID rs456706
if (referencePanel == "1000Gphase3"){
outRename <- "4_6_renameSNP"
renamePlinkBFile(inputPrefix=out4, outputPrefix=outRename, action="move")
snp2renameV0 <- read.table(outRetainSNPfile, stringsAsFactors=FALSE)
str(snp2renameV0)
whDiff <- grep(":", snp2renameV0[,1])
str(whDiff)
if (length(whDiff) > 0){
snp2renameV1 <- snp2renameV0[whDiff,]
tmp = strsplit(snp2renameV1, ":", fixed = TRUE)
snpNew <- unlist(lapply(tmp, function(s){s[[1]]}))
snpNewOld <- cbind(snpNew, snp2renameV0[whDiff,1])
str(snpNewOld)
## rename the plink files.
snpRenameFile <- "4_6_renameSNP.txt"
write.table(snpNewOld, file=snpRenameFile, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", outRename, " --update-name ",
snpRenameFile, " 1 2 --make-bed --out ", outRename, "tmp"))
renamePlinkBFile(inputPrefix=paste0(outRename, "tmp"),
outputPrefix=out4, action="move")
system(paste0("rm ", outRename, "*"))
}
}
}
#' Post imputation data extraction and expansion
#'
#' @description
#' Reduce well imputed dataset to have SNPs before imputation and then added
#' genotype data that are different from the imputation reference panel.
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param referencePanel a string indicating the type of imputation
#' reference panels is used: c("1000Gphase1v3_macGT1", "1000Gphase3").
#' @param inputPrefix the prefix of final well imputed PLINK files.
#' @param inputQCprefix the prefix of QCed PLINK files.
#' @param snpRefAlleleFile the output plain text file that stores the
#' removed SNPs whose alleles are in the imputation reference,
#' taking their genomic positions into account.
#' @param snpDiffAlleleFile the output plain text file that stores the removed
#' SNPs whose alleles are not in the imputation reference,
#' taking their genomic positions into account.
#' @param snpMissPosFile the output plain text file that stores the removed SNPs
#' whose genomic positions are not in the imputation reference,
#' ingoring SNP names.
#' @param snpSameNameDifPosFile the output plain text file that stores the
#' removed SNPs whose genomic positions are not in the imputation reference,
#' taking SNP names into account.
#' @param reducedToSpecificfn the prefix of PLINK files which are reduced in the
#' number of SNPs and contain only SNPs prior to imputation.
#' @param specificDiffAllelefn the prefix of PLINK files which added genotypes
#' with different alleles other than the imputation to the file with the
#' prefix: reducedToSpecificfn.
#' @param specificMissPosfn the prefix of PLINK files which added genotypes
#' with missing positions other than the imputation to the file with the
#' prefix: specificDiffAllelefn.
#' @param specificDiffPosfn the prefix of PLINK files which added genotypes
#' with different positions other than the imputation to the file with the
#' prefix: specificMissPosfn .
#' @return Extracted genotypes from the imputed data which contain SNPs before
#' imputation. On the basis of extracted genotypes, genotypes different from
#' the imputation reference panel are appended, including SNPs with different
#' alleles, missing positions, and different positions.
#' @export
#' @author Junfang Chen
reductExpand <- function(plink, referencePanel, inputPrefix, inputQCprefix,
snpRefAlleleFile, snpDiffAlleleFile,
snpMissPosFile, snpSameNameDifPosFile,
reducedToSpecificfn, specificDiffAllelefn,
specificMissPosfn, specificDiffPosfn){
## 1. Reduce the imputed dataset to the SNPs before imputation.
if (referencePanel == "1000Gphase3"){
## Before doing this, make sure > no variant with 3+ alleles present.
## Remove if any.
bimOriQCed <- read.table(paste0(inputQCprefix, ".bim"),
stringsAsFactors=FALSE)
bimPostImp <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)
## check duplicated.
dupPostImp <- bimPostImp[duplicated(bimPostImp[,2]),]
# str(dupPostImp)
allele3 <- intersect(dupPostImp[,2], bimOriQCed[,2])
# str(allele3)
## remove these duplicated and with 3+alleles
write.table(allele3, file="snpAllele3.txt", quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
outAllele2 <- paste0(inputPrefix, "Allele2")
system(paste0(plink, " --bfile ", inputPrefix,
" --exclude snpAllele3.txt --make-bed --out ", outAllele2))
} else { outAllele2 <- inputPrefix }
system(paste0(plink, " --bfile ", outAllele2,
" --extract ", snpRefAlleleFile, " --make-bed --out ",
reducedToSpecificfn))
## 2. Add the SNPs with different alleles with their values
## from the dataset before removing SNPs.
if (file.size(snpDiffAlleleFile) == 0){
renamePlinkBFile(inputPrefix=reducedToSpecificfn,
outputPrefix=specificDiffAllelefn, action="copy")
} else {
system(paste0(plink, " --bfile ", inputQCprefix,
" --extract ", snpDiffAlleleFile, " --make-bed --out tmp"))
system(paste0(plink, " --bfile ", reducedToSpecificfn,
" --bmerge tmp.bed tmp.bim tmp.fam --make-bed --out ",
specificDiffAllelefn))
system("rm tmp.*")
}
## 3. Add the SNPs with missing positions with their values
## from the dataset before removing SNPs.
if (file.size(snpMissPosFile) == 0){
renamePlinkBFile(inputPrefix=specificDiffAllelefn,
outputPrefix=specificMissPosfn, action="copy")
} else {
system(paste0(plink, " --bfile ", inputQCprefix,
" --extract ", snpMissPosFile, " --make-bed --out tmp"))
system(paste0(plink, " --bfile ", specificDiffAllelefn,
" --bmerge tmp.bed tmp.bim tmp.fam --make-bed --out ",
specificMissPosfn))
system("rm tmp.*")
}
## 4. Add the SNPs with different positions by their values
## from the dataset before removing SNPs.
if (file.size(snpSameNameDifPosFile) == 0){
renamePlinkBFile(inputPrefix=specificMissPosfn,
outputPrefix=specificDiffPosfn, action="copy")
} else {
system(paste0(plink, " --bfile ", inputQCprefix,
" --extract ", snpSameNameDifPosFile, " --make-bed --out tmp"))
system(paste0(plink, " --bfile ", specificMissPosfn,
" --bmerge tmp.bed tmp.bim tmp.fam --make-bed --out ",
specificDiffPosfn))
system("rm tmp.*")
}
system("rm *.log")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.