#######################################################################
#
# Package Name: Gimpute
# Description:
# Gimpute -- An efficient genetic data imputation pipeline
#
# Gimpute R package, An efficient genetic data imputation pipeline
# Copyright (C) 2018 Junfang Chen (junfang.chen@zi-mannheim.de)
# All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#' Exclude monomorphic SNPs
#'
#' @description
#' Detect monomorphic SNPs from PLINK BIM file and exclude them if any.
#' @param plink an executable program in either the current
#' working directory or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files
#' after removing monomorphic SNPs.
#' @param outputSNPfile the output pure text file that stores
#' the removed monomorphic SNPs, one per line, if any.
#' @return The output PLINK binary files after removing monomorphic SNPs
#' and a pure text file with removed monomorphic SNPs.
#' @export
#' @author Junfang Chen
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "alignedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "alignedData.bim", package="Gimpute")
#' famFile <- system.file("extdata", "alignedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, " ."))
#' system(paste0("scp ", bimFile, " ."))
#' system(paste0("scp ", famFile, " ."))
#' inputPrefix <- "alignedData"
#' outputPrefix <- "removedMonoSnp"
#' outputSNPfile <- "monoSNP.txt"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedMonoSnp(plink, inputPrefix, outputPrefix, outputSNPfile)
removedMonoSnp <- function(plink, inputPrefix, outputPrefix, outputSNPfile){
## input BIM file
bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)
monoSNPs <- bim[which(bim[,5] == 0),2]
write.table(monoSNPs, file=outputSNPfile, quote=FALSE, row.names=FALSE,
col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", inputPrefix, " --exclude ", outputSNPfile,
" --make-bed --out ", outputPrefix))
}
#' Split genome-wide genotyping data into chromosome-wide PLINK binary files.
#'
#' @description
#' Split the whole genome genotyping data chromosome-wise;
#' allow parallel computating for all chromosomes.
#' @param plink an executable program in either the current
#' working directory or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files
#' before splitting.
#' @param chrXPAR1suffix if chromosome 25 is available and with PAR1,
#' then generate the suffix with X_PAR1 for chrX_PAR1.
#' @param chrXPAR2suffix if chromosome 25 is available and with PAR2,
#' then generate the suffix with X_PAR2 for chrX_PAR2.
#' @param nCore the number of cores used for parallel computation.
#' The default value is 25.
#' @return The output PLINK binary files for each chromosome with the same
#' prefix as the inputPrefix but appended with the chromosome codes, and
#' possibly also the logical value for the pseudo-autosomal region (PAR)
#' indicating if PAR exists in the input genotyping data or not.
#' @details If chromosome 25 is also available, namely the pseudo-autosomal
#' region of chromosome X, then further split chr25 (PAR or Chr_XY)
#' into PAR1 and PAR2 according to the genomic coordination GRCh37
#' from https://en.wikipedia.org/wiki/Pseudoautosomal_region.
#' The locations of the PARs within GRCh37 are:
#' PAR1 X 60001 2699520;
#' PAR2 X 154931044 155260560.
#' @export
#' @import doParallel
#' @author Junfang Chen
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "alignedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "alignedData.bim", package="Gimpute")
#' famFile <- system.file("extdata", "alignedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, " ."))
#' system(paste0("scp ", bimFile, " ."))
#' system(paste0("scp ", famFile, " ."))
#' inputPrefix <- "alignedData"
#' chrXPAR1suffix <- "X_PAR1"
#' chrXPAR2suffix <- "X_PAR2"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## chrWiseSplit(plink, inputPrefix, chrXPAR1suffix, chrXPAR2suffix, nCore)
chrWiseSplit <- function(plink, inputPrefix, chrXPAR1suffix,
chrXPAR2suffix, nCore=25){
## check which chromosomes are available to be splitted from the .bim file
bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)
chrs <- as.integer(names(table(bim[,1])))
chrslist <- as.list(chrs)
mclapply(chrslist, function(i){
cmd <- paste0(plink, " --bfile ", inputPrefix, " --chr ", i,
" --make-bed --out ", inputPrefix, i)
system(cmd)
}, mc.cores=nCore)
## if chromosome 25 is also available then re-arrange it
if (is.element(25, chrs)){
print("PAR is available in chrX!")
bim25 <- read.table(paste0(inputPrefix, "25.bim"),
stringsAsFactors=FALSE)
pos4PAR1 <- c(60001, 2699520)
## first check for PAR1 and afterwards for PAR2
if ( length(which(bim25[,4] <= pos4PAR1[2]))!= 0 ){
print("PAR1 is available in chrX!")
bimPos4par1 <- which(bim25[,4]<= pos4PAR1[2])
rs4PAR1 <- bim25[bimPos4par1,2]
write.table(rs4PAR1, file="rs4PAR1.txt", quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", inputPrefix,
"25 --extract rs4PAR1.txt --make-bed --out ",
inputPrefix, chrXPAR1suffix))
par1 <- TRUE
## check for PAR2, if any SNPs out of PAR1, PAR2 also available
if (length(bimPos4par1)<nrow(bim25)){
print("PAR2 is available in chrX!")
## just to exclude rs4PAR1.txt
system(paste0(plink, " --bfile ", inputPrefix,
"25 --exclude rs4PAR1.txt --make-bed --out ",
inputPrefix, chrXPAR2suffix))
par2 <- TRUE
} else {
par2 <- FALSE
print("PAR2 is NOT available in chrX!")
}
} else {
print("PAR2 is available in chrX! But NOT PAR1, all chr25 on PAR2")
par1 <- FALSE
par2 <- TRUE
renamePlinkBFile(inputPrefix=paste0(inputPrefix, "25"),
outputPrefix=paste0(inputPrefix, chrXPAR2suffix),
action="copy")
}
} else {
print("PAR is NOT available in chrX!")
par1 <- FALSE
par2 <- FALSE
}
return(par=list(par1, par2))
}
#' Chunk each chromosome into multiple segments
#'
#' @description
#' Chunk each chromosome genotyping data into multiple segments
#' by a predefined window size.
#' @param inputPrefix the prefix of the input PLINK .bim file for
#' each chromosome, without the chromosome codes.
#' @param outputPrefix the prefix of the output pure text files that keep
#' all the chunks for each chromosome separately.
#' @param chrs specifiy the chromosome codes for chunking.
#' @param windowSize the window size of each chunk.
#' The default value is 3000000.
#' @return The output pure text files include all the chunks
#' for each chromosome separately.
#' @export
#' @author Junfang Chen
#' @examples
#' ## In the current working directory
#' bimFile <- system.file("extdata", "gwas_data_chr23.bim", package="Gimpute")
#' system(paste0("scp ", bimFile, " ."))
#' inputPrefix <- "gwas_data_chr"
#' outputPrefix <- "chunks_chr"
#' chrs <- 23
#' print(chrs)
#' chunk4eachChr(inputPrefix, outputPrefix, chrs, windowSize=3000000)
chunk4eachChr <- function(inputPrefix, outputPrefix, chrs, windowSize=3000000){
for (i in chrs){
bim <- paste0(inputPrefix, i, ".bim")
bimdata <- read.table(file=bim, sep="\t", stringsAsFactors=FALSE)
position <- bimdata[,4]
posStart <- head(position,1)
posEnd <- tail(position,1)
chunkStart <- seq(posStart, posEnd, windowSize)
chunkEnd <- chunkStart + windowSize -1
chunkMatrix <- cbind(chunkStart, chunkEnd)
## positions are only within a chunk
if (nrow(chunkMatrix) == 1){
chunks <- chunkMatrix
} else {
## it may happen that only a few SNPs from the last chunk;
## if the last chunk is large, then specify -allow_large_regions
chunks <- head(chunkMatrix, -1) ## merge last-second to last
chunks[nrow(chunks), 2] <- posEnd
}
## check if any chunk with NO snps within it
SNPcountsPerChunk <- c()
for (j in seq_len(nrow(chunks))){
chunkbottom <- chunks[j,1]
chunkup <- chunks[j,2]
## ## which fall within chunk
tmp <- which(position >= chunkbottom & position <= chunkup)
tmp <- length(tmp)
SNPcountsPerChunk <- c(SNPcountsPerChunk, tmp)
}
wh0 <- which(SNPcountsPerChunk == 0)
print(paste0("chr",i))
print(wh0)
chunkLength <- nrow(chunks) - length(wh0)
## remove such chunks if wh0
if (length(wh0) != 0){ chunks <- chunks[-wh0,] }
print(nrow(chunks) == chunkLength)
chunkfilename <- paste0(outputPrefix, i, ".txt")
write.table(chunks, file=chunkfilename, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
}
}
#' Prephasing genotypes using SHAPEIT
#'
#' @description
#' Perform prephasing for study genotypes by SHAPEIT for the autosomal and
#' sex chromosome haplotypes using a reference panel (pre-set).
#' @param shapeit an executable program in either the
#' current working directory or somewhere in the command path.
#' @param chrs specifiy the chromosome codes for phasing.
#' @param dataDIR the directory where genotype PLINK binary files are located.
#' @param prefix4eachChr the prefix of PLINK binary files for
#' each chromosome.
#' @param referencePanel a string indicating the type of imputation
#' reference panels is used: c("1000Gphase1v3_macGT1", "1000Gphase3").
#' @param impRefDIR the directory where the imputation reference files
#' are located.
#' @param phaseDIR the directory where resulting pre-phased files
#' will be located.
#' @param nThread the number of threads used for computation.
#' The default value is 40.
#' @param effectiveSize this parameter controls the effective population size.
#' The default value is 20000.
#' @param nCore the number of cores used for computation. This can be tuned
#' along with nThread. The default value is 1.
#' @return The pre-phased haplotypes for given chromosomes.
#' @details If ChrX is available then it is done automatically by passing
#' the flag --chrX to SHAPEIT.
#' @export
#' @author Junfang Chen
#' @seealso \code{\link{phaseImpute2}}.
.prePhasingByShapeit <- function(shapeit, chrs, dataDIR, prefix4eachChr,
referencePanel, impRefDIR, phaseDIR,
nThread=40, effectiveSize=20000, nCore=1){
chrslist <- as.list(chrs)
mclapply(chrslist, function(i){
# GWAS data files in PLINK binary format
GWASDATA_BED <- paste0(dataDIR, prefix4eachChr, i, ".bed ")
GWASDATA_BIM <- paste0(dataDIR, prefix4eachChr, i, ".bim ")
GWASDATA_FAM <- paste0(dataDIR, prefix4eachChr, i, ".fam ")
## >>> ref panel
GENMAP_FILE <- paste0(impRefDIR, "genetic_map_chr", i, "_combined_b37.txt ")
GENMAP.chrXnonPAR <- paste0(impRefDIR, "genetic_map_chrX_nonPAR_combined_b37.txt ")
GENMAP.chrXPAR1 <- paste0(impRefDIR, "genetic_map_chrX_PAR1_combined_b37.txt ")
GENMAP.chrXPAR2 <- paste0(impRefDIR, "genetic_map_chrX_PAR2_combined_b37.txt ")
if (referencePanel == "1000Gphase1v3_macGT1"){
## autosome
impPrefix <- "ALL_1000G_phase1integrated_v3_chr"
HAPS_FILE <- paste0(impRefDIR, impPrefix, i,
"_impute_macGT1.hap.gz ")
LEGEND_FILE <- paste0(impRefDIR, impPrefix, i,
"_impute_macGT1.legend.gz ")
SAMPLE_FILE <- paste0(impRefDIR,
"ALL_1000G_phase1integrated_v3.sample ")
## chrX_nonPAR
HAPS.chrXnonPAR <- paste0(impRefDIR, impPrefix,
"X_nonPAR_impute_macGT1.hap.gz ")
LEGEND.chrXnonPAR <- paste0(impRefDIR, impPrefix,
"X_nonPAR_impute_macGT1.legend.gz ")
## .chrXPAR1
HAPS.chrXPAR1 <- paste0(impRefDIR, impPrefix,
"X_PAR1_impute_macGT1.hap.gz ")
LEGEND.chrXPAR1 <- paste0(impRefDIR, impPrefix,
"X_PAR1_impute_macGT1.legend.gz ")
## .chrXPAR2
HAPS.chrXPAR2 <- paste0(impRefDIR, impPrefix,
"X_PAR2_impute_macGT1.hap.gz ")
LEGEND.chrXPAR2 <- paste0(impRefDIR, impPrefix,
"X_PAR2_impute_macGT1.legend.gz ")
} else if (referencePanel == "1000Gphase3"){
HAPS_FILE <- paste0(impRefDIR, "1000GP_Phase3_chr", i,
".hap.gz ")
## autosome
LEGEND_FILE <- paste0(impRefDIR, "1000GP_Phase3_chr", i,
".legend.gz ")
SAMPLE_FILE <- paste0(impRefDIR, "1000GP_Phase3.sample ")
## chrX_nonPAR
HAPS.chrXnonPAR <- paste0(impRefDIR,
"1000GP_Phase3_chrX_NONPAR.hap.gz ")
LEGEND.chrXnonPAR <- paste0(impRefDIR,
"1000GP_Phase3_chr",
"X_NONPAR.legend.gz ")
## .chrXPAR1
HAPS.chrXPAR1 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR1.hap.gz ")
LEGEND.chrXPAR1 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR1.legend.gz ")
## .chrXPAR2
HAPS.chrXPAR2 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR2.hap.gz ")
LEGEND.chrXPAR2 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR2.legend.gz ")
} else { print("Wrong reference panel during phasing!!")}
## <<< ref panel
# main output file
OUTPUT_HAPS <- paste0(phaseDIR, "chr", i, ".haps ")
OUTPUT_SAMPLE <- paste0(phaseDIR, "chr", i, ".sample ")
OUTPUT_LOG <- paste0(phaseDIR, "chr", i, ".log ")
autosomeCode = seq_len(22)
if (is.element(i, autosomeCode)) { ## prePhasing for the autosome
system(paste0(shapeit,
" --input-bed ", GWASDATA_BED, GWASDATA_BIM, GWASDATA_FAM, " \ ",
" --input-map ", GENMAP_FILE, " \ ",
"--input-ref ", HAPS_FILE, LEGEND_FILE, SAMPLE_FILE, " \ ",
"--thread ", nThread, " \ ",
"--effective-size ", effectiveSize, " \ ",
"--output-max ", OUTPUT_HAPS, OUTPUT_SAMPLE, " \ ",
"--output-log ", OUTPUT_LOG) )
} else if (i == "X_PAR1"){
system(paste0(shapeit,
" --input-bed ", GWASDATA_BED, GWASDATA_BIM, GWASDATA_FAM, " \ ",
" --input-map ", GENMAP.chrXPAR1, " \ ",
"--input-ref ", HAPS.chrXPAR1, LEGEND.chrXPAR1, SAMPLE_FILE, " \ ",
"--thread ", nThread, " \ ",
"--effective-size ", effectiveSize, " \ ",
"--output-max ", OUTPUT_HAPS, OUTPUT_SAMPLE, " \ ",
"--output-log ", OUTPUT_LOG) )
} else if (i == "X_PAR2"){
system(paste0(shapeit,
" --input-bed ", GWASDATA_BED, GWASDATA_BIM, GWASDATA_FAM, " \ ",
" --input-map ", GENMAP.chrXPAR2, " \ ",
"--input-ref ", HAPS.chrXPAR2, LEGEND.chrXPAR2, SAMPLE_FILE, " \ ",
"--thread ", nThread, " \ ",
"--effective-size ", effectiveSize, " \ ",
"--output-max ", OUTPUT_HAPS, OUTPUT_SAMPLE, " \ ",
"--output-log ", OUTPUT_LOG) )
} else if (i == 23){
system(paste0(shapeit,
" --input-bed ", GWASDATA_BED, GWASDATA_BIM, GWASDATA_FAM, " \ ",
" --input-map ", GENMAP.chrXnonPAR, " \ ",
" --chrX \ ", ## special case
"--input-ref ", HAPS.chrXnonPAR, LEGEND.chrXnonPAR, SAMPLE_FILE, " \ ",
"--thread ", nThread, " \ ",
"--effective-size ", effectiveSize, " \ ",
"--output-max ", OUTPUT_HAPS, OUTPUT_SAMPLE, " \ ",
"--output-log ", OUTPUT_LOG) )
}
}, mc.cores=nCore)
}
#' Impute genotypes using IMPUTE2
#'
#' @description
#' Perform imputation by IMPUTE2 for the autosomal and sex chromosome
#' prephased known haplotypes with a reference panel.
#' @param impute2 an executable program in either the current
#' working directory or somewhere in the command path.
#' @param chrs specifiy the chromosome codes for imputation.
#' @param prefixChunk the prefix of the chunk files for each chromosome,
#' along with the proper location directory.
#' @param phaseDIR the directory where prephased haplotypes are located.
#' @param referencePanel a string indicating the type of imputation
#' reference panels is used: c("1000Gphase1v3_macGT1", "1000Gphase3").
#' @param impRefDIR the directory where the imputation reference files
#' are located.
#' @param imputedDIR the directory where imputed files will be located.
#' @param prefix4eachChr the prefix of IMPUTE2 files for each chunk.
#' @param nCore the number of cores used for computation.
#' @param effectiveSize this parameter controls the effective population size.
#' Commonly denoted as Ne. A universal -Ne value of 20000 is suggested.
#' @return The imputed files for all chunks from the given chromosomes.
#' @details If ChrX is available then it is done automatically by passing
#' the flag --chrX to IMPUTE2.
#' @export
#' @import doParallel
#' @author Junfang Chen
#' @seealso \code{\link{phaseImpute2}}.
.imputedByImpute2 <- function(impute2, chrs, prefixChunk, phaseDIR,
referencePanel, impRefDIR, imputedDIR,
prefix4eachChr, nCore, effectiveSize=20000){
for (i in chrs){
chunkfn <- paste0(prefixChunk, i, ".txt")
chunks <- read.table(chunkfn, sep=" ")
chunklist <- as.list(seq_len(nrow(chunks)))
mclapply(chunklist, function(j){
chunkSTART <- chunks[j,1]
chunkEND <- chunks[j,2]
## Input: haplotypes from SHAPEIT phasing (method B)
GWAS_HAPS_FILE <- paste0(phaseDIR, "chr", i, ".haps ")
GWAS_SAMP_FILE <- paste0(phaseDIR, "chr", i, ".sample ")
## >>> ref panel
GENMAP_FILE <- paste0(impRefDIR, "genetic_map_chr", i,
"_combined_b37.txt ")
GENMAP.chrXnonPAR <- paste0(impRefDIR, "genetic_map_chr",
"X_nonPAR_combined_b37.txt ")
GENMAP.chrXPAR1 <- paste0(impRefDIR, "genetic_map_chr",
"X_PAR1_combined_b37.txt ")
GENMAP.chrXPAR2 <- paste0(impRefDIR, "genetic_map_chr",
"X_PAR2_combined_b37.txt ")
if (referencePanel == "1000Gphase1v3_macGT1"){
## autosome
impPrefix <- "ALL_1000G_phase1integrated_v3_chr"
HAPS_FILE <- paste0(impRefDIR, impPrefix, i,
"_impute_macGT1.hap.gz ")
LEGEND_FILE <- paste0(impRefDIR, impPrefix, i,
"_impute_macGT1.legend.gz ")
## chrX_nonPAR
HAPS.chrXnonPAR <- paste0(impRefDIR, impPrefix,
"X_nonPAR_impute_macGT1.hap.gz ")
LEGEND.chrXnonPAR <- paste0(impRefDIR, impPrefix,
"X_nonPAR_impute_macGT1.legend.gz ")
## .chrXPAR1
HAPS.chrXPAR1 <- paste0(impRefDIR, impPrefix,
"X_PAR1_impute_macGT1.hap.gz ")
LEGEND.chrXPAR1 <- paste0(impRefDIR, impPrefix,
"X_PAR1_impute_macGT1.legend.gz ")
## .chrXPAR2
HAPS.chrXPAR2 <- paste0(impRefDIR, impPrefix,
"X_PAR2_impute_macGT1.hap.gz ")
LEGEND.chrXPAR2 <- paste0(impRefDIR, impPrefix,
"X_PAR2_impute_macGT1.legend.gz ")
} else if (referencePanel == "1000Gphase3"){
HAPS_FILE <- paste0(impRefDIR, "1000GP_Phase3_chr", i,
".hap.gz ")
## autosome
LEGEND_FILE <- paste0(impRefDIR, "1000GP_Phase3_chr", i,
".legend.gz ")
## chrX_nonPAR
HAPS.chrXnonPAR <- paste0(impRefDIR,
"1000GP_Phase3_chrX_NONPAR.hap.gz ")
LEGEND.chrXnonPAR <- paste0(impRefDIR,
"1000GP_Phase3_chr",
"X_NONPAR.legend.gz ")
## .chrXPAR1
HAPS.chrXPAR1 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR1.hap.gz ")
LEGEND.chrXPAR1 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR1.legend.gz ")
## .chrXPAR2
HAPS.chrXPAR2 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR2.hap.gz ")
LEGEND.chrXPAR2 <- paste0(impRefDIR,
"1000GP_Phase3_chrX_PAR2.legend.gz ")
} else { print("Wrong reference panel during imputation!!")}
## <<< ref panel
## main output file
OUTPUT_FILE <- paste0(imputedDIR, prefix4eachChr, i,
".pos", chunkSTART,
"-", chunkEND, ".impute2 ")
## impute genotypes from GWAS haplotypes
autosomeCode = seq_len(22)
if (is.element(i, autosomeCode)) {
## impute for the autosomes
system(paste0(impute2,
" -iter 30 \ ",
" -burnin 10 \ ",
" -k_hap 500 \ ",
" -use_prephased_g \ ",
" -m ", GENMAP_FILE, " \ ",
" -h ", HAPS_FILE, " \ ",
" -l ", LEGEND_FILE, " \ ",
" -known_haps_g ", GWAS_HAPS_FILE, " \ ",
" -Ne ", effectiveSize, " \ ",
" -int ", chunkSTART, " ", chunkEND, " \ ",
" -buffer 1000 \ ",
" -o ", OUTPUT_FILE, " \ ",
" -allow_large_regions \ ",
" -seed 367946 \ " ))
} else if (i == "X_PAR1"){
## impute for chrX PAR >> with an additional flag: --Xpar.
system(paste0(impute2,
" -iter 30 \ ",
" -burnin 10 \ ",
" -k_hap 500 \ ",
" -use_prephased_g \ ",
" -Xpar \ ", ########## special
" -m ", GENMAP.chrXPAR1, " \ ",
" -h ", HAPS.chrXPAR1, " \ ",
" -l ", LEGEND.chrXPAR1, " \ ",
" -known_haps_g ", GWAS_HAPS_FILE, " \ ",
" -Ne ", effectiveSize, " \ ",
" -int ", chunkSTART, " ", chunkEND, " \ ",
" -buffer 1000 \ ",
" -o ", OUTPUT_FILE, " \ ",
" -allow_large_regions \ ",
" -seed 367946 \ " ))
} else if (i == "X_PAR2"){
## impute for chrX PAR >> with an additional flag: --Xpar.
system(paste0(impute2,
" -iter 30 \ ",
" -burnin 10 \ ",
" -k_hap 500 \ ",
" -use_prephased_g \ ",
" -Xpar \ ", ########## special
" -m ", GENMAP.chrXPAR2, " \ ",
" -h ", HAPS.chrXPAR1, " \ ",
" -l ", LEGEND.chrXPAR1, " \ ",
" -known_haps_g ", GWAS_HAPS_FILE, " \ ",
" -Ne ", effectiveSize, " \ ",
" -int ", chunkSTART, " ", chunkEND, " \ ",
" -buffer 1000 \ ",
" -o ", OUTPUT_FILE, " \ ",
" -allow_large_regions \ ",
" -seed 367946 \ " ))
} else if (i == 23){
## impute for chrX
## >> additional flag: --chrX, and sample_known_haps_g
system(paste0(impute2,
" -iter 30 \ ",
" -burnin 10 \ ",
" -k_hap 500 \ ",
" -use_prephased_g \ ",
" -chrX \ ", ########### special
" -m ", GENMAP.chrXnonPAR, " \ ",
" -h ", HAPS.chrXnonPAR, " \ ",
" -l ", LEGEND.chrXnonPAR, " \ ",
" -known_haps_g ", GWAS_HAPS_FILE, " \ ",
" -sample_known_haps_g ", GWAS_SAMP_FILE, " \ ",
" -Ne ", effectiveSize, " \ ",
" -int ", chunkSTART, " ", chunkEND, " \ ",
" -buffer 1000 \ ",
" -o ", OUTPUT_FILE, " \ ",
" -allow_large_regions \ ",
" -seed 367946 \ " ))
} else { print(" wrong chromosome code during phasing!!") }
}, mc.cores=nCore)
}
}
#' Convert IMPUTE2 format files into PLINK format
#'
#' @description
#' Convert all chunks of IMPUTE2 format files into binary PLINK format using
#' GTOOL.
#' @param gtool an executable program in either the current
#' working directory or somewhere in the command path.
#' @param chrs specifiy the chromosome codes for conversion.
#' @param prefixChunk the prefix of the chunk files for each chromosome,
#' along with the location directory.
#' @param phaseDIR the directory where pre-phased files are located.
#' @param imputedDIR the directory where the imputated files are located.
#' @param prefix4eachChr the prefix of the input IMPUTE2 files and
#' also the output PLINK binary files for each chunk.
#' @param suffix4imputed the suffix of the IMPUTE2 format file that stores
#' the imputed value. Both ".impute2" and ".gen" are accepted.
#' @param postImputeDIR the directory where converted PLINK binary files
#' will be located.
#' @param threshold threshold for merging genotypes from GEN probability.
#' Default 0.9.
#' @param SNP A logical value indicating if the data is entirely comprised
#' single nucleotide polymorphisms then it can be set as TRUE and the genotypes
#' are expressed as pairs of A,C,G,T and unknowns are represented as N N.
#' @param nCore the number of cores used for computation.
#' @return The converted binary PLINK format files for each chunk from IMPUTE2
#' results.
##' @export
#' @import doParallel
#' @author Junfang Chen
.convertImpute2ByGtool <- function(gtool, chrs, prefixChunk,
phaseDIR, imputedDIR, prefix4eachChr,
suffix4imputed, postImputeDIR,
threshold, SNP = TRUE, nCore){
for (i in chrs){
chunkfn <- paste0(prefixChunk, i, ".txt")
chunks <- read.table(chunkfn, sep=" ")
chunklist <- as.list(seq_len(nrow(chunks)))
mclapply(chunklist, function(j){
chunkSTART <- chunks[j,1]
chunkEND <- chunks[j,2]
## INPUT data files
SAM_FILE <- paste0(phaseDIR, "chr", i, ".sample")
GEN_FILE <- paste0(imputedDIR, prefix4eachChr, i,
".pos", chunkSTART,
"-", chunkEND, suffix4imputed)
## output PLINK binary files
PED_FILE <- paste0(postImputeDIR, prefix4eachChr, i, ".pos",
chunkSTART, "-", chunkEND, ".ped")
MAP_FILE <- paste0(postImputeDIR, prefix4eachChr, i, ".pos",
chunkSTART, "-", chunkEND, ".map")
## converting chunk-wise
if (SNP == TRUE){
system( paste0(gtool, " -G ",
" --g ", GEN_FILE, " \ ",
" --s ", SAM_FILE, " \ ",
"--phenotype plink_pheno \ ",
"--chr ", i, " \ ",
"--ped ", PED_FILE, " \ ",
"--map ", MAP_FILE, " \ ",
"--threshold ", threshold, " \ ",
"--snp ") )
} else if (SNP == FALSE) {
system( paste0(gtool, " -G ",
" --g ", GEN_FILE, " \ ",
" --s ", SAM_FILE, " \ ",
"--phenotype plink_pheno \ ",
"--chr ", i, " \ ",
"--ped ", PED_FILE, " \ ",
"--map ", MAP_FILE, " \ ",
"--threshold ", threshold, " \ "
) )
} else {
stop("Wrong or no SNP indicator provided!!")
}
}, mc.cores=nCore)
}
}
#' Merge chunk-wise PLINK files
#'
#' @description
#' Merge all chunk-wise PLINK binary files into chromosome-wise PLINK
#' binary files then assemble into a genome-wide PLINK binary file set.
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param chrs specifiy the chromosome codes to be merged.
#' @param prefix4eachChr the prefix of the input chunk-wise PLINK
#' files.
#' @param prefix4mergedPlink the prefix of the final output PLINK
#' binary files.
#' @param nCore the number of cores used for computation.
#' @return The merged genome-wide PLINK binary files.
#' @details Create a file containing a list chunk-wise PLINK PED and MAP
#' file names. The prefix of these files must already indicate in which
#' chromosome they belong to and files from the same chromosome will be
#' combined. Then all chromosomal PLINK files are assembled together
#' into one whole genome PLINK binary file set.
##' @export
#' @import doParallel
#' @author Junfang Chen
.mergePlinkData <- function(plink, chrs, prefix4eachChr,
prefix4mergedPlink, nCore){
## firstly, only consider chromosomes from 1:23; as Xpar chrs
## are slightly different for processing.
pureAutoChrs <- setdiff(chrs, c("X_PAR1", "X_PAR2"))
chrslist <- as.list(pureAutoChrs)
mclapply(chrslist, function(i){
pedFile_chr <- system(paste0("ls ", prefix4eachChr, i, ".*.ped"),
intern=TRUE)
mapFile_chr <- system(paste0("ls ", prefix4eachChr, i, ".*.map"),
intern=TRUE)
pedmap_chr <- paste0(pedFile_chr, " ", mapFile_chr)
fA <- gsub(".ped", "", pedFile_chr[1])
pedmap_tobeMerged <- pedmap_chr[-1]
filesetname <- paste0("fileset_chr", i, ".txt")
write.table(pedmap_tobeMerged, file=filesetname, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --file ", fA, " --merge-list ", filesetname,
" --make-bed --out gwasImputed_chr", i))
}, mc.cores=nCore)
## combine chrX_PAR and convert into chr25
if (is.element(c("X_PAR1"), chrs) | is.element(c("X_PAR2"), chrs) ){
pedFile_chr <- system(paste0("ls ", prefix4eachChr, "X_PAR*.ped"),
intern=TRUE)
mapFile_chr <- system(paste0("ls ", prefix4eachChr, "X_PAR*.map"),
intern=TRUE)
pedmap_chr <- paste0(pedFile_chr, " ", mapFile_chr)
fA <- gsub(".ped", "", pedFile_chr[1])
pedmap_tobeMerged <- pedmap_chr[-1]
filesetname <- paste0("fileset_chr25.txt")
write.table(pedmap_tobeMerged, file=filesetname, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
arg <- paste0(plink, " --file ", fA, " --merge-list ", filesetname,
" --allow-extra-chr --make-bed --out gwasImputedOld25")
system(arg)
## update chr code for XPAR --> 25
bim <- read.table("gwasImputedOld25.bim", stringsAsFactors=FALSE)
updateSNPchr <- cbind(bim[,2], rep(25, length=nrow(bim)))
write.table(updateSNPchr, file="gwasImputed_newchr25.txt", quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile gwasImputedOld25 --allow-extra-chr ",
" --update-chr gwasImputed_newchr25.txt 2 1",
" --make-bed --out gwasImputed_chr25"))
# system("rm gwasImputedOld25.* gwasImputed_newchr25.txt")
# system( paste0("rm ", filesetname))
}
## combine all bed files
bedFile_chr <- system(paste0("ls gwasImputed_chr*.bed"), intern=TRUE)
bimFile_chr <- system(paste0("ls gwasImputed_chr*.bim"), intern=TRUE)
famFile_chr <- system(paste0("ls gwasImputed_chr*.fam"), intern=TRUE)
bfile_chr <- paste0(bedFile_chr, " ", bimFile_chr, " ", famFile_chr)
fA <- paste0(gsub(".bed", "", bedFile_chr[1]))
tobeMerged <- bfile_chr[-1]
mergefilesetname <- paste0("mergeGwasImputed.txt")
write.table(tobeMerged, file=mergefilesetname, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", fA, " --merge-list ", mergefilesetname,
" --make-bed --out ", prefix4mergedPlink))
}
#' Filter genetic variants
#'
#' @description
#' Filter out genetic variants accoring to the imputation quality score.
#'
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param suffix4impute2info the suffix of input IMPUTE2 generated files that
#' store the imputation quality score for each variant from .impute2_info files.
#' @param outputInfoFile the output file of impute2 info scores consisting of
#' two columns: all imputed SNPs and their info scores.
#' @param infoScore the cutoff of filtering imputation quality score for
#' each variant. The default value is 0.6.
#' @param inputPrefix the prefix of the input imputed PLINK binary files.
#' @param outputPrefix the prefix of the output filtered PLINK binary files.
#' @return A pure text file contains the info scores of all imputed SNPs with
#' two columns: SNP names and the corresponding info scores.
#' A pure text file with all excluded SNPs having bad info scores.
#' The filtered PLINK binary imputed files.
#' @details Filter genetic variants accoring to the imputation quality score
#' with the help of .impute2_info files generated by IMPUTE2.
#' Often, we keep variants with imputation info score of greater than 0.6.
#' Note that imputed SNPs with more than two alleles are not considered.
##' @export
#' @author Junfang Chen
.filterImputeData <- function(plink, suffix4impute2info, outputInfoFile,
infoScore=0.6, inputPrefix, outputPrefix){
## read each .impute2_info file, remove 1st line, add to another file
## and repeat get all impute2_info files for each chunk
files <- system(paste0("ls *", suffix4impute2info), intern=TRUE)
for (i in seq_len(length(files))) {
## impute2infoAllvariants.txt is the temporal file
system(paste0("sed 1d ", files[i], " >> impute2infoAllvariants.txt"))
}
## only keep SNPs and SNPs with two alleles
## impute2infoUpdateTmp.txt > temporal file
arg1 = paste0("grep 'rs' impute2infoAllvariants.txt ")
arg2 = paste0("awk '{if(length($4) == 1 && length($5) == 1) print}' ")
arg3 = paste0("awk '{print $2, $7}' > impute2infoUpdateTmp.txt")
system(paste0(arg1, " | ", arg2, " | ", arg3))
system(paste0("mv impute2infoUpdateTmp.txt ", outputInfoFile))
## added colnames
impute2info <- read.table(file=outputInfoFile, stringsAsFactors=FALSE)
colnames(impute2info) <- c("rsid", "info")
## filtering
snpWithBadInfo <- impute2info[which(impute2info[, "info"] < infoScore), 1]
badImputeSNPfile <- "badImputeSNPs.txt"
write.table(snpWithBadInfo, file=badImputeSNPfile, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
## extract filtered SNPs
system(paste0(plink, " --bfile ", inputPrefix, " --exclude ",
badImputeSNPfile, " --make-bed --out ", outputPrefix))
# system("rm impute2infoAllvariants.txt")
system(paste0("rm ", badImputeSNPfile))
}
#' Extract info score
#'
#' @description
#' Extract info score file summarized from the output of IMPUTE2.
#'
#' @param suffix4impute2info the suffix of input IMPUTE2 generated files that
#' store the imputation quality score for each variant from .impute2_info files.
#' @param outputInfoFile the output file of impute2 info scores consisting of
#' two columns: all imputed SNPs and their info scores.
#' @return A pure text file contains the info scores of all imputed SNPs with
#' two columns: SNP names and the corresponding info scores. The header names
#' are "rsid" and "info".
##' @export
#' @author Junfang Chen
.getInfoScoreImpute2 <- function(suffix4impute2info, outputInfoFile){
## read each .impute2_info file, remove 1st line, add to another file
## and repeat get all impute2_info files for each chunk
files <- system(paste0("ls *", suffix4impute2info), intern=TRUE)
for (i in seq_len(length(files))) {
## impute2infoAllvariants.txt is the temporal file
system(paste0("sed 1d ", files[i], " >> impute2infoAllvariants.txt"))
}
## only keep SNPs and SNPs with two alleles
arg1 = paste0("grep 'rs' impute2infoAllvariants.txt ")
arg2 = paste0("awk '{if(length($4) == 1 && length($5) == 1) print}' ")
arg3 = paste0("awk '{print $2, $7}' > impute2infoUpdateTmp.txt")
system(paste0(arg1, " | ", arg2, " | ", arg3))
## add header
system("sed -i '1i rsid info' impute2infoUpdateTmp.txt")
system(paste0("mv impute2infoUpdateTmp.txt ", outputInfoFile))
}
#' Filter genetic variants
#'
#' @description
#' Filter out genetic variants accoring to the info score.
#'
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param outputInfoFile the output file of info scores consisting of
#' two columns: SNP names and their info scores. The headers are
#' c("rsid", "info").
#' @param infoScore the cutoff of filtering imputation quality score for
#' each variant. The default value is 0.6.
## #' @param badImputeSNPfile the output file of SNPs with bad info scores.
#' @param inputPrefix the prefix of the input imputed PLINK binary files.
#' @param outputPrefix the prefix of the output filtered PLINK binary files.
#' @return A pure text file contains the info scores of all imputed SNPs with
#' two columns: SNP names and the corresponding info scores.
#' A pure text file with all excluded SNPs having bad info scores.
#' The filtered PLINK binary imputed files,
#' @details Filter genetic variants accoring to the imputation quality score
#' with the help of .impute2_info files generated by IMPUTE2.
#' Often, we keep variants with imputation info score of greater than 0.6.
#' Note that imputed SNPs with more than two alleles are not considered.
#' @export
#' @author Junfang Chen
.filterImputeData2 <- function(plink, outputInfoFile, infoScore=0.6,
inputPrefix, outputPrefix){
## with colnames: rsid, info
impute2info <- read.table(file=outputInfoFile,
stringsAsFactors=FALSE, header=TRUE)
impute2info[,2] <- round(impute2info[,2], 3)
## filtering
snpWithBadInfo <- impute2info[which(impute2info[, "info"] < infoScore), 1]
badImputeSNPfile <- "badImputeSNPs.txt"
write.table(snpWithBadInfo, file=badImputeSNPfile, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
## extract filtered SNPs
system(paste0(plink, " --bfile ", inputPrefix, " --exclude ",
badImputeSNPfile, " --make-bed --out ", outputPrefix))
system(paste0("rm ", badImputeSNPfile))
}
#' Remove SNPs after post imputation
#'
#' @description
#' Remove SNPs which have a non missing value for less than a predefined
#' number of instances.
#'
#' @param plink an executable program in either the current working
#' directory or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param missCutoff the cutoff of the least number of instances for
#' a SNP that is not missing. The default is 20.
#' @param outRemovedSNPfile the output file of SNPs with pre-defined
#' missing values that are removed.
#' @param outRetainSNPfile the output file of SNPs that are retained.
#' @param outputPrefix the prefix of the PLINK binary files.
#' @return The PLINK binary files after post imputation quality control
#' and a pure text file contains SNPs with pre-defined missing values.
#' @export
#' @author Junfang Chen
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "alignedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "alignedData.bim", package="Gimpute")
#' famFile <- system.file("extdata", "alignedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, " ."))
#' system(paste0("scp ", bimFile, " ."))
#' system(paste0("scp ", famFile, " ."))
#' inputPrefix <- "alignedData"
#' outputPrefix <- "removedSnpMissPostImp"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpMissPostImp(plink, inputPrefix, missCutoff=20,
#' ## outRemovedSNPfile, outRetainSNPfile, outputPrefix)
#' ##
removedSnpMissPostImp <- function(plink, inputPrefix, missCutoff,
outRemovedSNPfile, outRetainSNPfile,
outputPrefix){
## get the missing info
system(paste0(plink, " --bfile ", inputPrefix,
" --missing --out ", inputPrefix))
missSNPinfo <- read.table(paste0(inputPrefix, ".lmiss"),
stringsAsFactors=FALSE, header=TRUE)
missSNPinfo[,6] <- missSNPinfo[,"N_GENO"] - missSNPinfo[,"N_MISS"]
manyMissSNPs <- missSNPinfo[which(missSNPinfo[,6] < missCutoff), "SNP"]
write.table(manyMissSNPs, file=outRemovedSNPfile, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
snpRetain <- setdiff(missSNPinfo[,"SNP"], manyMissSNPs)
write.table(snpRetain, file=outRetainSNPfile, quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", inputPrefix, " --exclude ",
outRemovedSNPfile, " --make-bed --out ", outputPrefix))
system( "rm *.imiss *.lmiss *.log")
}
#' Phasing and imputation
#'
#' @description
#' Perform phasing, imputation and conversion from IMPUTE2 or GEN format into
#' PLINK binary files.
#' @param inputPrefix the prefix of the input PLINK binary files for
#' the imputation.
#' @param outputPrefix the prefix of the output PLINK binary files
#' after imputation.
#' @param autosome a logical value indicating if only autosomal chromosomes
#' are imputed.
#' @param plink an executable program in either the current
#' working directory or somewhere in the command path.
#' @param shapeit an executable program in either the current
#' working directory or somewhere in the command path.
#' @param imputeTool a string indicating the type of imputation tool is used:
#' "impute2" or "impute4".
#' @param impute an executable program in either the current
#' working directory or somewhere in the command path. It can be either
#' "impute2" or "impute4".
#' @param qctool an executable program in either the current working
#' directory or somewhere in the command path. This is only used if imputeTool
#' is "impute4".
#' @param gtool an executable program in either the current
#' working directory or somewhere in the command path.
#' @param windowSize the window size of each chunk.
#' The default value is 3000000.
#' @param effectiveSize this parameter controls the effective population size.
#' Commonly denoted as Ne. A universal -Ne value of 20000 is suggested.
#' @param nCore the number of cores used for splitting chromosome by PLINK,
#' phasing, imputation, genotype format modification, genotype conversion, and
#' merging genotypes. The default value is 40.
#' @param threshold threshold for merging genotypes from GEN probability.
#' Default 0.9.
#' @param outputInfoFile the output file of impute2 info scores consisting of
#' two columns: all imputed SNPs and their info scores.
#' @param SNP A logical value indicating if the data is entirely comprised
#' single nucleotide polymorphisms then it can be set as TRUE and the genotypes
#' are expressed as pairs of A,C,G,T and unknowns are represented as N N.
#' @param referencePanel a string indicating the type of imputation
#' reference panels is used: "1000Gphase1v3_macGT1" or "1000Gphase3".
#' @param impRefDIR the directory where the imputation reference files
#' are located.
#' @param tmpImputeDir the name of the temporary directory used for
#' storing phasing and imputation results.
#' @param keepTmpDir a logical value indicating if the directory
#' 'tmpImputeDir' should be kept or not. The default is TRUE.
#' @return Note that chromosome X is not supported for the impute4.
#' 1.) The filtered imputed PLINK binary files;
#' 2.) The final PLINK binary files including bad imputed variants;
#' 3.) A pure text file contains the info scores of all imputed SNPs with
#' two columns: SNP names and the corresponding info scores.
#' @details The whole imputation process mainly consists of the following
#' steps:
#' 1.) Phasing the input PLINK data using an existing imputation reference;
#' 2.) Imputing the input PLINK data using phased results and an existing
#' reference data;
#' 3.) Converting IMPUTE2 or GEN format data into PLINK format.
#' 4.) Combining all imputed data into whole-genome PLINK binary files.
#' 5.) Filtering out imputed variants with bad imputation quality.
#' Parallel computing in R is supported.
#' @export
#' @import doParallel
#' @author Junfang Chen
#' @references
#' \enumerate{
#' \item Howie, B., et al. (2012). Fast and accurate genotype imputation
#' in genome-wide association studies through pre-phasing. Nat Genet
#' 44(8): 955-959.
#' \item Howie, B. N., et al. (2009). A flexible and accurate genotype
#' imputation method for the next generation of genome-wide association
#' studies. PLoS Genet 5(6): e1000529.
#' \item Bycroft, C., et al. Genome-wide genetic data on~ 500,000 UK Biobank
#' participants. BioRxiv (2017): 166298.
#' }
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "alignedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "alignedData.bim", package="Gimpute")
#' famFile <- system.file("extdata", "alignedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, " ."))
#' system(paste0("scp ", bimFile, " ."))
#' system(paste0("scp ", famFile, " ."))
#' inputPrefix <- "alignedData"
#' outputPrefix <- "gwasImputed"
#' outputInfoFile <- "infoScore.txt"
#' tmpImputeDir <- "tmpImpute"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## phaseImpute(inputPrefix, outputPrefix, autosome=TRUE,
#' ## plink, shapeit, imputeTool, impute, qctool, gtool,
#' ## windowSize=3000000, effectiveSize=20000,
#' ## nCore=40, threshold=0.9, outputInfoFile, SNP=TRUE,
#' ## referencePanel, impRefDIR, tmpImputeDir, keepTmpDir=TRUE)
phaseImpute <- function(inputPrefix, outputPrefix, autosome=TRUE,
plink, shapeit, imputeTool, impute, qctool, gtool,
windowSize=3000000, effectiveSize=20000,
nCore=40, threshold=0.9, outputInfoFile, SNP=TRUE,
referencePanel, impRefDIR,
tmpImputeDir, keepTmpDir=TRUE){
## create directories for storing tmp imputation output files
## The name of these directories must be fixed for the sake of
## the subsequent steps.
system(paste0("mkdir ", tmpImputeDir))
setwd(tmpImputeDir) ##
## sub-directories
system("mkdir 1-dataFiles")
system("mkdir 2-chunkFile")
system("mkdir 3-phaseResults")
system("mkdir 4-imputeResults")
system("mkdir 5-postImpute")
# define directories
dataDIR <- "1-dataFiles/"
chunkDIR <- "2-chunkFile/"
phaseDIR <- "3-phaseResults/"
imputedDIR <- "4-imputeResults/"
postImputeDIR <- "5-postImpute/"
setwd("..")
## step 2.1
## copy plink files without monomorphic SNPs; prepare for the imputation.
prefix4eachChr <- "gwas_data_chr"
system(paste0("scp ", inputPrefix, ".* ./", tmpImputeDir, "/", dataDIR))
setwd(paste0("./", tmpImputeDir, "/", dataDIR))
renamePlinkBFile(inputPrefix, outputPrefix=prefix4eachChr, action="move")
bimCurrent <- read.table(file=paste0(prefix4eachChr, ".bim"),
stringsAsFactors=FALSE)
currentChr <- names(table(bimCurrent[,1]))
print(currentChr)
chrXPAR1suffix <- "X_PAR1"
chrXPAR2suffix <- "X_PAR2"
## nCore is chosen as the number of chromosomes available
PAR <- chrWiseSplit(plink, inputPrefix=prefix4eachChr, chrXPAR1suffix,
chrXPAR2suffix, nCore)
print(PAR)
if (PAR[[1]]) {par1 <- "X_PAR1"} else {par1 <- NULL}
if (PAR[[2]]) {par2 <- "X_PAR2"} else {par2 <- NULL}
## step 2.2
chunkPrefix <- "chunks_chr"
if (autosome == TRUE){
chrs <- intersect(currentChr, seq_len(22))
} else {
## replace chr25 by pseudo-autosomal region of X
chrs <- setdiff(c(currentChr, par1, par2), 25)
}
chunk4eachChr(inputPrefix=prefix4eachChr,
outputPrefix=chunkPrefix, chrs, windowSize)
setwd("..")
system(paste0("mv ", dataDIR, chunkPrefix, "*.txt ", chunkDIR))
## step 2.3
.prePhasingByShapeit(shapeit, chrs, dataDIR,
prefix4eachChr, referencePanel,
impRefDIR, phaseDIR, nThread=nCore,
effectiveSize, nCore=1)
## step 2.4
prefixChunk <- paste0(chunkDIR, chunkPrefix)
if (imputeTool == "impute2"){
.imputedByImpute2(impute2=impute, chrs, prefixChunk, phaseDIR, referencePanel,
impRefDIR, imputedDIR, prefix4eachChr,
nCore, effectiveSize)
## step 2.5 ## extract only SNPs (without INDELs)
#######################################################
setwd(imputedDIR)
## extract only SNPs starting with "rs"; .
ls <- system("ls gwas*.impute2", intern=TRUE)
snpPrefix <- "rs"
biglists <- as.list(ls)
mclapply(biglists, function(i){
arg1 <- paste0(" awk '{if(length($4) == 1 && length($5) == 1) print}'")
arg2 <- paste0(i, "noINDEL.impute2")
system(paste0("grep '", snpPrefix, "' ", i, " | ", arg1, " > ", arg2))
}, mc.cores=nCore) ## by default
suffix4imputed <- ".impute2noINDEL.impute2"
suffix4impute2info <- ".impute2_info"
.getInfoScoreImpute2(suffix4impute2info, outputInfoFile)
setwd("..")
} else if (imputeTool == "impute4"){
## chrX is not available for impute4.
if (is.element(23, chrs) == TRUE) { chrs <- setdiff(chrs, 23) }
.imputedByImpute4(impute4=impute, chrs, prefixChunk, phaseDIR, referencePanel,
impRefDIR, imputedDIR, prefix4eachChr,
nCore, effectiveSize)
## step 2.5 ## extract only SNPs (without INDELs)
#######################################################
setwd(imputedDIR)
## extract only SNPs starting with "rs"; .
ls <- system("ls gwas*.gen", intern=TRUE)
snpPrefix <- "rs"
biglists <- as.list(ls)
mclapply(biglists, function(i){
arg1 <- paste0(" awk '{if(length($4) == 1 && length($5) == 1) print}'")
arg2 <- paste0(i, "NoINDEL.gen")
system(paste0("grep '", snpPrefix, "' ", i, " | ", arg1, " > ", arg2))
}, mc.cores=nCore) ## by default
suffix4imputed <- ".genNoINDEL.gen"
## compute info score for each chunk, then combine all info scores
computeInfoByQctool(qctool, inputSuffix=suffix4imputed, outputInfoFile)
setwd("..")
} else {
print("Wrong imputation tool or no imputation tool is provided!")
}
.convertImpute2ByGtool(gtool, chrs, prefixChunk, phaseDIR, imputedDIR,
prefix4eachChr, suffix4imputed,
postImputeDIR, threshold, SNP, nCore)
## step 2.6
#######################################################
## Modify missing genotype format.
setwd(postImputeDIR)
## replace 'N' in the .ped files into 0 > missing values.
chrslist <- as.list(chrs)
fn <- mclapply(chrslist, function(i){
system(paste0("sed -i 's/N/0/g' ", prefix4eachChr, i, ".*ped "))
}, mc.cores=nCore) ## by default
prefixMerge <- "gwasMerged"
.mergePlinkData(plink, chrs, prefix4eachChr,
prefixMerge, nCore)
## fam IDs may be changed: a.) if IDs have 'N';
## b.) IID, FID may be switched.
## >> update this as below
## the original PLINK files before imputation
## >> If nothing has changed (the above two cases not vary),
## >> then check if the order of IDs has been changed
setwd("..")
system(paste0("scp ", dataDIR, prefix4eachChr, ".fam ", postImputeDIR))
setwd(postImputeDIR)
## update FAM IDs in the imputed PLINK files
famOrig <- read.table(paste0(prefix4eachChr, ".fam"),
stringsAsFactors=FALSE)
famImpute <- read.table(paste0(prefixMerge,".fam"),
stringsAsFactors=FALSE)
FIDequal <- all.equal(famOrig[,"V1"], famImpute[,"V1"])
IIDequal <- all.equal(famOrig[,"V2"], famImpute[,"V2"])
if (FIDequal == TRUE & IIDequal == TRUE) {
message("All sample IDs after imputation are ordered identical!")
renamePlinkBFile(prefixMerge, outputPrefix, action="copy")
} else {
sumFID <- length(intersect(famOrig[,"V1"], famImpute[,"V1"]))
sumIID <- length(intersect(famOrig[,"V2"], famImpute[,"V2"]))
if ( sumFID == nrow(famOrig) & sumIID == nrow(famOrig) ) {
message("Sample IDs after imputation in .fam file swapped!")
ID4sort <- famOrig[,c("V1", "V2")]
write.table(ID4sort, file="ID4sort.txt", quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", prefixMerge,
" --indiv-sort f ID4sort.txt --make-bed --out ", outputPrefix))
} else {
message("Sample IDs have changed!!")
## changes ID codes for individuals specified in recoded.txt,
## which should be in the format of 4 cols per row:
## old FID, old IID, new FID, new IID, e.g.
recodMat <- cbind(famImpute[,c("V1", "V2")], famOrig[,c("V1", "V2")])
write.table(recodMat, file="recoded.txt", quote=FALSE,
row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
system(paste0(plink, " --bfile ", prefixMerge,
" --update-ids recoded.txt --make-bed --out ", outputPrefix))
}
}
#######################################################
setwd("..")
setwd("..")
system(paste0("mv ", tmpImputeDir, "/", postImputeDIR, outputPrefix, ".* ."))
system(paste0("mv ", tmpImputeDir, "/", imputedDIR, outputInfoFile, " ."))
if (keepTmpDir == FALSE){
system(paste0("rm -r ", tmpImputeDir))
} else if (keepTmpDir == TRUE){
print("Keep the intermediate imputation folder.")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.