Gimpute: Gimpute: An efficient genetic data processing and imputation pipeline

# File   : runTests.R
# Author : Junfang Chen
# Version0: 28 Jun 2016
# VersionX: 27 Feb 2019

library(Gimpute)
library(lattice)
library(doParallel)

## Start an R session in a directory where you'd like to generate the data.
system("mkdir 1-genoUpdate")
system("mkdir 2-genoQC")
system("mkdir 3-checkAlign")
system("mkdir 4-imputation")
system("mkdir 5-reductAndExpand")
system("mkdir 6-finalResults")

## Define the directory where you place the imputation reference files 
## Reference panel 1
referencePanel <- "1000Gphase1v3_macGT1" ## indicator
impRefDIR1kGp1v3 <- "/data/noether/dataRawReadOnly/reference/1000GP_phase1v3/"
impRefDIR <- impRefDIR1kGp1v3

#  # Alternative Reference panel 2
# referencePanel <- "1000Gphase3" ## indicator 
# impRefDIR1kGp3 <- "/data/noether/dataRawReadOnly/reference/1000GP_Phase3/"
# impRefDIR <- paste0(impRefDIR1kGp3, "1000GP_Phase3/")

## Define required tools
toolDIR <- "/home/junfang.chen/Gimpute/tools/"
plink <- paste0(toolDIR, "plink")
gcta <- paste0(toolDIR, "gcta64") 
shapeit <- paste0(toolDIR, "shapeit") 
gtool <- paste0(toolDIR, "gtool") 
qctool <- paste0(toolDIR, "qctool")
imputeTool <- "impute4"
if (imputeTool == "impute2"){ 
    impute <- paste0(toolDIR, "impute2") 
} else if (imputeTool == "impute4"){ 
    impute <- paste0(toolDIR, "impute4.1_r291.2")
} 


############################################################
## Run the following code, only if you have the above tools 
## and the imputation reference files, configuration files.
############################################################
  
############################################################
## code chunk number 1: SNP information update  
############################################################
runTimeList <- list()
t4genoUpdateTmp <- proc.time()   
## step 0
## Load PLINK binary files and additional files from Gimpute.
setwd("./1-genoUpdate/") 
bedFile <- system.file("extdata", "dataChr21.bed", package="Gimpute")
bimFile <- system.file("extdata", "dataChr21.bim", package="Gimpute") 
famFile <- system.file("extdata", "dataChr21.fam", package="Gimpute")
system(paste0("scp ", bedFile, " ."))   
system(paste0("scp ", bimFile, " ."))   
system(paste0("scp ", famFile, " ."))   

metadataFile <- system.file("extdata", "1_01_metaData.txt", package="Gimpute")
removedSampIDFile <- system.file("extdata", "excludedSampIDs.txt", 
                                 package="Gimpute")
excludedProbeIdsFile <- system.file("extdata", "excludedProbeIDs.txt", 
                                    package="Gimpute")
## Genotyping chip annotation file 
chipAnnoFile <- system.file("extdata", "coriellAffyChip.txt", 
                                 package="Gimpute")

## step 1  
system(paste0("scp ", metadataFile, " ."))  
 
 
############################################################ 
## pipeline function
inputPrefix <- "dataChr21"
ancestrySymbol <- "EUR"
outputPrefix <- "1_11_removedYMtSnp" 
metaDataFile <- "1_01_metaData.txt"
chipType <- "rsIDstudy"
updateGenoInfo(plink, inputPrefix, metaDataFile, removedSampIDFile,
               ancestrySymbol, excludedProbeIdsFile, chipAnnoFile,
               chipType, outputPrefix, keepInterFile=TRUE)
setwd("..")   

## runtime
t4genoUpdate <- proc.time() - t4genoUpdateTmp
print(t4genoUpdate)
runTimeList$t4genoUpdate <- t4genoUpdate

t4genoQCTmp <- proc.time() 

############################################################
### code chunk number 2: Quality Control
############################################################
## step 0
## copy the last output plink files from 1-genoUpdate
inputPrefix4QC <- "1_11_removedYMtSnp"
system(paste0("scp ./1-genoUpdate/", inputPrefix4QC, ".*", " ./2-genoQC/"))
setwd("./2-genoQC/")

## step 1
inputPrefix <- inputPrefix4QC
hhCutOff <- 0.005 ##  can be tuned
outputPrefix <- "2_01_removedSnpHetX" 
outputHetSNPfile <- "2_01_snpHHfreqAll.txt"
outputRetainSNPfile <- "2_01_snpHHfreqRetained.txt"
removedSnpHetX(plink, inputPrefix, hhCutOff, outputPrefix, 
               outputHetSNPfile, outputRetainSNPfile)


## step 2  2_02_removedHetXInst
inputPrefix <- "2_01_removedSnpHetX"
hhSubjCutOff <- 15
outputPrefix <- "2_02_removedInstHetX"
outputSubjHetFile <- "2_02_instHetXfreqAll.txt" 
outputRetainSubjectFile <- "2_02_instHetXfreqRetained.txt"  
outputHetSNPfile <- "2_02_snpHHfreqAll.txt"
removedMaleHetX(plink, inputPrefix, hhSubjCutOff, 
                outputPrefix, outputSubjHetFile, 
                outputRetainSubjectFile, outputHetSNPfile) 

inputPrefix <- "2_02_removedInstHetX"
outputPrefix <- "2_13_removedSnpHweFemaleX" 
genoQC(plink, inputPrefix, 
       snpMissCutOffpre=0.05, 
       sampleMissCutOff=0.02, 
       Fhet=0.2, cutoffSubject=0.05, cutoffSNP=0.1,
       snpMissCutOffpost=0.02, 
       snpMissDifCutOff=0.02,
       femaleChrXmissCutoff=0.05, 
       pval4autoCtl=0.000001, 
       pval4femaleXctl=0.000001, outputPrefix, keepInterFile=TRUE)
 
## step 13 
inputPrefix <- "2_13_removedSnpHweFemaleX" 
outputPC4subjFile <- "2_14_eigenvalAfterQC.txt"
outputPCplotFile <- "2_14_eigenvalAfterQC.png"
nCores <- detectCores() 
plotPCA4plink(gcta, inputPrefix, nThread=nCores, 
              outputPC4subjFile, outputPCplotFile)


############################################################

## Run the following code, only after detecting the potential 
## PCA outliers. Otherwise, set cutoff as "NULL"
############################################################

## This step is optional
## remove outliers 
# cutoff <-  NULL
cutoff <-  c(-0.4, 0.2)
cutoffSign <- "greater" ## not used if cutoff == NULL, and with two values 
inputPC4subjFile <- "2_14_eigenvalAfterQC.txt"
outputPC4outlierFile <- "2_14_eigenval4outliers.txt"
outputPCplotFile <- "2_14_removedOutliers.png"
outputPrefix <- "2_14_removedOutliers" 
nCores <- detectCores() 
removeOutlierByPCs(plink, gcta, inputPrefix, nThread=nCores, 
                   cutoff, cutoffSign, inputPC4subjFile, 
                   outputPC4outlierFile, outputPCplotFile, outputPrefix) 

setwd("..")

## runtime
t4genoQC <- proc.time() - t4genoQCTmp
print(t4genoQC)
runTimeList$t4genoQC <- t4genoQC
t4checkAlignTmp <- proc.time()   

############################################################
### code chunk number 3: check the alignment
############################################################   
system("cp ./2-genoQC/2_14_removedOutliers.* ./3-checkAlign/ ")
setwd("./3-checkAlign/")
renamePlinkBFile(inputPrefix="2_14_removedOutliers", 
                 outputPrefix="3_1_QCdata", action="move")


inputFile <- paste0(impRefDIR,"*.legend.gz")  
## To recode the intermediate disk usage >> keep in the same directory.
bimReferenceFile <- "bimImputeRef.txt"

## If not available  >>> 
.prepareLegend2bim(inputFile, referencePanel, 
                   outputFile=bimReferenceFile, ncore=nCores) 
## If not available  <<< 

inputPrefix <- "3_1_QCdata" 
out2.snp <- "3_2_snpSameNameDiffPos"
out2 <- "3_2_removedSnpSameNameDiffPos"
out3 <- "3_3_removedSnpMissPos"
out3.snp <- "3_3_snpMissPos"
out4 <- "3_4_removedSnpDiffAlleles"
out4.snp <- "3_4_snpDiffAlleles"
out4.snpRetained <- "3_4_snpImpRefAlleles"
nCores <- detectCores() 
checkAlign2ref(plink, inputPrefix, referencePanel, bimReferenceFile, 
               out2, out2.snp, out3, out3.snp, 
               out4, out4.snp, out4.snpRetained, nCore=nCores)
setwd("..") 

## runtime
t4checkAlign <- proc.time() - t4checkAlignTmp
print(t4checkAlign)
runTimeList$t4checkAlign <- t4checkAlign

t4imputeTmp <- proc.time()   

############################################################
### code chunk number 4: Imputation
############################################################
## step 1 
## Remove monomorphic SNPs from lifted/QC-ed data  
prefixAlign2ref <- "3_4_removedSnpDiffAlleles" 
outputPrefix <- "4_1_removedMonoSnp"
outputMonoSNPfile <- "4_1_snpMonoRemoved.txt" # will be used in step 4,5.

## copy plink files from last step; 
system(paste0("cp ./3-checkAlign/", 
       prefixAlign2ref, ".* ./4-imputation/"))
## remove Monomorphic SNPs
setwd("4-imputation")
removedMonoSnp(plink, inputPrefix=prefixAlign2ref, 
               outputPrefix, outputSNPfile=outputMonoSNPfile) 
# step 2 
#########################################################################
######################################################################### 
# imputation main pipeline
 
inputPrefix <- "4_1_removedMonoSnp"  
outputPrefix <- "4_2_imputedDataset"   
outputInfoFile <- "4_2_snpImputedInfoScore.txt"
tmpImputeDir <- paste0(imputeTool, "_", referencePanel)
nCores <- detectCores()
phaseImpute(inputPrefix, outputPrefix, autosome=TRUE,
            plink, shapeit, imputeTool, impute, qctool, gtool, 
            windowSize=3000000, effectiveSize=20000, 
            nCore=nCores, threshold=0.9, outputInfoFile, SNP=TRUE,
            referencePanel, impRefDIR, tmpImputeDir, keepTmpDir=TRUE)

##################################################### After imputation
## runtime
t4impute <- proc.time() - t4imputeTmp
print(t4impute)
runTimeList$t4impute <- t4impute
t4postImputeTmp <- proc.time()   



inputPrefix <- "4_2_imputedDataset" 
out1 <- "4_3_wellImputeData"
out2 <- "4_4_removedMonoSnpAfter"
out3 <- "4_5_addedMonoSnpAfter"
out4 <- "4_6_removedSnpMissPostImp" 
outputInfoFile <- "4_2_snpImputedInfoScore.txt"
outputMonoSNPfile <- "4_1_snpMonoRemoved.txt"  
prefixAlign2ref <- "3_4_removedSnpDiffAlleles" 
outRemovedSNPfile <- "4_6_snpRemovedMissPostImp.txt"
outRetainSNPfile <- "4_6_snpRetainMissPostImp.txt"

postImpQC(plink, inputPrefix, out1, out2, out3, out4,
          outputInfoFile, infoScore=0.6, outputMonoSNPfile, 
          prefixAlign2ref, missCutoff=20, outRemovedSNPfile, 
          outRetainSNPfile, referencePanel)
   
setwd("..")

############################################################
### code chunk number 5: Data subset and expansion 
############################################################

inputPrefix <- "4_6_removedSnpMissPostImp"
inputQCprefix <- "3_1_QCdata"
snpRefAlleleFile <- "3_4_snpImpRefAlleles.txt"
snpDiffAlleleFile <- "3_4_snpDiffAlleles.txt"
snpMissPosFile <- "3_3_snpMissPos.txt"
snpSameNameDifPosFile <- "3_2_snpSameNameDiffPos.txt" 

dir5 <- "./5-reductAndExpand/"
## imputed dataset
system(paste0("scp ./4-imputation/", inputPrefix, ".* ", dir5)) 
system(paste0("scp ./3-checkAlign/", inputQCprefix, ".* ", dir5))
system(paste0("scp ./3-checkAlign/", snpRefAlleleFile, " ", dir5))
system(paste0("scp ./3-checkAlign/", snpDiffAlleleFile, " ", dir5))
system(paste0("scp ./3-checkAlign/", snpMissPosFile, " ", dir5))
system(paste0("scp ./3-checkAlign/", snpSameNameDifPosFile, " ", dir5))

setwd(dir5)
reducedToSpecificfn <- "5_1_reducedToSpecific"
specificDiffAllelefn <- "5_2_specificDiffAllele"
specificMissPosfn <- "5_3_specificMissPos"
specificDiffPosfn <- "5_4_specificDiffPos" 

reductExpand(plink, referencePanel, inputPrefix, inputQCprefix, 
             snpRefAlleleFile, snpDiffAlleleFile, 
             snpMissPosFile, snpSameNameDifPosFile, 
             reducedToSpecificfn, specificDiffAllelefn, 
             specificMissPosfn, specificDiffPosfn)

setwd("..")

############################################################
### code chunk number 6: Final result
############################################################

dir6 <- "./6-finalResults/"
system(paste0("scp ./1-genoUpdate/1_01_metaData.txt ", dir6))
system(paste0("scp ./4-imputation/4_6_removedSnpMissPostImp.* ", dir6))  
system(paste0("scp ./5-reductAndExpand/5_4_specificDiffPos.* ", dir6))
setwd(dir6)
renamePlinkBFile(inputPrefix="4_6_removedSnpMissPostImp", 
                 outputPrefix="imputedSnpsDataset", action="move")
renamePlinkBFile(inputPrefix="5_4_specificDiffPos", 
                 outputPrefix="specificSnpsDataset", action="move")



## runtime
t4postImpute <- proc.time() - t4postImputeTmp
print(t4postImpute)
runTimeList$t4postImpute <- t4postImpute 

totallist <- lapply(runTimeList, function(time){time[[3]]})
print(totallist)   
runTimeList$totalMin <- sum(unlist(totallist))/60
runTimeList$totalSec <- sum(unlist(totallist))
print(runTimeList)



############################################################
### code chunk number 6: Extending pipeline
############################################################
 
############################################################

## Run the following code, only after you have installed genipe
## and the specified imputation reference files.

############################################################

# ## additional configuration files for Genipe
# mainRefGenipe <- "/data/noether/dataProcessResults/10_Common/"
# impRefDir <- paste0(mainRefGenipe, "imputeReference/1000G_Phase3_2014/")
# fastaFile <- paste0(mainRefGenipe, "imputeReference/hg19/hg19.fasta")

# imputeTool <- "impute2"
# ## Impute genotypes using Genipe
# chrs <- 21
# inputPrefix <- "dataChr21"
# thread4impute2 <- 20 ## tune by yourself
# thread4shapeit <- 30
# segmentSize <- 3000000
# # imputedByGenipe(chrs, impRefDir, inputPrefix, shapeit, impute, 
# #                plink, fastaFile, segmentSize, thread4impute2, thread4shapeit) 

# ## merge chunked genomic imputed results
# ## example
# chr <- 21 
# inputImpute2 <- "chr2.33000001_36000000.impute2"
# probability <- 0.9
# completionRate <- 0.98
# # info <- 0.6
# outputPrefix <- paste0("imputedChr", chr)
# # mergeByGenipe(inputImpute2, chr, probability, 
# #              completionRate, info, outputPrefix)
 
# ## extract imputed markers using Genipe 
# chr <- 3
# inputImpute2 <- paste0("chr", chr,".imputed.impute2")
# inputMAP <- paste0("chr", chr,".imputed.map")
# format <- "bed"
# prob <- 0.9
# outputPrefix <- paste0("imputedChr", chr)  
# # extractByGenipe(inputImpute2, inputMAP, outputPrefix, format, prob)
   
#
transbioZI/Gimpute documentation built on April 10, 2022, 4:20 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
transbioZI/Gimpute
Gimpute: An efficient genetic data processing and imputation pipeline

tests/runTests.R
In transbioZI/Gimpute: Gimpute: An efficient genetic data processing and imputation pipeline

R Package Documentation

Browse R Packages

We want your feedback!

transbioZI/Gimpute Gimpute: An efficient genetic data processing and imputation pipeline

tests/runTests.R In transbioZI/Gimpute: Gimpute: An efficient genetic data processing and imputation pipeline

R Package Documentation

Browse R Packages

We want your feedback!

transbioZI/Gimpute
Gimpute: An efficient genetic data processing and imputation pipeline

tests/runTests.R
In transbioZI/Gimpute: Gimpute: An efficient genetic data processing and imputation pipeline