R/genotypeQC.R

Defines functions genoQC removeOutlierByPCs plotPCA4plink removedSnpFemaleChrXhweControl removedSnpHWEauto removedSnpFemaleChrXmiss removedSnpMissDiff removedMendelErr removedParentIdsMiss removedInstFhet removedInstMiss removedSnpMiss setHeteroHaploMissing removedMaleHetX removedSnpHetX

Documented in genoQC plotPCA4plink removedInstFhet removedInstMiss removedMaleHetX removedMendelErr removedParentIdsMiss removedSnpFemaleChrXhweControl removedSnpFemaleChrXmiss removedSnpHetX removedSnpHWEauto removedSnpMiss removedSnpMissDiff removeOutlierByPCs setHeteroHaploMissing

#######################################################################
#
# Package Name: Gimpute
# Description:
#   Gimpute -- An efficient genetic data imputation pipeline
#
# Gimpute R package, An efficient genetic data imputation pipeline
# Copyright (C) 2018  Junfang Chen (junfang.chen@zi-mannheim.de)
# All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 

##########################################################################


#' Remove heterozygous SNPs in male chromosome X
#'
#' @description
#' Remove heterozygous SNPs in haploid male chromosome X only if chromosome X 
#' data exists.


#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param hhCutOff the cutoff for removing male haploid heterozygous SNPs 
#' on the chromosome X. The default is 0.005.
#' @param outputPrefix the prefix of the output PLINK binary files.
#' @param outputHetSNPfile the output pure text file that stores 
#' all heterozygous SNPs with their frequency (the number of males for the 
#' corresponding SNP), if any. Lines are sorted by descending number.
#' @param outputRetainSNPfile the output pure text file that stores 
#' retained heterozygous SNPs with their frequency (the number of males for 
#' the corresponding SNP), if any. Lines are sorted by descending number.

#' @return 1.) The output PLINK binary files. 2.) A pure text files (if any)
#' with two columns: SNPs and their corresponding frequency. 3.) After SNP 
#' removal, a pure text files (if any) with two columns: SNPs and their 
#' corresponding frequency.
#' @details  A haploid heterozygous is a male genotype that is heterozygous, 
#' which could be an error given the haploid nature of the male X chromosome.
#' In principle, one has to remove all heterozygous SNPs of chromosome X 
#' in males. 
#' However, too many SNPs might be removed in some data sets. 
#' Therefore a small percentage of such SNPs in the data set is allowed.

#' @export  
#' @author Junfang Chen  
#' @examples 
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' hhCutOff <- 0.005 ##  can be tuned
#' outputPrefix <- "2_01_removedSnpHetX" 
#' outputHetSNPfile <- "2_01_snpHHfreqAll.txt"
#' outputRetainSNPfile <- "2_01_snpHHfreqRetained.txt"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpHetX(plink, inputPrefix, hhCutOff, outputPrefix, 
#' ##                outputHetSNPfile, outputRetainSNPfile)


removedSnpHetX <- function(plink, inputPrefix, hhCutOff, outputPrefix, 
                           outputHetSNPfile, outputRetainSNPfile){

    bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)  
    chrCodes <- names(table(bim[,1]))
    chr23check <- length(grep(23, chrCodes))

    if (chr23check == 1) {  
        ## just to get .hh file and .fam file 
        system(paste0(plink, " --bfile ", inputPrefix, 
               " --chr 23 --filter-males --make-bed --out male23nonPAR"))

        if (is.na(file.size("male23nonPAR.hh"))) { 
            ## copy/rename all snp info updated plink files
            renamePlinkBFile(inputPrefix, outputPrefix, action="copy")  
        } else {      

            ## *.hh: A text file with one line per error (sorted primarily by 
            ## variant ID, 2nd by sample ID) with the following 3 fields:
            # Family ID  Within-family ID Variant ID
            hh <- read.table("male23nonPAR.hh", stringsAsFactors=FALSE)
            fam <- read.table("male23nonPAR.fam", stringsAsFactors=FALSE)

            hetSNPsFreq <- table(hh[,3])
            # hetSNPFreqFreq <- table(hetSNPs) 

            cut4removeHetSNP <- nrow(fam) * hhCutOff
            mostFakeSNPs <- hetSNPsFreq[which(hetSNPsFreq >= cut4removeHetSNP)] 
            mostFakeSNPs <- names(mostFakeSNPs) 
            write.table(mostFakeSNPs, file="mostFakeSNPs.txt", quote=FALSE, 
                        row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ") 
            ## remove these fake SNPs
            system(paste0(plink, " --bfile ", inputPrefix, 
                   " --exclude mostFakeSNPs.txt --make-bed --out ", 
                   outputPrefix))
            ## remove unwanted files
            system(paste0("rm mostFakeSNPs.txt"))

            ## generate hetSNPsFreq in .txt file 
            hetSNPinstNum <- as.data.frame(hetSNPsFreq, stringsAsFactors=FALSE)
            hetSNPinstNum <- hetSNPinstNum[order(hetSNPinstNum[,2], 
                                                 decreasing=TRUE),] 
            ## all heterozygous SNPs 
            write.table(hetSNPinstNum, file=outputHetSNPfile, quote=FALSE, 
                        row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ") 

            ## remaining heterozygous SNPs 
            hetSNPinstNumSub <- hetSNPinstNum[!is.element(hetSNPinstNum[,1], 
                                                          mostFakeSNPs), ]
            write.table(hetSNPinstNumSub, file=outputRetainSNPfile, 
                        quote=FALSE, row.names=FALSE, 
                        col.names=FALSE, eol="\r\n", sep=" ") 
        } 

        system(paste0("rm male23nonPAR.* ", inputPrefix,".*")) 

    } else { 
        ## copy/rename plink files
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 
        system(paste0("rm ", inputPrefix,".*"))
    }
}

 
##########################################   
##########################################
#' Remove male subjects with haploid heterozygous SNPs 
#'
#' @description
#' Determine the frequency of male subjects that have heterozygous SNPs on 
#' chromosome X and a reasonable cutoff to remove those affect males, if 
#' chromosome X data exists.


#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param hhSubjCutOff the cutoff for removing male subjects with haploid 
#' heterozygous SNPs on the chromosome X. The default is 15.
#' @param outputPrefix the prefix of the output PLINK binary files.
#' @param outputSubjHetFile the output pure text file that stores male subjects 
#' that have heterozygous SNPs with their frequency (if any), i.e. the number 
#' of .hh SNPs in this male. Lines are sorted by descending number.
#' @param outputRetainSubjectFile the output pure text file that stores
#' male subjects that have heterozygous SNPs with their frequency after 
#' subject removal (if any). Lines are sorted by descending number.
#' @param outputHetSNPfile the output pure text file that stores all 
#' heterozygous SNPs with their frequency (the number of males for this SNP)
#' , if any. Lines are sorted by descending number.

#' @return 1.) The output PLINK binary files. 2.) A pure text file with 
#' two columns: heterozygous male subjects and their corresponding heterozygous
#' SNPs. 3.) After subject removal, a pure text file consisting of two columns:
#' heterozygous male subjects and their corresponding heterozygous SNPs. 
#' A pure text file with two columns: all heterozygous SNPs and their frequency.

#' @details  A haploid heterozygous is a male genotype that is heterozygous, 
#' which could be an error given the haploid nature of the male X chromosome.
#' In principle, one has to remove all males that have heterozygous SNPs on the 
#' chromosome X. However, too many males might be removed in some data sets. 
#' Therefore a small percentage of such males in the data set is allowed.
 
#' @export 
#' @author Junfang Chen 
#' @examples 
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' hhSubjCutOff <- 15 ##  can be tuned
#' outputPrefix <- "2_02_removedInstHetX" 
#' outputSubjHetFile <- "2_02_instHetXfreqAll.txt" 
#' outputRetainSubjectFile <- "2_02_instHetXfreqRetained.txt"  
#' outputHetSNPfile <- "2_02_snpHHfreqAll.txt"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedMaleHetX(plink, inputPrefix, hhSubjCutOff,
#' ##                 outputPrefix, outputSubjHetFile, 
#' ##                 outputRetainSubjectFile, outputHetSNPfile)


removedMaleHetX <- function(plink, inputPrefix, hhSubjCutOff=15, outputPrefix, 
                            outputSubjHetFile, outputRetainSubjectFile, 
                            outputHetSNPfile){        

    bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE) 
    chrCodes <- names(table(bim[,1]))
    chr23check <- length(grep(23, chrCodes))
    if (chr23check == 1) { 
        ## just to get .hh file and .fam file 
        system(paste0(plink, " --bfile ", inputPrefix, 
               " --chr 23 --filter-males --make-bed --out male23nonPAR"))
        if ( is.na(file.size("male23nonPAR.hh")) ){  
            ## copy/rename all snp info updated plink files
            renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 
            system(paste0("touch ", outputSubjHetFile) )
            system(paste0("touch ", outputRetainSubjectFile) )

        } else {     

            ## .hh: A text file with one line per error (sorted primarily by  
            ## variant ID,secondarily by sample ID) with the following 3 fields:
            ## Family ID  Within-family ID Variant ID
            hh <- read.table("male23nonPAR.hh", stringsAsFactors=FALSE)
            fam <- read.table("male23nonPAR.fam", stringsAsFactors=FALSE)
                            
            hetInstFreq <- table(hh[,2])   
            mostFakeInst <- hetInstFreq[which(hetInstFreq >= hhSubjCutOff)]
            whFakeID <- is.element(fam[,2], names(mostFakeInst))  
            mostFakeInstID <- fam[whFakeID, c("V1", "V2")]

            write.table(mostFakeInstID, file="mostFakeInst4plink.txt", 
                        quote=FALSE, row.names=FALSE, 
                        col.names=FALSE, eol="\r\n", sep=" ") 
            ## remove these fake SNPs
            system(paste0(plink, " --bfile ", inputPrefix, 
                   " --remove mostFakeInst4plink.txt --make-bed --out ", 
                   outputPrefix))
            system(paste0("rm mostFakeInst4plink.txt"))

            ## generate hetSNPsFreq in .txt file 
            InstHetSNP <- as.data.frame(hetInstFreq, stringsAsFactors=FALSE) 
            InstHetSNP <- InstHetSNP[order(InstHetSNP[,2], decreasing=TRUE),] 
            write.table(InstHetSNP, file=outputSubjHetFile, quote=FALSE, 
                        row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ") 

            ## remaining males with heterozygous SNPs 
            InstHetSNPsub <- InstHetSNP[!is.element(InstHetSNP[,1], 
                                                    names(mostFakeInst)), ] 
            write.table(InstHetSNPsub, file=outputRetainSubjectFile, 
                        quote=FALSE, row.names=FALSE, 
                        col.names=FALSE, eol="\r\n", sep=" ") 
          } 

        system(paste0("rm male23nonPAR.*")) 
        ## output a text file that stores all heterozygous SNPs 
        ## with their frequency after the above steps; 
        system(paste0(plink, " --bfile ", outputPrefix, 
               " --chr 23 --filter-males --make-bed --out male23nonPAR"))

        if (is.na(file.size("male23nonPAR.hh"))) {  
            system(paste0("touch ", outputHetSNPfile)) ## with empty output      
        } else {      
            hh <- read.table("male23nonPAR.hh", stringsAsFactors=FALSE)  
            hetSNPsFreq <- table(hh[,3])  
            hetSNPinstNum <- as.data.frame(hetSNPsFreq, stringsAsFactors=FALSE)
            hetSNPinstNum <- hetSNPinstNum[order(hetSNPinstNum[,2], 
                                                 decreasing=TRUE),] 
            ## all heterozygous SNPs 
            write.table(hetSNPinstNum, file=outputHetSNPfile, 
                        quote=FALSE, row.names=FALSE, 
                        col.names=FALSE, eol="\r\n", sep=" ")   
        }
        system(paste0("rm male23nonPAR.*"))    
     } else { 
        ## copy/rename plink files
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 
    }
}





##########################################   
##########################################
#' Set haploid heterozygous SNPs as missing 
#'
#' @description
#' Set all heterozygous alleles of chromosome X SNPs in male as missing.

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.
#' @return The output PLINK binary files after setting haploid heterozygous 
#' SNPs as missing.

#' @export 

#' @author Junfang Chen 
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_03_setHeteroHaploMissing" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## setHeteroHaploMissing(plink, inputPrefix, outputPrefix)

setHeteroHaploMissing <- function(plink, inputPrefix, outputPrefix){
     
    system(paste0(plink, " --bfile ", inputPrefix, 
           " --set-hh-missing --make-bed --out ", outputPrefix))  

}




##########################################   
##########################################
#' Remove SNPs with missing values
#'
#' @description
#' Remove SNPs with missingness of greater than a certain threshold.

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param snpMissCutOff the cutoff of the missingness for removing SNPs.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files after removing SNPs with pre-defined 
#' missing values.

#' @export 
#' @author Junfang Chen 
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' snpMissCutOff <- 0.05
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_04_removedSnpMissPre" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpMiss(plink, snpMissCutOff, inputPrefix, outputPrefix)
 
removedSnpMiss <- function(plink, snpMissCutOff, inputPrefix, outputPrefix){
 
    system(paste0(plink, " --bfile ", inputPrefix, " --geno ", snpMissCutOff, 
           " --make-bed --out ", outputPrefix))  
}


##########################################   
########################################## 
#' Remove subjects with missing values
#'
#' @description
#' Remove Subjects or instances with missingness of greater than a certain 
#' threshold.

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param sampleMissCutOff the cutoff of the missingness for removing 
#' subjects/instances.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files after removing subjects with  
#' a pre-defined removal cutoff.

#' @export 
#' @author Junfang Chen 
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' sampleMissCutOff <- 0.02
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_05_removedInstMiss" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedInstMiss(plink, sampleMissCutOff, inputPrefix, outputPrefix)
 

removedInstMiss <- function(plink, sampleMissCutOff, inputPrefix, outputPrefix){
 
    system(paste0(plink, " --bfile ", inputPrefix, " --mind ", sampleMissCutOff, 
           " --make-bed --out ", outputPrefix)) 
    # system(paste0("rm ", outputPrefix, ".irem"))

}



##########################################   
##########################################    
#' Remove subjects with abnormal autosomal heterozygosity deviation
#'
#' @description
#' Remove subjects with great autosomal heterozygosity deviation. 

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param Fhet the cutoff of the autosomal heterozygosity deviation.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files after removing subjects with great 
#' autosomal heterozygosity deviation.
#' @details If the cutoff of the autosomal heterozygosity deviation is set to 
#' be greater than 0.2, i.e. |Fhet| >= 0.2, then this analysis will 
#' automatically skip haploid markers (male X and Y chromosome markers).

#' @export 
#' @author Junfang Chen 
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' Fhet <- 0.2
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_06_removedInstFhet" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedInstFhet(plink, Fhet, inputPrefix, outputPrefix)
 

removedInstFhet <- function(plink, Fhet, inputPrefix, outputPrefix){ 

    system(paste0(plink, " --bfile ", inputPrefix, 
           " --het --out ", outputPrefix))
    ##  F inbreeding coefficient estimate
    autoHet <- read.table(file=paste0(outputPrefix, ".het"), header=TRUE)  
    fhet <- autoHet[, "F"]
    qc_data_fhet <- autoHet[which(abs(fhet) >= Fhet), c(1, 2)]  
    ## the individual IDs to be removed  
    write.table(qc_data_fhet, file=paste0(outputPrefix, ".txt"), quote=FALSE, 
                row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
  
    ## To remove certain individuals 
    system(paste0(plink, " --bfile ", inputPrefix, " --remove ", 
           paste0(outputPrefix, ".txt"), " --make-bed --out ", outputPrefix))
    system(paste0("rm ", outputPrefix, ".het"))
    system(paste0("rm ", outputPrefix, ".txt"))
 
}



##########################################   
##########################################
#' Reset paternal and maternal codes  
#'
#' @description
#' Reset paternal and maternal codes of non-founders if parents not present. 
#' Replace the paternal ID and maternal ID of subjects (childs) by the
#' value zero if the paternal ID and the maternal ID do not belong to any
#' subject (parent) with the same family ID as the child. 

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files.
#' @details Do make sure that all your family relationships are correct 
#' in your input data before applying this function. By default, 
#' if parental IDs are provided for a sample, 
#' they are not treated as a founder even if neither parent is 
#' in the dataset.  With no modifiers, --make-founders clears 
#' both parental IDs whenever at least one parent is not in the dataset, 
#' and the affected samples are now considered as founders. 


#' @export  
#' @author Junfang Chen 
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_07_removedParentIdsMiss" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedParentIdsMiss(plink, inputPrefix, outputPrefix)

 
removedParentIdsMiss <- function(plink, inputPrefix, outputPrefix){ 

    # Remove the parent IDs which do not belong to subjects
    system(paste0(plink, " --bfile ", inputPrefix, 
           " --make-founders require-2-missing --make-bed --out ", 
           outputPrefix)) 
}



##########################################   
##########################################
#' Check Mendel errors for family-based data
#'
#' @description
#' Exclude subjects and/or genetic variants (SNPs) based on Mendel errors in 
#' the family data (trio/duo). 

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param cutoffSubject the cutoff determines that families (subjects) with 
#' more than the predefined cutoff of Mendel errors by considering all SNPs 
#' will be removed. The default is 0.05.
#' @param cutoffSNP the cutoff indicates that SNPs with more than the 
#' predefined cutoff of Mendel error rate will be excluded 
#' (i.e. based on the number of trios/duos). The default is 0.1.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files.
#' @details Do make sure that all your family relationships are correct 
#' in your input data before applying this function. The input PLINK data 
#' should have complete sex and group/outcome information. By default, trios 
#' and duos are both considered. If no family information is given at all 
#' (only founders), then this function will not remove any subjects or variants 
#' but give the warning showing that no duos or trios are present. 

#' @export  
#' @author Junfang Chen 
#' @examples
#' ## In the current working directory
#' bedFile <- system.file("extdata", "dataWithFamChr21.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "dataWithFamChr21.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "dataWithFamChr21.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "dataWithFamChr21" 
#' outputPrefix <- "removedMendelErr" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedMendelErr(plink, inputPrefix, 
#' ##                  cutoffSubject, cutoffSNP, outputPrefix)

  
removedMendelErr <- function(plink, inputPrefix, cutoffSubject=0.05, 
                             cutoffSNP=0.1, outputPrefix){ 

    system(paste0(plink, " --bfile ", inputPrefix, 
           " --me ", cutoffSubject, " ", cutoffSNP, 
           " --mendel-duos --make-bed --out ", outputPrefix)) 
}



##########################################   
##########################################  
#' Remove SNPs with difference in SNP missingness between cases and controls. 
#'
#' @description
#' Remove SNPs with difference in SNP missingness between cases and controls. 
#' To test for differential call rates between cases and controls for each SNP
 
#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param snpMissDifCutOff the cutoff of the difference in missingness between 
#' cases and controls. 
#' @param outputPrefix the prefix of the output PLINK binary files.
#' @param groupLabel a string value indicating the outcome label: "control",  
#' or, "case" or "caseControl" for both existing groups. For more details, see 
#' \code{\link{getGroupLabel}}.
 
#' @return The output PLINK binary files.
#' @details Only if both case-control groups exist in the input genotype data, 
#' differential SNPs are removed. 

#' @export 
#' @author Junfang Chen 
#' @seealso \code{\link{getGroupLabel}}.
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' snpMissDifCutOff <- 0.02
#' outputPrefix <- "2_09_removedSnpMissDiff" 
#' groupLabel <- "control"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpMissDiff(plink, inputPrefix, snpMissDifCutOff, 
#' ##                    outputPrefix, groupLabel)

removedSnpMissDiff <- function(plink, inputPrefix, snpMissDifCutOff, 
                               outputPrefix, groupLabel){


    if (groupLabel != "caseControl"){   
        ## this is only for the control data set
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 

    } else if (groupLabel == "caseControl") { 
        outputPrefix.tmp <- paste0(outputPrefix, "tmp")
        system (paste0(plink, " --bfile ", inputPrefix, 
                " --test-missing --out ", outputPrefix.tmp) ) 
        ## Write case/control missingness test to [ *.missing ]  
        ## compute differential call rates 
        ## between cases and controls for each SNP 
        ccmissing <- read.table(file=paste0(outputPrefix.tmp, ".missing"), 
                                header=TRUE, sep="")  
        fmiss <- abs(ccmissing[, "F_MISS_A"] - ccmissing[, "F_MISS_U"])
        whmiss <- which(fmiss >= snpMissDifCutOff) 
        SNPmissCC <- ccmissing[whmiss, "SNP"]
        SNPdifCallrate <- paste0(outputPrefix, ".txt")
        write.table(SNPmissCC, file=SNPdifCallrate, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")

        ## exclude SNPs 
        system(paste0(plink, " --bfile ", inputPrefix, " --exclude ", 
               SNPdifCallrate, " --make-bed --out ", outputPrefix))  
        system(paste0("rm ", outputPrefix.tmp, ".*"))
        system(paste0("rm ", SNPdifCallrate))
    }
}


##########################################   
##########################################    
#' remove chromosome X SNPs in females
#'
#' @description
#' Remove SNPs on the chromosome X with a pre-defined cutoff for 
#' missingness in females.

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param femaleChrXmissCutoff the cutoff of the missingness 
#' in female chromosome X SNPs.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files.

#' @export 
#' @author Junfang Chen 
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' femaleChrXmissCutoff <- 0.05
#' inputPrefix <- "genoUpdatedData"  
#' outputPrefix <- "2_10_removedSnpFemaleChrXmiss" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpFemaleChrXmiss(plink, femaleChrXmissCutoff, 
#' ##                          inputPrefix, outputPrefix)


removedSnpFemaleChrXmiss <- function(plink, femaleChrXmissCutoff, 
                                      inputPrefix, outputPrefix){ 

    bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)  
    chrCodes <- names(table(bim[,1]))
    chr23check <- length(grep(23, chrCodes))

    if (chr23check == 1) { 
        ## additional QC (female-chrX SNPs, missingness ok?)  
        outputPrefix.tmp1 <- paste0(outputPrefix, "tmp1")
        outputPrefix.tmp2 <- paste0(outputPrefix, "tmp2")
        system(paste0(plink, " --bfile ", inputPrefix, 
               " --filter-females --chr 23 --make-bed --out ", 
               outputPrefix.tmp1))
        system(paste0(plink, " --bfile ", inputPrefix, 
               " --filter-females --chr 23 --geno ", femaleChrXmissCutoff, 
               " --make-bed --out ", outputPrefix.tmp2) )

         ## check if equal  
        femaleChrXorig <- read.table(paste0(outputPrefix.tmp1, ".bim"), 
                                     stringsAsFactors=FALSE) 
        femaleChrXMiss <- read.table(paste0(outputPrefix.tmp2, ".bim"), 
                                     stringsAsFactors=FALSE)  
        snps2removed <- setdiff(femaleChrXorig[,2], femaleChrXMiss[,2]) 
        snps2removedfile <- paste0(outputPrefix, ".txt")
        write.table(snps2removed, file=snps2removedfile, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ") 

        system(paste0(plink, " --bfile ", inputPrefix, " --exclude ", 
               snps2removedfile, " --make-bed --out ", outputPrefix))
        system(paste0("rm ", outputPrefix.tmp1, ".* ", outputPrefix.tmp2, ".*")) 
        system(paste0("rm ", snps2removedfile)) 
    } else { 
        ## copy/rename plink files
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 
    }
}




##########################################   
##########################################
#' Hardy Weinberg Equilibrium test for autosomal SNPs
#'
#' @description
#' Remove autosomal SNPs deviating from Hardy Weinberg Equilibrium (HWE).
 
#' @param groupLabel a string value indicating the outcome label: "control",  
#' or, "case" or "caseControl" for both existing groups. For more details, see 
#' \code{\link{getGroupLabel}}.
#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param pval the p-value cutoff for controlling HWE test in either control or 
#' case subjects. Only autosomal SNPs are considered. The default value is
#' 0.000001.
#' @param outputPvalFile the output pure text file that stores autosomal SNPs  
#' and their sorted HWE p-values.
#' @param outputSNPfile the output pure text file that stores the removed SNPs, 
#' one per line.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files after HWE test on the autosome.
#' @details 

#' @export 
#' @author Junfang Chen 
#' @seealso \code{\link{removedSnpFemaleChrXhweControl}}, 
#' \code{\link{getGroupLabel}}.
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' groupLabel <- "control"
#' inputPrefix <- "genoUpdatedData" ## Specify the input PLINK file prefix 
#' outputPvalFile <- "2_11_snpHwePvalAuto.txt"
#' outputSNPfile <- "2_11_snpRemovedHweAuto.txt" 
#' outputPrefix <- "2_11_removedSnpHweAuto" 
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpHWEauto(groupLabel, plink, inputPrefix, pval=0.000001,
#' ##                   outputPvalFile, outputSNPfile, outputPrefix)

removedSnpHWEauto <- function(groupLabel, plink, inputPrefix, 
                              pval=0.000001, outputPvalFile, 
                              outputSNPfile, outputPrefix){ 

    if (groupLabel == "control"){ 
        groupStatus <- "filter-controls"
        affection <- "UNAFF" 
    } else if (groupLabel == "case"){
        groupStatus <- "filter-cases"
        affection <- "AFF" 
    } else {
        print("ERROR: during HWE test on autosome! Wrong label warning!")
    }

    outputPrefix.tmp <- paste0(outputPrefix, "tmp")
    system(paste0(plink, " --bfile ", inputPrefix, " --",
           groupStatus, " --hardy --autosome --allow-no-sex --make-bed --out ", 
           outputPrefix.tmp))  

    ## read HWE p values 
    hweCheck <- read.table(file=paste0(outputPrefix.tmp, ".hwe"), 
                           header=TRUE, stringsAsFactors=FALSE) 

    hwe <- hweCheck[which(hweCheck$TEST == affection), ]  
    snpHweValAuto <- subset(hwe, select=c(SNP, P))
    snpHweValAuto <- snpHweValAuto[order(snpHweValAuto[,"P"]),] 
    write.table(snpHweValAuto, file=outputPvalFile, quote=FALSE, 
                row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
    removedSNPhwe <- hwe[which(hwe$P <= pval), "SNP"]
    write.table(removedSNPhwe, file=outputSNPfile, quote=FALSE, 
                row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")

    ## exclude SNPs 
    system(paste0(plink, " --bfile ", inputPrefix, " --exclude ", 
           outputSNPfile, " --make-bed --out ", outputPrefix)) 
    system(paste0("rm ", outputPrefix.tmp, ".*"))

}

##########################################   
##########################################  
#' Hardy Weinberg Equilibrium test for chromosome X SNPs in female controls. 
#'
#' @description
#' Hardy Weinberg Equilibrium test for SNPs on the chromosome X in 
#' female controls.  

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param pval the p-value cutoff for controlling HWE test in female control 
#' subjects. Only chromosome X SNPs are considered. 
#' The default value is 0.000001.
#' @param outputPvalFile the output pure text file that stores chromosome X 
#' SNPs and their sorted HWE p-values.
#' @param outputSNPfile the output pure text file that stores the removed SNPs, 
#' one per line.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return The output PLINK binary files after HWE test on chromosomal X 
#' in female controls.

#' @export 
#' @author Junfang Chen 
#' @seealso \code{\link{removedSnpHWEauto}}
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))   
#' inputPrefix <- "genoUpdatedData"  
#' outputPvalFile <- "2_12_snpHwePvalfemaleXct.txt" 
#' outputSNPfile <- "2_12_snpRemovedHweFemaleXct.txt" 
#' outputPrefix <- "2_12_removedSnpHweFemaleX"
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## removedSnpFemaleChrXhweControl(plink, inputPrefix, pval=0.000001,
#' ##                                outputPvalFile, outputSNPfile, 
#' ##                                outputPrefix)

removedSnpFemaleChrXhweControl <- function(plink, inputPrefix, pval=0.000001, 
                                           outputPvalFile, outputSNPfile, 
                                           outputPrefix){ 

    bim <- read.table(paste0(inputPrefix, ".bim"), stringsAsFactors=FALSE)  
    chrCodes <- names(table(bim[,1]))
    chr23check <- length(grep(23, chrCodes))

    if (chr23check == 1) { 
        outputPrefix.tmp <- paste0(outputPrefix, "tmp") 
        system(paste0(plink, " --bfile ", inputPrefix, 
               " --filter-females --filter-controls --chr 23 --hardy ", 
               " --allow-no-sex --make-bed --out ", outputPrefix.tmp) )
        ## read p values
        hweCheck <- read.table(file=paste0(outputPrefix.tmp, ".hwe"), 
                               header=TRUE, stringsAsFactors=FALSE) 
        ## for controls 
        hweControl <- hweCheck[which(hweCheck$TEST == "UNAFF"), ] # 
        snpHweValChrXCt <- subset(hweControl, select=c(SNP, P))
        snpHweValChrXCt <- snpHweValChrXCt[order(snpHweValChrXCt[,"P"]),]
        write.table(snpHweValChrXCt, file=outputPvalFile, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
        ## remove bad SNPs
        removedSNPhweControl <- hweControl[which(hweControl$P <= pval), "SNP"]
        write.table(removedSNPhweControl, file=outputSNPfile, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")

        system(paste0(plink, " --bfile ", inputPrefix, " --exclude ", 
                outputSNPfile, " --make-bed --out ", outputPrefix))
        system(paste0("rm ", outputPrefix.tmp, ".*")) 

    } else {
        ## copy/rename plink files
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy") 
    }
}

##########################################   
##########################################
#' Population outlier detection
#'
#' @description
#' Principle component analysis (PCA) on the genotype data is performed 
#' to detect population outliers, and the first two PCs 
#' are plotted for the visualization. 

#' @param gcta an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param nThread the number of threads used for computation. 
#' The default is 20.
#' @param outputPC4subjFile the pure text file that stores all the subject IDs 
#' and their corresponding eigenvalues of the first two principle components.
#' @param outputPCplotFile the plot file for visualizing the first two 
#' principle components of all investigated subjects. 

#' @return The output pure text file and plot file for storing first two 
#' principle components of study subjects.
#' @details Before population outlier detection, it's better to perform QC 
#' on the genotype data. 
#' Only autosomal genotypes are used for principle component analysis. 

#' @export 
#' @import lattice  

#' @author Junfang Chen 
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "QCdata.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "QCdata.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "QCdata.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "QCdata" 
#' outputPC4subjFile <- "2_13_eigenvalAfterQC.txt"
#' outputPCplotFile <- "2_13_eigenvalAfterQC.png" ## png format
#' ## Not run: Requires an executable program GCTA, e.g.
#' ## gcta <- "/home/tools/gcta64"
#' ## plotPCA4plink(gcta, inputPrefix, nThread=20, 
#' ##               outputPC4subjFile, outputPrefix)

plotPCA4plink <- function(gcta, inputPrefix, nThread=20, 
                          outputPC4subjFile, outputPCplotFile){ 

    autosomefn <- paste0(inputPrefix, "Autosome")
    system(paste0(gcta, " --bfile ", inputPrefix, 
           " --make-grm --autosome --out ", 
           autosomefn, " --thread-num ", nThread))
    system(paste0(gcta, " --grm ", autosomefn, " --pca 20 --out ", 
           autosomefn, " --thread-num ", nThread))

    eigen <- read.table(file=paste0(autosomefn,".eigenvec"), 
                        stringsAsFactors=FALSE)
    pcs <- eigen[,seq_len(4)] ## first two PCs in the 3rd and 4th column.
    write.table(pcs, outputPC4subjFile, quote=FALSE, row.names=FALSE, 
                col.names=FALSE, eol="\r\n", sep=" ")
    pcWithGroup <- cbind(pcs, stringsAsFactors=FALSE)

    png(outputPCplotFile, width=8, height=6, units="in", res=800)
    print(xyplot(pcWithGroup[,4] ~ pcWithGroup[,3], data=pcWithGroup, 
           auto.key=list(space="right"),  
           jitter.x=TRUE, jitter.y=TRUE, xlab="PC1", ylab="PC2"))
    dev.off()
    ## remove unwanted files
    system(paste0("rm ", autosomefn, ".*"))
}

######################################################
######################################################
#' Remove population outliers
#'
#' @description
#' Remove population outliers by using principle component analysis.

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param gcta an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param nThread the number of threads used for computation. 
#' The default is 20.
#' @param cutoff the cutoff that distinguishes the outliers from ordinary  
#' population using PCA. If it is null, then there are no outliers or 
#' outliers are not required to be removed. The default is NULL. 
#' @param cutoffSign the cutoff sign: 'greater' or 'smaller' that determines 
#' if the outliers should be greater or smaller than the cutoff value.
#' @param inputPC4subjFile the pure text file that stores all the subject IDs 
#' and their corresponding eigenvalues of the first two principle components.
#' @param outputPC4outlierFile the pure text file that stores the outlier IDs 
#' and their corresponding eigenvalues of the first two principle components.
#' @param outputPCplotFile the plot file for visualizing the first two 
#' principle components of all subjects without population outliers.
#' @param outputPrefix the prefix of the output PLINK binary files.

#' @return 1.) The output PLINK binary files after outlier removal. 
#' 2.) The output pure text file (if any) for storing removed outlier IDs 
#' and their corresponding PCs. 3.) The plot file (if any) for visualizing 
#' the first two principle components after outlier removal.

#' @details This function is used for removing population outliers. 
#' If the outliers are necessary to be removed, then one uses the eigenvalues 
#' from the first principle component as a criterion to find out the outliers 
#' by assigning an appropriate cutoff. 


#' @export 
#' @import lattice  
#' @author Junfang Chen 
#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "QCdata.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "QCdata.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "QCdata.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "QCdata"  
#' cutoff <- NULL ## no outlier to be removed
#' cutoffSign <- "greater" ## not used if cutoff == NULL
#' inputPC4subjFile <- "2_13_eigenvalAfterQC.txt"
#' outputPC4outlierFile <- "2_13_eigenval4outliers.txt"
#' outputPCplotFile <- "2_13_removedOutliers.png"
#' outputPrefix <- "2_13_removedOutliers" 
#' ## Not run: Requires an executable program PLINK and GCTA, e.g.
#' ## plink <- "/home/tools/plink"
#' ## gcta <- "/home/tools/gcta64"
#' ## removeOutlierByPCs(plink, gcta, inputPrefix, nThread=20, 
#' ##                    cutoff, cutoffSign, inputPC4subjFile, 
#' ##                    outputPC4outlierFile, outputPCplotFile, outputPrefix)

removeOutlierByPCs <- function(plink, gcta, inputPrefix, nThread=20, cutoff=NULL, 
                               cutoffSign, inputPC4subjFile, 
                               outputPC4outlierFile, 
                               outputPCplotFile, outputPrefix){

    ## if no outliers or no need to remove PC outliers. 
    if (is.null(cutoff) == TRUE) { 
        ## copy/rename all snp info updated plink files
        renamePlinkBFile(inputPrefix, outputPrefix, action="copy")
    } else {  
        subjID_PCs <- read.table(file=inputPC4subjFile, stringsAsFactors=FALSE)   
        if (length(cutoff) > 1) { 
            ## if the outliers should be removed on both side of the cluster
            ## detected by PC1
            outliersPC1v1 <- subjID_PCs[which(subjID_PCs[,3] <= cutoff[1]), ] 
            outliersPC1v2 <- subjID_PCs[which(subjID_PCs[,3] >= cutoff[2]), ]  
            outlierID <- rbind(outliersPC1v1, outliersPC1v2)
        } else { 
            if (cutoffSign == "smaller"){ 
                ## detected by PC1
                outlierID <- subjID_PCs[which(subjID_PCs[,3] <= cutoff), ] 
            } else if (cutoffSign == "greater"){
                ## detected by PC1
                outlierID <- subjID_PCs[which(subjID_PCs[,3] >= cutoff), ]
            }
        }       
        ## sorted by first PC.
        outlierIDSorted <- outlierID[order(outlierID[,3]), ] 
        write.table(outlierIDSorted, file=outputPC4outlierFile, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
        subjID4outlierTmp  <- outlierIDSorted[,c("V1", "V2")]
        subjID4outlierTmpFile <- "subjID4outlierTmp.txt"
        write.table(subjID4outlierTmp, file=subjID4outlierTmpFile, quote=FALSE, 
                    row.names=FALSE, col.names=FALSE, eol="\r\n", sep=" ")
        system(paste0(plink, " --bfile ", inputPrefix, " --remove ", 
               subjID4outlierTmpFile, " --make-bed --out ", outputPrefix))
        system(paste0("rm ", subjID4outlierTmpFile)) 
        ## Plot first two PCs again; PCs for the retained subjects 
        outputPC4subjFiletmp <- "outputPC4subjFile.txt" 
        plotPCA4plink(gcta, inputPrefix=outputPrefix, nThread, 
                      outputPC4subjFiletmp, outputPCplotFile)
        system(paste0("rm ", outputPC4subjFiletmp))
    } 
}



######################################################
######################################################
#' Quality control for genotype data
#'
#' @description
#' Perform quality control on the genotype data. 

#' @param plink an executable program in either the current working directory 
#' or somewhere in the command path.
#' @param inputPrefix the prefix of the input PLINK binary files.
#' @param snpMissCutOffpre the cutoff of the missingness for removing SNPs 
#' before subject removal. The default is 0.05.
#' @param sampleMissCutOff the cutoff of the missingness for removing 
#' subjects/instances. The default is 0.02.
#' @param Fhet the cutoff of the autosomal heterozygosity deviation. 
#' The default is 0.2.
#' @param cutoffSubject the cutoff determines that families (subjects) with 
#' more than the predefined cutoff of Mendel errors by considering all SNPs 
#' will be removed. The default is 0.05.
#' @param cutoffSNP the cutoff indicates that SNPs with more than the 
#' predefined cutoff of Mendel error rate will be excluded 
#' (i.e. based on the number of trios/duos). The default is 0.1.
#' @param snpMissCutOffpost the cutoff of the missingness for removing SNPs 
#' after subject removal. The default is 0.02. 
#' @param snpMissDifCutOff the cutoff of the difference in missingness between 
#' cases and controls. The default is 0.02.
#' @param femaleChrXmissCutoff the cutoff of the missingness in female 
#' chromosome X SNPs. The default is 0.05.
#' @param pval4autoCtl the p-value cutoff for controlling HWE test in either 
#' control or case subjects. Only autosomal SNPs are considered. 
#' The default is 0.000001
#' @param pval4femaleXctl the p-value cutoff for controlling HWE test in 
#' female control subjects. Only chromosome X SNPs are considered. 
#' The default is 0.000001
#' @param outputPrefix the prefix of the output PLINK binary files after QC.
#' @param keepInterFile a logical value indicating if the intermediate 
#' processed files should be kept or not. The default is TRUE.

#' @return The output PLINK binary files after QC.
#' @details The original PLINK files are implicitly processed by the following 
#' default steps: 
#' 1.) Set all heterozygous alleles of SNPs on male chrX as missing;
#' 2.) SNP missingness < 0.05 (before sample removal);
#' 3.) Subject missingness < 0.02;  
#' 4.) Remove subjects with |Fhet| >= 0.2;
#' 5.) Reset paternal and maternal codes;
#' 6.) SNP missingness < 0.02 (after sample removal);
#' 7.) Remove SNPs with difference >= 0.02 of SNP missingness 
#' between cases and controls;
#' 8.) Remove subjects or SNPs with Mendel errors for family based data.
#' 9.) Remove chrX SNPs with missingness >= 0.05 in females.
#' (Optional, if no chrX data);
#' 10.) Remove autosomal SNPs with HWE p < 10-6 in controls;
#' 11.) Remove chrX SNPs with HWE p < 10-6 in female controls. 
#' (Optional, if no chrX data). 

#' @export  
#' @author Junfang Chen 
#' @references Schizophrenia Working Group of the Psychiatric Genomics, C. 
#' (2014). Biological insights from 108 schizophrenia-associated genetic loci. 
#' Nature 511(7510): 421-427. 

#' @examples  
#' ## In the current working directory
#' bedFile <- system.file("extdata", "genoUpdatedData.bed", package="Gimpute")
#' bimFile <- system.file("extdata", "genoUpdatedData.bim", package="Gimpute") 
#' famFile <- system.file("extdata", "genoUpdatedData.fam", package="Gimpute")
#' system(paste0("scp ", bedFile, bimFile, famFile, " ."))  
#' inputPrefix <- "genoUpdatedData" 
#' outputPrefix <- "2_13_removedSnpHweFemaleX"  
#' ## Not run: Requires an executable program PLINK, e.g.
#' ## plink <- "/home/tools/plink"
#' ## genoQC(plink, inputPrefix, 
#' ##        snpMissCutOffpre=0.05, 
#' ##        sampleMissCutOff=0.02, 
#' ##        Fhet=0.2, cutoffSubject, cutoffSNP,
#' ##        snpMissCutOffpost=0.02, 
#' ##        snpMissDifCutOff=0.02,
#' ##        femaleChrXmissCutoff=0.05, 
#' ##        pval4autoCtl=0.000001, 
#' ##        pval4femaleXctl=0.000001, 
#' ##        outputPrefix, keepInterFile=TRUE)


genoQC <- function(plink, inputPrefix, snpMissCutOffpre=0.05, 
                   sampleMissCutOff=0.02, Fhet=0.2, 
                   cutoffSubject, cutoffSNP,
                   snpMissCutOffpost=0.02, snpMissDifCutOff=0.02,
                   femaleChrXmissCutoff=0.05, pval4autoCtl=0.000001, 
                   pval4femaleXctl=0.000001, 
                   outputPrefix, keepInterFile=TRUE) {

    ## check case control status
    groupLabel <- getGroupLabel(inputFAMfile=paste0(inputPrefix, ".fam"))
    ## step 3 
    # 3. Set all heterozygous alleles of SNPs of the chr 23 for males
    outputPrefix3 <- "2_03_setHeteroHaploMissing" 
    setHeteroHaploMissing(plink, inputPrefix, outputPrefix=outputPrefix3) 
    ## step 4  SNP missingness < 0.05 (before sample removal);  
    outputPrefix4 <- "2_04_removedSnpMissPre" 
    removedSnpMiss(plink, snpMissCutOff=snpMissCutOffpre, 
                   inputPrefix=outputPrefix3, outputPrefix=outputPrefix4)        
    ## step 5 
    # subject missingness < 0.02;  
    outputPrefix5 <- "2_05_removedInstMiss" 
    removedInstMiss(plink, sampleMissCutOff,  
                    inputPrefix=outputPrefix4, outputPrefix=outputPrefix5)
    ## step 6  
    outputPrefix6 <- "2_06_removedInstFhet" 
    removedInstFhet(plink, Fhet, 
                    inputPrefix=outputPrefix5, outputPrefix=outputPrefix6)
    ## step 7
    outputPrefix7 <- "2_07_removedParentIdsMiss" 
    removedParentIdsMiss(plink, inputPrefix=outputPrefix6, 
                         outputPrefix=outputPrefix7)
    ## step 8  
    outputPrefix8 <- "2_08_removedMendelErr" 
    removedMendelErr(plink, inputPrefix=outputPrefix7, cutoffSubject, 
                     cutoffSNP, outputPrefix=outputPrefix8)
    
    ## step 9
    outputPrefix9 <- "2_09_removedSnpMissPost" 
    removedSnpMiss(plink, snpMissCutOff=snpMissCutOffpost, 
                   inputPrefix=outputPrefix8, outputPrefix=outputPrefix9)
    ## step 10  
    outputPrefix10 <- "2_10_removedSnpMissDiff" 
    removedSnpMissDiff(plink, inputPrefix=outputPrefix9, 
                       snpMissDifCutOff, outputPrefix=outputPrefix10, 
                       groupLabel) 
    ## step 11
    outputPrefix11 <- "2_11_removedSnpFemaleChrXmiss" 
    removedSnpFemaleChrXmiss(plink, femaleChrXmissCutoff, 
                             inputPrefix=outputPrefix10, 
                             outputPrefix=outputPrefix11) 
    ## step 12
    outputPvalFile <- "2_12_snpHwePvalAuto.txt" 
    outputSNPfile <-  "2_12_snpRemovedHweAuto.txt" 
    outputPrefix12 <- "2_12_removedSnpHweAuto" 
    if (groupLabel == "control" | groupLabel == "caseControl"){
        removedSnpHWEauto(groupLabel="control", plink, 
                          inputPrefix=outputPrefix11, 
                          pval=pval4autoCtl, outputPvalFile, 
                          outputSNPfile, outputPrefix=outputPrefix12)
    ## HWE test is only performed on control data 
    } else { print("ERROR: HWE test on autosome!") }
    ## step 12 
    outputPvalFile <- "2_13_snpHwePvalfemaleXct.txt" 
    outputSNPfile <- "2_13_snpRemovedHweFemaleXct.txt"  
    removedSnpFemaleChrXhweControl(plink, inputPrefix=outputPrefix12, 
                                   pval=pval4femaleXctl, outputPvalFile,
                                   outputSNPfile, outputPrefix=outputPrefix)
    if (keepInterFile==FALSE){ 
        ## remove intermediate files 
        system(paste0("rm ", outputPrefix3, ".*"))
        system(paste0("rm ", outputPrefix4, ".*"))
        system(paste0("rm ", outputPrefix5, ".*"))
        system(paste0("rm ", outputPrefix6, ".*"))
        system(paste0("rm ", outputPrefix7, ".*"))
        system(paste0("rm ", outputPrefix8, ".*"))
        system(paste0("rm ", outputPrefix9, ".*"))
        system(paste0("rm ", outputPrefix10, ".*"))
        system(paste0("rm ", outputPrefix11, ".*")) 
        system(paste0("rm ", outputPrefix12, ".*")) 

        if (file.exists(outputPvalFile)) {  
            system(paste0("rm ", outputPvalFile))
        }
        if (file.exists(outputSNPfile)) {  
            system(paste0("rm ", outputSNPfile))
        }    
    }
    ## remove unwanted files
    system(paste0("rm  *.log "))   
} 
transbioZI/Gimpute documentation built on April 10, 2022, 4:20 a.m.