R/plinkCoxSurv.R
In gwasurvivr: gwasurvivr: an R package for genome wide survival analysis

Documented in plinkCoxSurv

#' Fit Cox survival to all variants from PLINK binary files (.BED, .BIM, .FAM)
#'
#' Performs survival analysis using Cox proportional hazard models on directly 
#' typed data in PLINK format
#'
#' @param bed.file character of name of plink files without extension 
#' @param covariate.file data.frame comprising phenotype information, all 
#'  covariates to be added in the model must be numeric.
#' @param id.column character giving the name of the ID column
#'  in covariate.file.
#' @param sample.ids character vector of sample IDs to keep in
#'  survival analysis
#' @param time.to.event character of column name in covariate.file that
#'  represents the time interval of interest in the analysis
#' @param event character of column name in covariate.file that represents
#'  the event of interest to be included in the analysis
#' @param covariates character vector with exact names of columns in
#'  covariate.file to include in analysis
#' @param inter.term character string giving the column name of the covariate
#'  that will be added to the interaction term with
#'  SNP (e.g. \code{term*SNP}). See details.
#' @param print.covs character string of either \code{"only"}, \code{"all"}
#'  or \code{"some"}, defining which covariate statistics should be printed to
#'  the output. See details.
#' @param out.file character of output file name (do not include extension) 
#' @param chunk.size integer number of variants to process per thread
#' @param maf.filter numeric to filter minor allele frequency
#'  (i.e. choosing 0.05 means filtering MAF>0.05). User can set this to 
#' \code{NULL} if no filtering is preffered. Default is 0.05.
#' @param flip.dosage logical to flip which allele the dosage was
#'  calculated on, default \code{flip.dosage=TRUE}
#' @param verbose logical for messages that describe which part of the
#'  analysis is currently being run
#' @param clusterObj A cluster object that can be used with the
#'  \code{parApply} function. See details.
#' 
#' @details 
#' Testing for SNP-covariate interactions:          
#' User can define the column name of the covariate that will be included in
#'  the interaction term. 
#' For example, for given covariates \code{a} and \code{b}, where \code{c} is
#'  defined as the \code{inter.term} the model will be:
#'  \code{~ a + b + c + SNP + c*SNP}.
#' 
#' Printing results of other covariates:       
#' \code{print.covs} argument controls the number of covariates will be printed
#'  as output. The function is set to \code{only}
#'  by default and will only print the SNP or if an interaction term is given, 
#'  the results of the interaction 
#'  term (e.g. \code{SNP*covariate}). Whereas,  \code{all} will print results
#'  (coef, se.coef, p.value etc) of all covariates included in the model.
#'  \code{some} is only applicable if an interaction term is given and will 
#'  print the results for SNP, covariate tested for interaction and the
#'  interaction term. User should be mindful about using the \code{all} option,
#'  as it will likely slow down the analysis and will increase
#'  the output file size. 
#'
#' User defined parallelization:  
#' This function uses \code{parApply} from \code{parallel} package to fit 
#'  models to SNPs in parallel. 
#' User is not required to set any options for the parallelization. 
#' However, advanced users who wish to optimize it, can provide a 
#'  cluster object generated by \code{makeCluster} family of functions that 
#'  suits their need and platform.
#'
#' @return
#' Saves two text files directly to disk:  
#' \code{.coxph} extension containing CoxPH survival analysis results.  
#' \code{.snps_removed} extension containing SNPs that were removed
#'  due to low variance or user-defined thresholds.   
#' 
#' @examples
#' bed.file <- system.file(package="gwasurvivr",
#'                        "extdata",
#'                        "plink_example.bed")
#' covariate.file <- system.file(package="gwasurvivr", 
#'                               "extdata",
#'                               "simulated_pheno.txt")
#' covariate.file <- read.table(covariate.file,
#'                              sep=" ",
#'                              header=TRUE,
#'                              stringsAsFactors = FALSE)
#' covariate.file$SexFemale <- ifelse(covariate.file$sex=="female", 1L, 0L)
#' sample.ids <- covariate.file[covariate.file$group=="experimental",]$ID_2
#' plinkCoxSurv(bed.file=bed.file,
#'              covariate.file=covariate.file,
#'              id.column="ID_2",
#'              sample.ids=sample.ids,
#'              time.to.event="time",
#'              event="event",
#'              covariates=c("age", "SexFemale", "DrugTxYes"),
#'              inter.term=NULL,
#'              print.covs="only",
#'              out.file="impute_example",
#'              chunk.size=50,
#'              maf.filter=0.005,
#'              flip.dosage=TRUE,
#'              verbose=TRUE,
#'              clusterObj=NULL)  
#'  
#' @importFrom survival Surv coxph.fit
#' @importFrom matrixStats rowMeans2 rowVars rowSds
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom utils write.table
#' @importFrom stats pnorm rnorm setNames complete.cases
#' @importFrom SNPRelate snpgdsBED2GDS
#' @import parallel
#' @import GWASTools
#'  
#' @export


plinkCoxSurv <- function(bed.file,
                         covariate.file,
                         id.column,
                         sample.ids=NULL, 
                         time.to.event, 
                         event,
                         covariates,
                         inter.term=NULL,
                         print.covs="only",
                         out.file,
                         chunk.size=10000,
                         maf.filter=0.005,
                         flip.dosage=TRUE,
                         verbose=TRUE,
                         clusterObj=NULL)
{
    

    bed.file <- bed.file
    bim.file <- sub("\\.[^.]*?$", ".bim", bed.file)
    fam.file <- sub("\\.[^.]*?$", ".fam", bed.file)
    ###################################
    #### Phenotype data wrangling #####
    cox.params <- coxPheno(covariate.file,
                           covariates,
                           id.column,
                           inter.term, 
                           time.to.event,
                           event, 
                           sample.ids, 
                           verbose)
    ###################################
    ###################################
    ##### Generate cluster obj ########
    # create cluster object depending on 
    # user pref or OS type, also create 
    #option to input number of cores
    if(!is.null(clusterObj)){
        cl <- clusterObj
    }else if(.Platform$OS.type == "unix") {
        cl <- makePSOCKcluster(getOption("gwasurvivr.cores", 2L))
    } else {
        cl <- makeCluster(getOption("gwasurvivr.cores", 2L))
    }
    on.exit(stopCluster(cl), add=TRUE)
    
    gdsfile <- tempfile(pattern="", fileext = ".gds")
    
    comp_time <- system.time(snpgdsBED2GDS(bed.file, 
                  fam.file,
                  bim.file,
                  gdsfile,
                  cvt.chr="int",
                  cvt.snpid="int",
                  verbose=TRUE)
    )
    
    message("***** Compression time ******\n", 
            "User:", round(comp_time[1], 3),
            "\nSystem: ", round(comp_time[2], 3),
            "\nElapsed: ", round(comp_time[3], 3), 
            "\n*****************************"
    )
    
    gds <- GdsGenotypeReader(gdsfile,
                             YchromCode=24L, 
                             XYchromCode=25L)
    

    scanID <- getScanID(gds)
    scanAnnot <- ScanAnnotationDataFrame(data.frame(scanID,
                                                    stringsAsFactors=FALSE))
    snpID <- getSnpID(gds)
    chromosome <- getChromosome(gds)
    position <- getPosition(gds)
    alleleA <- getAlleleA(gds)
    alleleB <- getAlleleB(gds)
    rsID <- getVariable(gds, "snp.rs.id")
    # requires snpID
    snpAnnot <- SnpAnnotationDataFrame(data.frame(snpID,
                                                  rsID,
                                                  chromosome,
                                                  position,
                                                  alleleA,
                                                  alleleB,
                                                  stringsAsFactors=FALSE),
                                       YchromCode=24L,
                                       XYchromCode=25L)
    genoData <- GenotypeData(gds,
                             scanAnnot=scanAnnot, 
                             snpAnnot=snpAnnot)

    # set up columns for output
    
    cols <- c("RSID",
              "CHR", 
              "POS",
              "A0",
              "A1",
              "exp_freq_A1",
              "SAMP_MAF")
    
    write.table( t(cols),
                 paste0(out.file, ".snps_removed"),
                 row.names = FALSE,
                 col.names=FALSE,
                 sep="\t",
                 quote = FALSE,
                 append = FALSE)
    
    snp.df <- data.frame(t(rep(NA, 7)))
    colnames(snp.df) <- cols
    rownames(snp.df) <- NULL
    
    snp.spike <- rbind(rnorm(nrow(cox.params$pheno.file)),
                       rnorm(nrow(cox.params$pheno.file)))
    
    if(!is.null(inter.term)){
        cox.out <- t(apply(snp.spike,
                           1,
                           survFitInt,
                           cox.params=cox.params,
                           cov.interaction=inter.term, 
                           print.covs=print.covs)
        )
        res.cols <- colnames(coxExtract(cox.out,
                                        snp.df,
                                        print.covs=print.covs)
        )
        
        write.table( t(res.cols),
                     paste0(out.file, ".coxph"),
                     row.names = FALSE,
                     col.names=FALSE,
                     sep="\t",
                     quote = FALSE,
                     append = FALSE)
    } else {
        cox.out <- t(apply(snp.spike,
                           1, 
                           survFit,
                           cox.params=cox.params,
                           print.covs=print.covs)
        )
        res.cols <- colnames(coxExtract(cox.out,
                                        snp.df,
                                        print.covs=print.covs)
        )
        

        write.table(t(res.cols),
                    paste0(out.file, ".coxph"),
                    row.names = FALSE,
                    col.names=FALSE,
                    sep="\t",
                    quote = FALSE,
                    append = FALSE)
    }
    
    # number of dropped snps
    snp.drop.n <-0
    snp.n <- 0
    
    
    snp.start <- 1
    snp.end <- nsnp(genoData)
    # get genotypes for certain chunk size
    nsnp.seg <- snp.end - snp.start + 1
    nchunks <- ceiling(nsnp.seg/chunk.size)
    
    
    for(i in seq_len(nchunks)){
      
      if(verbose) message("Analyzing part ", i, "/", nchunks, "...")
      
      # set up chunks
      next.chunk <- (i-1)*chunk.size
      next.chunk.start <- snp.start + next.chunk
      snp.chunk <- ifelse(next.chunk.start + chunk.size > snp.end,
                          snp.end - next.chunk.start + 1,
                          chunk.size)
      chunk.idx <- (next.chunk+1):(next.chunk+snp.chunk)
      
      # get genotypes for chunk
      genotypes <- getGenotype(genoData,
                               snp=c(next.chunk.start, snp.chunk),
                               scan=c(1,-1),
                               drop=FALSE)
      
      # get the snp info file
      snp <- getAnnotation(getSnpAnnotation(genoData))[chunk.idx,]
      snp.cols <- c("snpID",
                    "RSID",
                    "CHR",
                    "POS",
                    "A0",
                    "A1")
      snp.ord <- c("RSID",
                   "CHR",
                   "POS",
                   "A0",
                   "A1")
      
      colnames(snp) <- snp.cols
      snp <- snp[, snp.ord]
      blankSNPs <- snp$A0 == "0" & snp$A1 == "0"
      # grab sample file data
      scanAnn <- getAnnotation(getScanAnnotation(genoData))
      
      # assign rsIDs (pasted with imputation status) as rows 
      # and sample ID as columns to genotype file
      dimnames(genotypes) <- list(snp$RSID, 
                                  scanAnn$scanID)
      
      # Subset genotypes by given samples
      genotypes <- genotypes[!blankSNPs,cox.params$ids]
      snp <- snp[!blankSNPs,]
      
      # flip dosage
      if(flip.dosage) genotypes <- 2 - genotypes
      ########################################################################
      
      ########################################################################
      ##### SNP info and filtering ###########################################
      
      ### Check snps for MAF = 0  ###
      # remove snps with SD less than 1e-4
      # to put this in perspective:
      # a sample size of 100 000 000 with only 1 person being 1 and rest 0,
      # has an SD = 1e-4
      # x <- c(rep(0, 1e8),1)
      # sd(x)
      ok.snp <- rowSds(genotypes, na.rm = TRUE) > 1e-4
      snp <- snp[ok.snp, ]
      genotypes <- genotypes[ok.snp, ]
      snp.drop <- snp[!ok.snp, ]
      
      # calculate MAF
      snp$exp_freq_A1 <- round(rowMeans2(genotypes, na.rm = TRUE)*0.5,4)
      snp$SAMP_MAF <- ifelse(snp$exp_freq_A1 > 0.5,
                             1-snp$exp_freq_A1,
                             snp$exp_freq_A1
      )
      
      
      # Further filter by user defined thresholds
      if (!is.null(maf.filter)) {
        ok.snp <- snp$SAMP_MAF > maf.filter
        genotypes <- genotypes[ok.snp,]
        snp <- snp[ok.snp,]
        
        if(nrow(snp.drop) > 0){
          snp.drop$exp_freq_A1 <- 1
          snp.drop$SAMP_MAF <- 0
          snp.drop <- rbind(snp.drop, snp[!ok.snp,])
        } else {
          snp.drop <- snp[!ok.snp,]
        }
      }
      
      if (nrow(snp.drop) > 0) {
        write.table(
          snp.drop,
          paste0(out.file, ".snps_removed"),
          row.names = FALSE,
          col.names = FALSE,
          sep = "\t",
          quote = FALSE,
          append = TRUE )
        snp.drop.n <- snp.drop.n+nrow(snp.drop)
      }
      
      if (nrow(genotypes) > 0) {
        # fit models in parallel
        if (is.null(inter.term)) {
          if (is.matrix(genotypes)) {
            cox.out <- t(
              parallel::parApply(
                cl = cl,
                1L,
                X = genotypes,
                FUN = survFit,
                cox.params = cox.params,
                print.covs = print.covs
              )
            )
          } else if(is.numeric(genotypes)) {
            cox.out <- survFit(genotypes,
                               cox.params = cox.params,
                               print.covs = print.covs)
          }
        } else if (inter.term %in% covariates) {
          if (is.matrix(genotypes)) {
            cox.out <- t(
              parApply(
                cl = cl,
                X = genotypes,
                MARGIN = 1,
                FUN = survFitInt,
                cox.params = cox.params,
                cov.interaction = inter.term,
                print.covs = print.covs
              )
            )
          } else if(is.numeric(genotypes)) {
            cox.out <- survFitInt(
              genotypes,
              cox.params = cox.params,
              cov.interaction = inter.term,
              print.covs = print.covs
            )
          }
        }
        
        res <- coxExtract(cox.out,
                          snp,
                          print.covs)
        
        write.table(
          res,
          file = paste0(out.file, ".coxph"),
          sep = "\t",
          quote = FALSE,
          row.names = FALSE,
          col.names = FALSE,
          append = TRUE
        )
        
        snp.n <- nrow(genotypes) + snp.n
        
      }
      
    }
    
    if(verbose) {
      message(snp.drop.n," SNPs were removed from the analysis for not meeting\n",
              "the given threshold criteria or for having MAF = 0")
    }
    if(verbose) message("List of removed SNPs are saved to ",
                        paste0(out.file, ".snps_removed"))
    if(verbose) message("In total ", snp.n,
                        " SNPs were included in the analysis")
    if(verbose) message("The Cox model results output was saved to ",
                        paste0(out.file, ".coxph"))
    if (verbose) message("Analysis completed on ",
                         format(Sys.time(), "%Y-%m-%d"),
                         " at ",
                         format(Sys.time(), "%H:%M:%S"))
    
}