R/gdsCoxSurv.R
In gwasurvivr: gwasurvivr: an R package for genome wide survival analysis

Documented in gdsCoxSurv

#' Fit Cox survival to all variants from a standard IMPUTE2 output after
#' genotype imputation
#'
#' Performs survival analysis using Cox proportional hazard models on imputed
#' genetic data from GDS files.
#'
#' @param gdsfile path to .gds file. Location of the .gds file should also contain
#' .snp.rdata and .scan.rdata files.
#' @param covariate.file data.frame comprising phenotype information, all 
#'  covariates to be added in the model must be numeric.
#' @param id.column character giving the name of the ID column
#'  in covariate.file.
#' @param sample.ids character vector of sample IDs to keep in
#'  survival analysis
#' @param time.to.event character of column name in covariate.file that
#'  represents the time interval of interest in the analysis
#' @param event character of column name in covariate.file that represents
#'  the event of interest to be included in the analysis
#' @param covariates character vector with exact names of columns in
#'  covariate.file to include in analysis
#' @param inter.term character string giving the column name of the covariate
#'  that will be added to the interaction term with
#'  SNP (e.g. \code{term*SNP}). See details.
#' @param print.covs character string of either \code{"only"}, \code{"all"}
#'  or \code{"some"}, defining which covariate statistics should be printed to
#'  the output. See details.
#' @param out.file character of output file name (do not include extension) 
#' @param chunk.size integer number of variants to process per thread
#' @param maf.filter numeric to filter minor allele frequency
#'  (i.e. choosing 0.05 means filtering MAF>0.05). User can set this to 
#' \code{NULL} if no filtering is preffered. Default is 0.05.
#' @param flip.dosage logical to flip which allele the dosage was
#'  calculated on, default \code{flip.dosage=TRUE}
#' @param verbose logical for messages that describe which part of the
#'  analysis is currently being run
#' @param clusterObj A cluster object that can be used with the
#'  \code{parApply} function. See details.
#' 
#' @details 
#' Testing for SNP-covariate interactions:          
#' User can define the column name of the covariate that will be included in
#'  the interaction term. 
#' For example, for given covariates \code{a} and \code{b}, where \code{c} is
#'  defined as the \code{inter.term} the model will be:
#'  \code{~ a + b + c + SNP + c*SNP}.
#' 
#' Printing results of other covariates:       
#' \code{print.covs} argument controls the number of covariates will be printed
#'  as output. The function is set to \code{only}
#'  by default and will only print the SNP or if an interaction term is given, 
#'  the results of the interaction 
#'  term (e.g. \code{SNP*covariate}). Whereas,  \code{all} will print results
#'  (coef, se.coef, p.value etc) of all covariates included in the model.
#'  \code{some} is only applicable if an interaction term is given and will 
#'  print the results for SNP, covariate tested for interaction and the
#'  interaction term. User should be mindful about using the \code{all} option,
#'  as it will likely slow down the analysis and will increase
#'  the output file size. 
#'
#' User defined parallelization:  
#' This function uses \code{parApply} from \code{parallel} package to fit 
#'  models to SNPs in parallel. 
#' User is not required to set any options for the parallelization. 
#' However, advanced users who wish to optimize it, can provide a 
#'  cluster object generated by \code{makeCluster} family of functions that 
#'  suits their need and platform.
#'
#' @return
#' Saves two text files directly to disk:  
#' \code{.coxph} extension containing CoxPH survival analysis results.  
#' \code{.snps_removed} extension containing SNPs that were removed
#'  due to low variance or user-defined thresholds.   
#' 
#' @examples
#' gdsfile <- system.file(package="gwasurvivr",
#'                        "extdata",
#'                        "gds_example.gds")
#' covariate.file <- system.file(package="gwasurvivr", 
#'                               "extdata",
#'                               "simulated_pheno.txt")
#' covariate.file <- read.table(covariate.file,
#'                              sep=" ",
#'                              header=TRUE,
#'                              stringsAsFactors = FALSE)
#' covariate.file$SexFemale <- ifelse(covariate.file$sex=="female", 1L, 0L)
#' sample.ids <- covariate.file[covariate.file$group=="experimental",]$ID_2
#' gdsCoxSurv(gdsfile=gdsfile,
#'            covariate.file=covariate.file,
#'            id.column="ID_2",
#'            sample.ids=sample.ids,
#'            time.to.event="time",
#'            event="event",
#'            covariates=c("age", "SexFemale", "DrugTxYes"),
#'            inter.term=NULL,
#'            print.covs="only",
#'            out.file="impute_example",
#'            chunk.size=50,
#'            maf.filter=0.005,
#'            flip.dosage=TRUE,
#'            verbose=TRUE,
#'            clusterObj=NULL)  
#'  
#' @importFrom survival Surv coxph.fit
#' @importFrom matrixStats rowMeans2 rowVars rowSds
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom utils write.table
#' @importFrom stats pnorm rnorm setNames complete.cases
#' @import parallel
#' @import GWASTools
#'  
#' @export

gdsCoxSurv <- function(gdsfile,
                       covariate.file,
                       id.column,
                       sample.ids = NULL,
                       time.to.event,
                       event,
                       covariates,
                       inter.term = NULL,
                       print.covs = "only",
                       out.file,
                       chunk.size = 5000,
                       maf.filter = 0.05,
                       flip.dosage = TRUE,
                       verbose = TRUE,
                       clusterObj = NULL)
{
    if (verbose) message("Analysis started on ",
                         format(Sys.time(), "%Y-%m-%d"),
                         " at ", 
                         format(Sys.time(), "%H:%M:%S"))
    
    ############################################################################
    #### Phenotype data wrangling ##############################################
    cox.params <- coxPheno(covariate.file,
                           covariates,
                           id.column,
                           inter.term, 
                           time.to.event,
                           event, 
                           sample.ids, 
                           verbose)
    ############################################################################
    
    ############################################################################
    #### Prep output files #####################################################
    
    # set up columns for output
    cols <- c("RSID",
              "TYPED",
              "CHR", 
              "POS",
              "A0",
              "A1",
              "exp_freq_A1",
              "SAMP_MAF")
    write.table( t(cols),
                 paste0(out.file, ".snps_removed"),
                 row.names = FALSE,
                 col.names=FALSE,
                 sep="\t",
                 quote = FALSE,
                 append = FALSE)
    
    snp.df <- data.frame(matrix(data = rep(NA, 16), ncol = 8 ) )
    colnames(snp.df) <- cols
    rownames(snp.df) <- NULL
    
    snp.spike <- rbind(c(rnorm(nrow(cox.params$pheno.file)-3), rep(NA, 3)),
                       c(rnorm(nrow(cox.params$pheno.file)-4), rep(NA, 4))
    )
    
    if(is.null(inter.term)){
        cox.out <- t(apply(snp.spike, 1, survFit,
                           cox.params=cox.params,
                           print.covs=print.covs) )
        
        res.cols <- colnames(coxExtract(cox.out,snp.df,print.covs=print.covs))
        
        write.table(t(res.cols),
                    paste0(out.file, ".coxph"),
                    row.names = FALSE,
                    col.names=FALSE,
                    sep="\t",
                    quote = FALSE,
                    append = FALSE)
    } else {
        cox.out <- t(apply(snp.spike,
                           1,
                           survFitInt,
                           cox.params=cox.params,
                           cov.interaction=inter.term, 
                           print.covs=print.covs) )
        
        res.cols <- colnames(coxExtract(cox.out,
                                        snp.df, 
                                        print.covs=print.covs) )
        
        write.table( t(res.cols),
                     paste0(out.file, ".coxph"),
                     row.names = FALSE,
                     col.names=FALSE,
                     sep="\t",
                     quote = FALSE,
                     append = FALSE)
    }
    
    ############################################################################
    
    ############################################################################
    ##### Generate cluster obj #################################################
    
    # create cluster object depending on user pref or OS type,
    # also create option to input number of cores
    if(!is.null(clusterObj)){
        cl <- clusterObj
    }else if(.Platform$OS.type == "unix") {
        cl <- makePSOCKcluster(getOption("gwasurvivr.cores", 2L))
        clusterEvalQ(cl, library(gwasurvivr))
    } else {
        cl <- makeCluster(getOption("gwasurvivr.cores", 2L))
    }
    on.exit(stopCluster(cl), add=TRUE)
    
    ############################################################################
    
    ############################################################################
    ##### Load Genotype data ###################################################
    
    # read genotype
    ## need to add if statement about dimensions
    # set default "snp,scan" -- 
    # in GWASTools documentation say it needs to be in this orientation
    gds <- GdsGenotypeReader(gdsfile, genotypeDim="scan,snp")
    # close gds file on exit of the function
    on.exit(close(gds), add=TRUE)
    # aux files
    snpfile <- sub("\\.[^.]*?$", ".snp.rdata", gdsfile)
    scanfile <- sub("\\.[^.]*?$", ".scan.rdata", gdsfile)
    # read in snp data
    snpAnnot <- getobj(snpfile)
    # read scan
    scanAnnot <- getobj(scanfile)
    # put into GenotypeData coding 
    genoData <- GenotypeData(gds,
                             snpAnnot=snpAnnot,
                             scanAnnot=scanAnnot)
    
    
    ############################################################################
    ##### Genotype data wrangling ##############################################
    
    # number of snps in segment
    snp.start <- 1
    snp.end <- nsnp(genoData)
    # number of dropped snps
    snp.drop.n <-0
    snp.n <- 0
    
    # get genotypes for certain chunk size
    nsnp.seg <- snp.end - snp.start + 1
    nchunks <- ceiling(nsnp.seg/chunk.size)
    
    for(i in seq_len(nchunks)){
        
        if(verbose) message("Analyzing part ", i, "/", nchunks, "...")
        
        # set up chunks
        next.chunk <- (i-1)*chunk.size
        next.chunk.start <- snp.start + next.chunk
        snp.chunk <- ifelse(next.chunk.start + chunk.size > snp.end,
                            snp.end - next.chunk.start + 1,
                            chunk.size)
        chunk.idx <- (next.chunk+1):(next.chunk+snp.chunk)
        
        # get genotypes for chunk
        genotypes <- getGenotype(genoData,
                                 snp=c(next.chunk.start, snp.chunk),
                                 scan=c(1,-1),
                                 drop=FALSE)
        
        # get the snp info file
        snp <- getAnnotation(getSnpAnnotation(genoData))[chunk.idx,]
        snp.cols <- c("snpID",
                      "TYPED",
                      "RSID",
                      "POS",
                      "A0",
                      "A1",
                      "CHR")
        snp.ord <- c("RSID",
                     "TYPED",
                     "CHR",
                     "POS",
                     "A0",
                     "A1")
        
        colnames(snp) <- snp.cols
        snp <- snp[, snp.ord]
        
        # grab sample file data
        scanAnn <- getAnnotation(getScanAnnotation(genoData))
        
        # assign rsIDs (pasted with imputation status) as rows 
        # and sample ID as columns to genotype file
        dimnames(genotypes) <- list(paste(snp$snp, snp$rsID, sep=";"), 
                                    scanAnn$ID_2)
        
        # Subset genotypes by given samples
        genotypes <- genotypes[,cox.params$ids]
        
        # flip dosage
        if(flip.dosage) genotypes <- 2 - genotypes
        ########################################################################
        
        ########################################################################
        ##### SNP info and filtering ###########################################
        
        ### Check snps for MAF = 0  ###
        # remove snps with SD less than 1e-4
        # to put this in perspective:
        # a sample size of 100 000 000 with only 1 person being 1 and rest 0,
        # has an SD = 1e-4
        # x <- c(rep(0, 1e8),1)
        # sd(x)
        ok.snp <- rowSds(genotypes, na.rm = TRUE) > 1e-4
        snp <- snp[ok.snp, ]
        genotypes <- genotypes[ok.snp, ]
        snp.drop <- snp[!ok.snp, ]
        
        # calculate MAF
        snp$exp_freq_A1 <- round(rowMeans2(genotypes, na.rm = TRUE)*0.5,4)
        snp$SAMP_MAF <- ifelse(snp$exp_freq_A1 > 0.5,
                               1-snp$exp_freq_A1,
                               snp$exp_freq_A1
        )
        
        # Further filter by user defined thresholds
        if (!is.null(maf.filter)) {
            ok.snp <- snp$SAMP_MAF > maf.filter
            genotypes <- genotypes[ok.snp,]
            snp <- snp[ok.snp,]
            
            if(nrow(snp.drop) > 0){
                snp.drop$exp_freq_A1 <- 1
                snp.drop$SAMP_MAF <- 0
                snp.drop <- rbind(snp.drop, snp[!ok.snp,])
            } else {
                snp.drop <- snp[!ok.snp,]
            }
        }
        
        if (nrow(snp.drop) > 0) {
            write.table(
                snp.drop,
                paste0(out.file, ".snps_removed"),
                row.names = FALSE,
                col.names = FALSE,
                sep = "\t",
                quote = FALSE,
                append = TRUE )
            snp.drop.n <- snp.drop.n+nrow(snp.drop)
        }
        
        if (nrow(genotypes) > 0) {
            # fit models in parallel
            if (is.null(inter.term)) {
                if (is.matrix(genotypes)) {
                    # browser()
                    # cl2 <- makeForkCluster(2L)
                    cox.out <- t(
                        parallel::parApply(
                            cl = cl,
                            X = genotypes,
                            MARGIN = 1,
                            FUN = survFit,
                            cox.params = cox.params,
                            print.covs = "only"
                        )
                    )
                } else if(is.numeric(genotypes)) {
                    cox.out <- survFit(genotypes,
                                       cox.params = cox.params,
                                       print.covs = print.covs)
                }
            } else if (inter.term %in% covariates) {
                if (is.matrix(genotypes)) {
                    cox.out <- t(
                        parApply(
                            cl = cl,
                            X = genotypes,
                            MARGIN = 1,
                            FUN = survFitInt,
                            cox.params = cox.params,
                            cov.interaction = inter.term,
                            print.covs = print.covs
                        )
                    )
                } else if(is.numeric(genotypes)) {
                    cox.out <- survFitInt(
                        genotypes,
                        cox.params = cox.params,
                        cov.interaction = inter.term,
                        print.covs = print.covs
                    )
                }
            }
            
            res <- coxExtract(cox.out,
                              snp,
                              print.covs)
            
            write.table(
                res,
                file = paste0(out.file, ".coxph"),
                sep = "\t",
                quote = FALSE,
                row.names = FALSE,
                col.names = FALSE,
                append = TRUE
            )
            
            snp.n <- nrow(genotypes) + snp.n
            
        }
        
    }
    
    if(verbose) {
        message(snp.drop.n," SNPs were removed from the analysis for not meeting\n",
                "the given threshold criteria or for having MAF = 0")
    }
    if(verbose) message("List of removed SNPs are saved to ",
                        paste0(out.file, ".snps_removed"))
    if(verbose) message("In total ", snp.n,
                        " SNPs were included in the analysis")
    if(verbose) message("The Cox model results output was saved to ",
                        paste0(out.file, ".coxph"))
    if (verbose) message("Analysis completed on ",
                         format(Sys.time(), "%Y-%m-%d"),
                         " at ",
                         format(Sys.time(), "%H:%M:%S"))
}