R/michiganCoxSurv.R

Defines functions michiganCoxSurv

Documented in michiganCoxSurv

#' Fit Cox survival to all variants in a .vcf.gz file from
#'  Michigan imputation server
#' 
#' Performs survival analysis using Cox proportional hazard models on imputed
#'  genetic data stored in compressed VCF files 
#' 
#' @param vcf.file character(1) path to VCF file.
#' @param covariate.file matrix(1) comprising phenotype (time, event) and
#'  additional covariate data. 
#' @param id.column character(1) providing exact match to sample ID column 
#'  from \code{covariate.file}
#' @param time.to.event character(1) string that matches time column name 
#'  in pheno.file
#' @param event character(1) string that matches event column name in 
#'  pheno.file
#' @param covariates character vector with matching column names in 
#'  pheno.file of covariates of interest
#' @param inter.term character(1) string giving the column name of the 
#'  covariate that will be added to the interaction term with
#'   SNP (e.g. \code{term*SNP}). See details.
#' @param print.covs character(1) string of either \code{"only"},
#'  \code{"all"} or \code{"some"}, 
#'  defining which covariate statistics should be printed to the output.
#'   See details.
#' @param sample.ids character vector with sample ids to include in analysis
#' @param r2.filter integer(1) of imputation quality
#'  score filter (i.e. 0.7 will filter r2 > 0.7)
#' @param maf.filter integer(1) filter out minor allele frequency below 
#'  threshold (i.e. 0.005 will filter MAF > 0.005)
#' @param chunk.size integer(1) number of variants to process per thread
#' @param out.file character(1) string with output name
#' @param verbose logical(1) for messages that describe which part of the
#'  analysis is currently being run
#' @param clusterObj A cluster object that can be used with the
#'  \code{parApply} function. See details.
#' 
#' @details 
#' Testing for SNP-covariate interactions:          
#' User can define the column name of the covariate that will be included in 
#'  the interaction term. 
#' For example, for given covariates \code{a} and \code{b}, where \code{c} is
#'  defined as the \code{inter.term} the model will be: 
#'  \code{~ a + b + c + SNP + c*SNP}.
#' 
#' Printing results of other covariates:       
#' \code{print.covs} argument controls the number of covariates will be printed 
#' as output. The function is set to \code{only} by default and will only print 
#' the SNP or if an interaction term is given, the results of the interaction 
#' term (e.g. \code{SNP*covariate}). Whereas,  \code{all} will print results
#'  (coef, se.coef, p.value etc) of all covariates included in the model. 
#'  \code{some} is only applicable if an interaction term is given and will
#'   print the results for SNP, covariate tested for interaction 
#'   and the interaction term. 
#' User should be mindful about using the \code{all} option, as it will likely 
#'  slow down the analysis and will increase the output file size. 
#'
#' User defined parallelization:  
#' This function uses \code{parApply} from \code{parallel} package to fit 
#'  models to SNPs in parallel. 
#' User is not required to set any options for the parallelization. 
#' However, advanced users who wish to optimize it, can provide a cluster 
#'  object generated by \code{makeCluster} family of functions that suits 
#'  their need and platform.
#
#' @return 
#' Saves two text files directly to disk:  
#' \code{.coxph} extension containing CoxPH survival analysis results.  
#' \code{.snps_removed} extension containing SNPs that were removed
#'  due to low variance or user-defined thresholds.   
#' 
#' @examples 
#' vcf.file <- system.file(package="gwasurvivr",
#'                         "extdata",
#'                      "michigan.chr14.dose.vcf.gz")
#' pheno.fl <- system.file(package="gwasurvivr",
#'                         "extdata",
#'                      "simulated_pheno.txt")
#' pheno.file <- read.table(pheno.fl, 
#'                          sep=" ",
#'                          header=TRUE,
#'                          stringsAsFactors = FALSE)
#' pheno.file$SexFemale <- ifelse(pheno.file$sex=="female", 1L, 0L)
#' sample.ids <- pheno.file[pheno.file$group=="experimental",]$ID_2
#' michiganCoxSurv(vcf.file=vcf.file,
#'               covariate.file=pheno.file,
#'               id.column="ID_2",
#'               sample.ids=sample.ids,
#'               time.to.event="time",
#'               event="event",
#'               covariates=c("age", "SexFemale", "DrugTxYes"),
#'               inter.term=NULL,
#'               print.covs="only",
#'               out.file="michigan_example",
#'               r2.filter=0.3,
#'               maf.filter=0.005,
#'               chunk.size=50,
#'               verbose=TRUE,
#'               clusterObj=NULL)
#' 
#' @importFrom survival Surv coxph.fit
#' @importFrom utils write.table
#' @importFrom matrixStats rowMeans2
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom stats pnorm complete.cases
#' @import parallel
#' @import VariantAnnotation
#' 
#' @export

michiganCoxSurv <- function(vcf.file,
                            covariate.file,
                            id.column,
                            sample.ids=NULL,
                            time.to.event,
                            event,
                            covariates,
                            inter.term=NULL,
                            print.covs="only",
                            out.file,
                            maf.filter=0.05,
                            r2.filter=NULL,
                            chunk.size=5000,
                            verbose=TRUE,
                            clusterObj=NULL){
    
    if(verbose) message("Analysis started on ",
                        format(Sys.time(), "%Y-%m-%d"),
                        " at ",
                        format(Sys.time(), "%H:%M:%S"))
    ################################################
    #### Phenotype data wrangling ################
    cox.params <- coxPheno(covariate.file,
                           covariates,
                           id.column, 
                           inter.term,
                           time.to.event,
                           event,
                           sample.ids,
                           verbose)
    ################################################
    ################################################
    ########## Cluster object ########################
    # create cluster object depending on user pref or OS type,
    # also create option to input number of cores
    if(!is.null(clusterObj)){
        cl <- clusterObj
    }else if(.Platform$OS.type == "unix") {
        cl <- makePSOCKcluster(getOption("gwasurvivr.cores", 2L))
    } else {
        cl <- makeCluster(getOption("gwasurvivr.cores", 2L))
    }
    on.exit(stopCluster(cl), add=TRUE)
    ################################################
    #### open VCF file connection ##################
    vcf <- VcfFile(vcf.file, yieldSize=chunk.size)
    open(vcf)
    ################################################
    ####### read first chunk #######################
    chunk.start <- 0
    if(verbose) message("Analyzing chunk ",
                        chunk.start,
                        "-", 
                        chunk.start+chunk.size)    
    data <- readVcf(vcf,
                    param=ScanVcfParam(geno="DS",
                                       info=c("AF", "MAF", "R2", "ER2")))
    out.list <- coxVcfMichigan(data,
                               covariates, 
                               maf.filter, 
                               r2.filter, 
                               cox.params,
                               cl,
                               inter.term,
                               print.covs)
    write.table(
        out.list$res,
        paste0(out.file, ".coxph"),
        append = FALSE,
        row.names = FALSE,
        col.names = TRUE,
        quote = FALSE,
        sep = "\t"
    )
    write.table(
        out.list$dropped.snps,
        paste0(out.file, ".snps_removed"),
        append = FALSE,
        row.names = FALSE,
        col.names = FALSE,
        quote = FALSE,
        sep = "\t"
    )
    chunk.start <- chunk.size
    snps_removed <- nrow(out.list$dropped.snps)
    snps_analyzed <- nrow(out.list$res)
    ################################################
    ################################################
    ##### Start repeat loop ########################
    # get genotype probabilities by chunks
    # apply the survival function and save output
    repeat{ 
        # read in just dosage data from Vcf file
        if(verbose) message("Analyzing chunk ",
                            chunk.start,
                            "-",
                            chunk.start+chunk.size)    
        
        data <- readVcf(vcf,
                        param=ScanVcfParam(geno="DS",
                                           info=c("AF", "MAF", "R2", "ER2")
                                           )
                        )
        if(nrow(data)==0){
            break
        }
        out.list <- coxVcfMichigan(data,
                                   covariates,
                                   maf.filter, 
                                   r2.filter,
                                   cox.params,
                                   cl,
                                   inter.term, 
                                   print.covs)
        write.table(
            out.list$res,
            paste0(out.file, ".coxph"),
            append = TRUE,
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE,
            sep = "\t"
        )
        write.table(
            out.list$dropped.snps,
            paste0(out.file, ".snps_removed"),
            append = TRUE,
            row.names = FALSE,
            col.names = FALSE,
            quote = FALSE,
            sep = "\t"
        )
        chunk.start <- chunk.start+chunk.size
        snps_removed <- snps_removed+nrow(out.list$dropped.snps)
        snps_analyzed <-  snps_analyzed+nrow(out.list$res)
    }
    ################################################
    close(vcf)
    if(verbose) message("Analysis completed on ", 
                        format(Sys.time(), "%Y-%m-%d"),
                        " at ",
                        format(Sys.time(), "%H:%M:%S"))
    if(verbose){
        message(snps_removed,
                " SNPs were removed from the analysis for ",
                "not meeting the threshold criteria.")   
    } 
    if(verbose) message("List of removed SNPs can be found in ",
                        paste0(out.file, ".snps_removed"))
    if(verbose) message(snps_analyzed,
                        " SNPs were analyzed in total")
    if(verbose) message("The survival output can be found at ",
                        paste0(out.file, ".coxph"))
}

Try the gwasurvivr package in your browser

Any scripts or data that you put into this service are public.

gwasurvivr documentation built on Nov. 8, 2020, 6:53 p.m.