inst/script/create_collapsing_matrix.R

#!/usr/bin/Rscript

require(optparse)
require(data.table)

require(rvatk)                                                               

Main <- function() {
  option.list <- GetOptList()
  opt <- parse_args(OptionParser(option_list=option.list))

  # read in geneset file
  geneset <- ReadListFile(opt$geneset.file)

  # read in sample file
  samples.tbl <- read.table(opt$sampleped.file, stringsAsFactors=F)
  cases <- samples.tbl[samples.tbl[,6]==2, 2]
  ctrls <- samples.tbl[samples.tbl[,6]==1, 2]
  samples <- c(cases, ctrls)

  # verify no repeats of sampleID or genename
  if (TRUE %in% duplicated(samples)) {
    stop("duplicated samples in input sample file")
  } else if (TRUE %in% duplicated(geneset)) {
    stop("duplicated gene names in input geneset file")
  }

  # build collapsing matrix based on geneset and sample files
  mat <- CollapsingMatrixInit(geneset, samples)
  
  # read in qualvar file if defined, else read in genotypes CSV
  if (is.null(opt$qualvar) == F) {
    qualvar <- read.table(opt$qualvar, header=F)
    qualvar <- qualvar[ qualvar[,1] %in% geneset & 
                        qualvar[,3] %in% samples, , drop=F]
    mat <- CollapsingMatrixRecode(mat, qualvar)
  } else if (is.null(opt$variant) == F | 
              is.null(opt$exclude.variant) == F) {
    variant <- c()
    if (is.null(opt$variant) == F) { 
      variant <- ScanList(opt$variant) 
    }
    exclude.variant <- c()
    if (is.null(opt$exclude.variant) == F) { 
      exclude.variant <- ScanList(opt$exclude.variant) 
    }
  } else if (is.null(opt$genotypes.csv) == F) {
    geno <- ReadLargeTable(opt$genotypes.csv, 
                           showProgress=F,
                           header=T, check.names=F)
    geno[[opt$gene.name.col]] <- gsub("'","", geno[[opt$gene.name.col]]) 
    cnd <- ReadConditionFile(opt$condition.file)
    if (nrow(cnd) > 0) {
      if ("loo maf" %in% cnd[,1]) {
        cat("Adding loo maf col...\n")
        geno <- AddLooMafCol(geno, length(samples))
      }
      geno <- TblCndSubset(geno, cnd)
    }
    qualvar <- geno[, c(opt$gene.name.col, opt$variant.id.col,
                        opt$sample.name.col, opt$genotype.col), drop=F]
    mat <- CollapsingMatrixRecode(mat, qualvar)                                    
    ## WORKFLOW : APPLY ALL THRESHOLDS DEFINED IN THRESHOLD FILE
    # threshold file format (each row): "col name" "relational operator" "val"
    # example1 : "ExAC fin maf" < 0.01
    # example2 : "Polyphen Humvar Prediction" in "possibly_damaging,probably_damaging"
  } else {
    cat("require a qualvar file or an ATAV genotypes CSV. Exiting...\n")
    q()
  }

  # write matrix to output file
  CollapsingMatrixWrite(mat, opt$collapsing.matrix.out)

}

GetOptList <- function() {
  # Get option list.
  #
  # Returns:
  #   Option list to parse options.
  option.list <- list(
    make_option(c("--sampleped-file"), action="store", type="character",
                default=NULL, dest="sampleped.file", 
                help="name of required sample file, [default %default]"),

    make_option(c("--geneset-file"), action="store", 
                type="character",      
                default=NULL, 
                dest="geneset.file",                          
                help="name of required geneset file, [default %default]"), 

    make_option(c("--collapsing-matrix-out"), 
                action="store", type="character",
                default=NULL, dest="collapsing.matrix.out",
                help="name of required output collapsing 
                      matrix file, [default %default]"),

    make_option(c("--qualvar"), action="store", type="character",
                default=NULL, dest="qualvar",
                help="name of .qualvar file, [default %default]"),

    make_option(c("--genotypes-csv"), action="store", type="character",
                default=NULL, dest="genotypes.csv",
                help="name of genotypes csv generated by ATAV, 
                      if no qualvar file provided [default %default]"),

    make_option(c("--condition-file"), action="store", type="character",
                default=NULL, dest="condition.file",
                help="name of condition filename to subset genotype csv on,
                      if no qualvar file is provided [default %default]"),

    make_option(c("--gene-name-col"), action="store", type="character",
                default="Gene Name", dest="gene.name.col",
                help="Gene name column to subset genotype csv on,
                      if no qualvar file is provided [default %default]"),

    make_option(c("--variant-id-col"), action="store", type="character",
                default="Variant ID", dest="variant.id.col",
                help="variant ID column to subset genotype csv on,
                      if no qualvar file is provided [default %default]"),

    make_option(c("--sample-name-col"), action="store", type="character",
                default="Sample Name", dest="sample.name.col",
                help="Sample name column to subset genotype csv on,
                      if no qualvar file is provided [default %default]"),

    make_option(c("--genotype-col"), action="store", type="character",
                default="Genotype", dest="genotype.col",
                help="Sample name column to subset genotype csv on,
                      if no qualvar file is provided [default %default]")

  )

  return(option.list)
}

if (interactive() == F) {
  Main()
}
Halvee/rvatk documentation built on May 6, 2019, 10:55 p.m.