R/GMS-class.R
In GenVisR: Genomic Visualizations in R

Documented in GMS

################################################################################
##################### Public/Private Class Definitions #########################

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Public Class !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#

#' Class GMS
#' 
#' An S4 class for Genome Modeling System annotation files, under development!!!
#' @name GMS-class
#' @rdname GMS-class
#' @slot path Character string specifying the paths of the GMS files read in.
#' @slot version Numeric value specifying the version of the GMS annotation files.
#' @slot gmsObject gms object which inherits from gms_Virtual class.
#' @exportClass GMS
#' @include GMS_Virtual-class.R
#' @import methods
setClass("GMS",
         representation=representation(path="character",
                                       version="numeric",
                                       gmsObject="GMS_Virtual"))

#' Constructor for the GMS container class.
#' 
#' @name GMS
#' @rdname GMS-class
#' @param path String specifying the path to a GMS annotation file. Can accept
#' wildcards if multiple GMS annotation files exist (see details).
#' @param data data.table object storing a GMS annotation file. Overrides "path"
#' if specified.
#' @param version String specifying the version of the GMS files, Defaults to
#' version 4.
#' @param verbose Boolean specifying if progress should be reported while
#' reading in the GMS files.
#' @details When specifying a path to a GMS annotation file the option exist to
#' either specify the full path to an annotation file or to use wildcards to
#' specify multiple files. When specifying a full path the initalizer will check
#' if a column named "sample" containg the relevant sample for each row exists.
#' If such a column is not found the initalizer will assume this file
#' corresponds to only one sample and populate a sample column accordingly.
#' Alternatively if multiple files are specified at once using a wildcard, the
#' initalizer will aggregate all the files and use the file names minus any
#' extension top populate sample names.
#' The version defaults to 4 which is the default value of the GMS annotator.
#' This value will need to be changed only if files were created using a
#' different GMS annotator version.
#' @seealso \code{\link{Waterfall}}
#' @seealso \code{\link{MutSpectra}}
#' @importFrom data.table fread
#' @importFrom data.table rbindlist
#' @importFrom data.table is.data.table
#' @importFrom data.table as.data.table
#' @export
GMS <- function(path, data=NULL, version=4, verbose=FALSE){
    
    message("This function is part of the new S4 feature and is under active development")
    
    # assign the annotator version
    version <- version
    
    # if data is null read from path variable
    if(is.null(data)){
        # Grab all files and assign to slot
        gmsFiles <- Sys.glob(path)
        path <- gmsFiles
        
        # anonymous function to read in files
        a <- function(x, verbose){
            # detect OS and remove slashes and extension
            if(.Platform$OS.type == "windows"){
                sampleName <- gsub("(.*/)||(.*\\\\)", "", x)
                sampleName <- gsub("\\.[^.]+$", "", x)
            } else {
                sampleName <- gsub("(.*/)", "", x)
                sampleName <- gsub("\\.[^.]+$", "", sampleName)
            }
            # read data
            gmsData <- suppressWarnings(data.table::fread(input=x,
                                                          stringsAsFactors=TRUE,
                                                          verbose=verbose))
            # set sample if it's not already in the data table
            if(any(colnames(gmsData) %in% "sample")){
                return(gmsData)
            } else {
                gmsData$sample <- sampleName
                return(gmsData)
            }
        }
        
        # aggregate data into a single data table if necessary
        if(length(gmsFiles) == 0){
            memo <- paste("No files found using:", path)
            stop(memo)
        } else {
            gmsData <- lapply(gmsFiles, a, verbose)
            gmsData <- data.table::rbindlist(gmsData)
        }
    } else if(is.data.table(data)){
        path <- "none"
        gmsData <- data
    } else {
        memo <- paste("data is not of class data.table,",
                      "attempting to coerce")
        warning(memo)
        path <- "none"
        gmsData <- data.table::as.data.table(data)
    }
    
    # assign the gmsData to it's slot
    if(version == 4){
        gmsObject <- GMS_v4(gmsData=gmsData)
    } else {
        memo <- paste("Currently only GMS version 4 is supported, make a",
                      "feature request on",
                      "https://github.com/griffithlab/GenVisR!")
        stop(memo)
    }
    
    new("GMS", path=path, gmsObject=gmsObject, version=version)
}

################################################################################
###################### Accessor function definitions ###########################

#' @rdname writeData-methods
#' @aliases writeData
setMethod(f="writeData",
          signature="GMS",
          definition=function(object, file, ...){
              writeData(object@gmsObject, file, sep="\t")
          })

#' @rdname getVersion-methods
#' @aliases getVersion
setMethod(f="getVersion",
          signature="GMS",
          definition=function(object, ...){
              version <- object@version
              return(version)
          })

#' @rdname getPath-methods
#' @aliases getPath
setMethod(f="getPath",
          signature="GMS",
          definition=function(object, ...){
              path <- object@path
              return(path)
          })

#' @rdname getPosition-methods
#' @aliases getPosition
setMethod(f="getPosition",
          signature="GMS",
          definition=function(object, ...){
              positions <- getPosition(object@gmsObject)
              return(positions)
          })

#' @rdname getMutation-methods
#' @aliases getMutation
setMethod(f="getMutation",
          signature="GMS",
          definition=function(object, ...){
              mutations <- getMutation(object@gmsObject)
              return(mutations)
          })

#' @rdname getSample-methods
#' @aliases getSample
setMethod(f="getSample",
          signature="GMS",
          definition=function(object, ...){
              sample <- getSample(object@gmsObject)
              return(sample)
          })

#' @rdname getMeta-methods
#' @aliases getMeta
setMethod(
    f="getMeta",
    signature="GMS",
    definition=function(object, ...){
        meta <- getMeta(object@gmsObject)
        return(meta)
    }
)

################################################################################
####################### Method function definitions ############################

#' @rdname toWaterfall-methods
#' @aliases toWaterfall
#' @noRd
setMethod(f="toWaterfall",
          signature="GMS",
          definition=function(object, hierarchy, labelColumn, verbose, ...){
              
              # print status message
              if(verbose){
                  memo <- paste("Converting", class(object),
                                "to expected waterfall format")
                  message(memo)
              }
              
              # grab the sample, mutation, gene columns and set a label
              sample <- getSample(object)
              mutation <- getMutation(object)[,"trv_type"]
              gene <- getMeta(object)[,"gene_name"]
              label <- NA
              labelFlag <- TRUE
              
              # if a label column exists and is proper overwrite the label variable
              # if not change the flag
              if(!is.null(labelColumn)){
                  if(length(labelColumn) != 1) {
                      memo <- paste("Parameter \"labelColumn\" must be of length 1!",
                                    "Found length to be", length(labelColumn))
                      warning(memo)
                      labelFlag <- FALSE
                  }
                  
                  if(any(!labelColumn %in% colnames(getMeta(object)))){
                      memo <- paste("Did not find column:", labelColumn,
                                    "in the meta slot of the vepObject! Valid",
                                    "names are:", toString(colnames(getMeta(object))))
                      warning(memo)
                      labelFlag <- FALSE
                  }
                  
                  if(labelFlag){
                      label <- getMeta(object)[,labelColumn, with=FALSE]
                  }
              }
              
              # combine all columns into a consistent format
              waterfallFormat <- cbind(sample, gene, mutation, label)
              colnames(waterfallFormat) <- c("sample", "gene", "mutation", "label")
              
              # make a temporary ID column for genomic features to collapse on
              # this will ensure the mutation burden/frequency plot will be accurate
              waterfallFormat$key <- paste0(getPosition(object)$chromosome_name, ":",
                                            getPosition(object)$start, ":",
                                            getPosition(object)$stop, ":",
                                            getPosition(object)$reference, ":",
                                            getPosition(object)$variant, ":",
                                            getSample(object)$sample)
              rowCountOrig <- nrow(waterfallFormat)

              # order the data based on the mutation hierarchy,
              # remove all duplicates based on key, and remove the key column
              waterfallFormat$mutation <- factor(waterfallFormat$mutation, levels=hierarchy$mutation)
              waterfallFormat <- waterfallFormat[order(waterfallFormat$mutation),]
              waterfallFormat <- waterfallFormat[!duplicated(waterfallFormat$key),]
              waterfallFormat[,key:=NULL]

              # print status message
              if(verbose){
                  memo <- paste("Removed", rowCountOrig - nrow(waterfallFormat),
                                "rows from the data which harbored duplicate",
                                "genomic locations")
                  message(memo)
              }
              
              # remove any rows with NA values in gene or sample, this could mess up downstream functions
              rowCountOrig <- nrow(waterfallFormat)
              waterfallFormat <- waterfallFormat[!is.na(waterfallFormat$gene),]
              waterfallFormat <- waterfallFormat[!is.na(waterfallFormat$sample),]
              if(verbose & rowCountOrig != nrow(waterfallFormat)){
                  memo <- paste("Removed", rowCountOrig-nrow(waterfallFormat), "rows which harbored NA values",
                                "in either the gene or sample column")
                  message(memo)
              }
              
              # convert appropriate columns to factor
              waterfallFormat$sample <- factor(waterfallFormat$sample)
              
              return(waterfallFormat)
          })

#' @rdname setMutationHierarchy-methods
#' @aliases setMutationHierarchy
#' @noRd
#' @importFrom data.table data.table
#' @importFrom data.table setDT
#' @importFrom data.table rbindlist
#' @importFrom grDevices colors
setMethod(f="setMutationHierarchy",
          signature="GMS",
          definition=function(object, mutationHierarchy, verbose, ...){
              # set the mutation hierarchy if a custom hierarchy is unspecified
              if(is.null(mutationHierarchy)){
                  mutationHierarchy$mutation <- c("nonsense", "frame_shift_del",
                                                  "frame_shift_ins", "splice_site_del",
                                                  "splice_site_ins", "splice_site",
                                                  "nonstop", "in_frame_del", "in_frame_ins",
                                                  "missense", "splice_region_del",
                                                  "splice_region_ins", "splice_region",
                                                  "5_prime_flanking_region", 
                                                  "3_prime_flanking_region", 
                                                  "3_prime_untranslated_region",
                                                  "5_prime_unstranslated_region",
                                                  "rna", "intronic", "silent")
                  
                  mutationHierarchy$color <- c('#4f00A8', '#A80100', '#CF5A59',
                                               '#A80079', '#BC2D94', '#CF59AE',
                                               '#000000', '#006666', '#00A8A8',
                                               '#009933', '#ace7b9', '#cdf0d5',
                                               '#59CF74', '#002AA8', '#5977CF',
                                               '#F37812', '#F2B079', '#888811',
                                               '#FDF31C', '#8C8C8C')
                  mutationHierarchy <- data.table::data.table("mutation"=mutationHierarchy$mutation,
                                                              "color"=mutationHierarchy$color)
              }
              
              # check that mutationHiearchy is a data table
              if(!is(mutationHierarchy, "data.table")){
                  memo <- paste("mutationHiearchy is not an object of class",
                                "data.table, attempting to coerce.")
                  warning(memo)
                  mutationHierarchy <- data.table::setDT(mutationHierarchy)
              }
              
              # check for the correct columns
              if(!all(colnames(mutationHierarchy) %in% c("mutation", "color"))){
                  missingCol <- colnames(mutationHierarchy)[!c("mutation", "color") %in% colnames(mutationHierarchy)]
                  memo <- paste("The correct columns were not found in",
                                "mutationHierarchy, missing", toString(missingCol))
                  stop(memo)
              }
              
              # check that all mutations are specified
              if(!all(object@gmsObject@mutation$trv_type %in% mutationHierarchy$mutation)){
                  missingMutations <- unique(object@gmsObject@mutation$trv_type[!object@gmsObject@mutation$trv_type %in% mutationHierarchy$mutation])
                  memo <- paste("The following mutations were found in the",
                                "input however were not specified in the",
                                "mutationHierarchy!", toString(missingMutations),
                                "adding these in as least important and",
                                "assigning random colors!")
                  warning(memo)
                  newCol <- grDevices::colors(distinct=TRUE)[!grepl("^gray", grDevices::colors(distinct=TRUE))]
                  tmp <- data.table::data.table("mutation"=missingMutations,
                                                "color"=sample(newCol, length(missingMutations)))
                  mutationHierarchy <- data.table::rbindlist(list(mutationHierarchy, tmp), use.names=TRUE, fill=TRUE)
              }
              
              # add in a pretty print mutation labels
              mutationHierarchy$label <- gsub("_", " ", mutationHierarchy$mutation)
              mutationHierarchy$label <-  gsub("'", "' ", mutationHierarchy$label)
              
              # check for duplicate mutations
              if(any(duplicated(mutationHierarchy$mutation))){
                  duplicateMut <- mutationHierarchy[duplicated(mutationHierarchy$mutation),"mutation"]
                  memo <- paste("The mutation type",toString(duplicateMut),
                                "was duplicated in the supplied mutationHierarchy!")
                  mutationHierarchy <- mutationHierarchy[!duplicated(mutationHierarchy$mutation),]
              }
              
              # ensure columns are of the proper type
              mutationHierarchy$color <- as.character(mutationHierarchy$color)
              mutationHierarchy$mutation <- as.character(mutationHierarchy$mutation)
              
              # print status message
              if(verbose){
                  memo <- paste("Setting the hierarchy of mutations from most",
                                "to least deleterious and mapping to colors:",
                                toString(mutationHierarchy$mutation))
                  message(memo)
              }
              
              return(mutationHierarchy)
          })

#' @rdname toMutSpectra-methods
#' @aliases toMutSpectra
#' @param object Object of class GMS
#' @param verbose Boolean specifying if status messages should be reported
#' @importFrom data.table rbindlist
#' @noRd
setMethod(f="toMutSpectra",
          signature="GMS",
          definition=function(object, verbose, ...){
              
              # print status message
              if(verbose){
                  memo <- paste("Converting", class(object),
                                "to expected MutSpectra format")
                  message(memo)
              }
              
              # get an index of only the snvs
              snvIndex <- which(object@gmsObject@meta$type == "SNP")
              
              # status message
              if(verbose){
                  memo <- paste("Removing", nrow(object@gmsObject@sample)-length(snvIndex),
                                "entries which are not of class SNP!")
                  message(memo)
              }
              
              # grab the sample, mutation, position columns
              sample <- object@gmsObject@sample[snvIndex]
              variantAllele <- object@gmsObject@mutation[snvIndex,"variant"]
              refAllele <- object@gmsObject@mutation[snvIndex,"reference"]
              chr <- object@gmsObject@position[snvIndex,"chromosome_name"]
              start <- object@gmsObject@position[snvIndex,"start"]
              stop <- object@gmsObject@position[snvIndex,"stop"]
              
              # combine all columns into a consistent format and remove duplicate variants
              mutSpectraFormat <- cbind(sample, chr, start, stop, refAllele, variantAllele)
              colnames(mutSpectraFormat) <- c("sample", "chromosome", "start", "stop", "refAllele", "variantAllele")
              
              # unique, to make sure no duplicate variants exist to throw off the counts
              rowCountOrig <- nrow(mutSpectraFormat)
              mutSpectraFormat <- unique(mutSpectraFormat)
              
              # print status message
              if(verbose){
                  memo <- paste("Removed", rowCountOrig - nrow(mutSpectraFormat),
                                "rows from the data which harbored duplicate",
                                "genomic locations")
                  message(memo)
              }
              
              # Remove cases where there is not change between reference and variant
              mutSpectraFormat$refAllele <- as.character(mutSpectraFormat$refAllele)
              mutSpectraFormat$variantAllele <- as.character(mutSpectraFormat$variantAllele)
              alleleMatchIndex <- mutSpectraFormat$refAllele == mutSpectraFormat$variantAllele
              mutSpectraFormat <- mutSpectraFormat[!alleleMatchIndex,]
              if(verbose){
                  memo <- paste("Removed", length(alleleMatchIndex), "entries",
                                "where the reference allele matched the tumor allele")
                  message(memo)
              }
              
              # convert appropriate columns to factor
              mutSpectraFormat$sample <- factor(mutSpectraFormat$sample)
              
              return(mutSpectraFormat)
          })

#' @rdname toRainfall-methods
#' @aliases toRainfall
#' @param object Object of class GMS
#' @param verbose Boolean specifying if status messages should be reported
#' @noRd
setMethod(f="toRainfall",
          signature="GMS",
          definition=function(object, verbose, ...){
              
              # print status message
              if(verbose){
                  memo <- paste("converting", class(object), "to expected Rainfall format")
                  message(memo)
              }
              
              # grab the sample, position, and mutation columns
              sample <- getSample(object)
              chromosome <- getPosition(object)$chromosome_name
              start <- getPosition(object)$start
              stop <- getPosition(object)$stop
              refAllele <- as.character(getMutation(object)$reference)
              variantAllele <- as.character(getMutation(object)$variant)
              
              # combine all the relevant data into a single data table
              rainfallFormat <- cbind(sample, chromosome, start, stop, refAllele, variantAllele)
              
              # remove cases where a mutation does not exist
              rowCountOrig <- nrow(rainfallFormat)
              rainfallFormat <- rainfallFormat[rainfallFormat$refAllele != rainfallFormat$variantAllele,]
              if(rowCountOrig != nrow(rainfallFormat)){
                  memo <- paste("Removed", rowCountOrig - nrow(rainfallFormat),
                                "entries where the reference matches the variant")
                  warning(memo)
              }
              
              # remove mutations at duplicate genomic mutation as this could artifically increase
              # the density of mutations
              rowCountOrig <- nrow(rainfallFormat)
              rainfallFormat <- rainfallFormat[!duplicated(rainfallFormat[,c("sample", "chromosome", "start", "stop")]),]
              if(rowCountOrig != nrow(rainfallFormat)){
                  memo <- paste("Removed", rowCountOrig - nrow(rainfallFormat),
                                "entries with duplicate genomic positions")
                  warning(memo)
              }
              
              # create a flag column for where these entries came from
              rainfallFormat$origin <- 'mutation'
              
              # make sure everything is of the proper type
              rainfallFormat$sample <- factor(rainfallFormat$sample, levels=unique(rainfallFormat$sample))
              rainfallFormat$chromosome <- factor(rainfallFormat$chromosome, levels=unique(rainfallFormat$chromosome))
              rainfallFormat$start <- as.integer(rainfallFormat$start)
              rainfallFormat$stop <- as.integer(rainfallFormat$stop)
              rainfallFormat$refAllele <- factor(rainfallFormat$refAllele, levels=unique(rainfallFormat$refAllele))
              rainfallFormat$variantAllele <- factor(rainfallFormat$variantAllele, levels=unique(rainfallFormat$variantAllele))
              
              # return the standardized format
              return(rainfallFormat)
          })

#' @rdname toLolliplot-methods
#' @aliases toLolliplot
#' @param object Object of class GMS
#' @param verbose Boolean specifying if status messages should be reported
#' @noRd
setMethod(f="toLolliplot",
          signature="GMS",
          definition=function(object, verbose, ...){
              
              # print status message
              if(verbose){
                  memo <- paste("converting", class(object), "to expected Lolliplot format")
                  message(memo)
              }
              
              # grab the core required columns
              chromosome <- getPosition(object)$chromosome_name
              start <- getPosition(object)$start
              stop <- getPosition(object)$stop
              reference <- getMutation(object)$reference
              variant <- getMutation(object)$variant
              sample <- getSample(object)
              gene <- getMeta(object)$ensembl_gene_id
              consequence <- getMutation(object)$trv_type
              
              # grab amino acid position columns if available
              transcript <- getMeta(object)$transcript_name
              aminoAcidChange <- getMeta(object)$amino_acid_change
              label <- getMeta(object)$amino_acid_change
              
              # if amino acid position is not present try and use the biomaRt method to get relevent data, this should never
              # happen with a GMS file though
              valueCheck <- c(is.null(transcript), is.null(aminoAcidChange))
              if(any(valueCheck)){
                  missingIndex <- which(valueCheck == TRUE)
                  memo <- paste("Could not find the follwing required fields:", toString(valueCheck[missingINdex]),
                                "unable to use file supplied protein positions, please contact developer to report error!")
                  warning(memo)
                  lolliplotFormat <- cbind(chromosome, start, stop, reference, variant, sample, gene, consequence)
                  colnames(lolliplotFormat) <- cbind("chromosome", "start", "stop", "reference", "variant", "sample", "gene", "consequence")
              } else {
                  # combine all the relevant data into a single data table
                  lolliplotFormat <- cbind(chromosome, start, stop, reference, variant, sample, gene, consequence, transcript, aminoAcidChange, label)
                  colnames(lolliplotFormat) <- c("chromosome", "start", "stop", "reference", "variant", "sample", "gene", "consequence", "transcript", "proteinCoord", "label")
              }
              
              return(lolliplotFormat)
              })