R/GCT.R

Defines functions GCT

Documented in GCT

########################################
### GCT class and method definitions ###
########################################

#' An S4 class to represent a GCT object
#' 
#' @slot mat a numeric matrix
#' @slot rid a character vector of row ids
#' @slot cid a character vector of column ids
#' @slot rdesc a \code{data.frame} of row descriptors
#' @slot rdesc a \code{data.frame} of column descriptors
#' @slot src a character indicating the source (usually file path) of the data
#' 
#' @description The GCT class serves to represent annotated
#'   matrices. The \code{mat} slot contains said data and the
#'   \code{rdesc} and \code{cdesc} slots contain data frames with
#'   annotations about the rows and columns, respectively
#'   
#' @seealso \code{\link{parse_gctx}},
#' \code{\link{write_gctx}}, \code{\link{read_gctx_meta}},
#' \code{\link{read_gctx_ids}}
#' @seealso visit \url{http://clue.io/help} for more information on the
#'   GCT format
methods::setClass("GCT",
                  methods::representation(
                    mat = "matrix",
                    rid = "character",
                    cid = "character",
                    rdesc = "data.frame",
                    cdesc = "data.frame",
                    version = "character",
                    src = "character"
                  )
)


## ----set up methods for checking GCT validity----
methods::setValidity("GCT",
                     function(object) {
                       # check whether dimensions of various
                       # slots are in sync
                       m <- mat(object)
                       rid <- ids(object)
                       cid <- ids(object, dim="column")
                       rdesc <- meta(object)
                       cdesc <- meta(object, dim="column")
                       nrows <- nrow(m)
                       ncols <- ncol(m)
                       if (nrows != length(rid)) {
                         return(
                      "rid must be the same length as number of matrix rows")
                       }
                       if (ncols != length(cid)) {
                         return(
                    "cid must be the same length as number of matrix columns")
                       }
                       if (any(duplicated(cid))) {
                         return("cid must be unique")
                       }
                       if (any(duplicated(rid))) {
                         return("rid must be unique")
                       }
                       if (nrow(cdesc) != ncols & nrow(cdesc) != 0) {
                         return(paste(
                           "cdesc must either have 0 rows or the",
                           "same number of rows as matrix has columns"))
                       }
                       if (nrow(rdesc) != nrows & nrow(rdesc) != 0) {
                         return(paste(
                           "rdesc must either have 0 rows or the same number",
                           "of rows as matrix has rows"))
                       }
                       else {
                         return(T)
                       }
                     }
)

## ----define the initialization method for the GCT class----
methods::setMethod("initialize",
                   signature = "GCT",
                   definition = function(.Object, mat=NULL, rdesc=NULL,
                                         cdesc=NULL,
                                         src=NULL, rid=NULL, cid=NULL,
                                         matrix_only=FALSE) {
                     # if we were supplied a matrix and annotations, use them
                     if (!is.null(mat)) {
                       .Object@mat <- mat
                       # if given rid and cid, use those as well
                       if (!is.null(rid)) {
                         .Object@rid <- rid
                       } else {
                         .Object@rid <- rownames(mat)
                       }
                       if (!is.null(cid)) {
                         .Object@cid <- cid
                       } else {
                         .Object@cid <- colnames(mat)
                       }
                       if (!is.null(rdesc)) {
                         .Object@rdesc <- rdesc
                       }
                       if (!is.null(cdesc)) {
                         .Object@cdesc <- cdesc
                       }
                       # make sure rid, cid and dimnames of mat in sync
                       dimnames(.Object@mat) <- list(.Object@rid, .Object@cid)
                     } else if (!is.null(src)) {
                       # we were not given a matrix, were we given a src file?
                       # check to make sure it's either .gct or .gctx
                       if (! (grepl(".gct$", src) || grepl(".gctx$", src) ))
                         stop("Either a .gct or .gctx file must be given")
                       if (grepl(".gct$", src)) {
                         if ( ! is.null(rid) || !is.null(cid) )
                           warning(
                             "rid and cid values may only be given for",
                             ".gctx files, ignoring")
                         # parse the .gct
                         .Object@src <- src
                         # get the .gct version by reading first line
                         .Object@version <- scan(src, what="", nlines=1,
                                                 sep="\t", quiet=TRUE)[1]
                         # get matrix dimensions by reading second line
                         dimensions <- scan(src, what=double(0), nlines=1,
                                            skip=1, sep="\t", quiet=TRUE)
                         nrmat <- dimensions[1]
                         ncmat <- dimensions[2]
                         if (length(dimensions) == 4) {
                           # a #1.3 file
                           message("parsing as GCT v1.3")
                           nrhd <- dimensions[3]
                           nchd <- dimensions[4]
                         } else {
                           # a #1.2 file
                           message("parsing as GCT v1.2")
                           nrhd <- 0
                           nchd <- 0
                         }
                         message(paste(
                           src, nrmat, "rows,", ncmat, "cols,", nrhd,
                           "row descriptors,", nchd, "col descriptors"))
                         # read in header line
                         header <- scan(src, what="", nlines=1, skip=2,
                                        sep="\t", quote=NULL, quiet=TRUE)
                         # construct row header and column id's from the
                         # header line
                         if ( nrhd > 0 ) {
                           rhd <- header[2:(nrhd+1)]
                           cid <- header[-(nrhd+1):-1]
                           col_offset <- 1
                         }
                         else {
                           if (any(grepl("description", header,
                                         ignore.case=T))) {
                             # check for presence of description column
                             # in v1.2 files
                             col_offset <- 2
                           } else {
                             col_offset <- 1
                           }
                           rhd <- NULL
                           cid <- header[(1+col_offset):length(header)]
                         }
                         # read in the next set of headers (column annotations)
                         # and shape into a matrix
                         if ( nchd > 0 ) {
                           header <- scan(src, what="", nlines=nchd,
                                          skip=3,
                                          sep="\t", quote=NULL, quiet=TRUE)		
                           header <- matrix(header, nrow=nchd, 
                                            ncol=ncmat + nrhd + 1, byrow=TRUE)
                           # extract the column header and column descriptions
                           chd <- header[,1]
                           cdesc <- header[,-(nrhd+1):-1]
                           # need to transpose in the case where there's
                           # only one column annotation
                           if ( nchd == 1 )
                             cdesc <- t(cdesc)
                         }
                         else {
                           chd = NULL
                           cdesc <- data.frame(id=cid)
                         }
                         # read in the data matrix and row descriptions
                         # shape into a matrix
                         m <- scan(src, what="", nlines=nrmat, 
                                     skip=3 + nchd, sep="\t", quote=NULL,
                                     quiet=TRUE)
                         m <- matrix(m, nrow=nrmat,
                                       ncol=ncmat + nrhd + col_offset, 
                                       byrow=TRUE)
                         # message(paste(dim(mat), collapse="\t"))
                         # Extract the row id's row descriptions,
                         # and the data matrix
                         rid <- m[,1]
                         if ( nrhd > 0 ) {
                           # need as.matrix for the case where there's
                           # only one row annotation
                           rdesc <- as.matrix(m[,2:(nrhd + 1)])
                           m <- matrix(as.numeric(m[,-(nrhd + 1):-1]),
                                         nrow=nrmat, ncol=ncmat)
                         }
                         else {
                           rdesc <- data.frame(id=rid)
                           m <- matrix(as.numeric(
                             m[, (1+col_offset):ncol(m)]),
                             nrow=nrmat, ncol=ncmat)
                         }
                         # assign names to the data matrix and the
                         # row and column descriptions
                         dimnames(m) <- list(rid, cid)
                         if ( nrhd > 0 ) {
                           dimnames(rdesc) <- list(rid, rhd)
                           rdesc <- as.data.frame(rdesc,
                                                  stringsAsFactors=FALSE)
                         }
                         if ( nchd > 0 ) {
                           cdesc <- t(cdesc)
                           dimnames(cdesc) <- list(cid, chd)
                           cdesc <- as.data.frame(cdesc,
                                                  stringsAsFactors=FALSE)
                         }
                         # assign to the GCT slots
                         .Object@mat <- m
                         .Object@rid <- rownames(m)
                         .Object@cid <- colnames(m)
                         if (!matrix_only) {
                           # return annotations as well as matrix
                           .Object@rdesc <- fix_datatypes(rdesc)
                           .Object@cdesc <- fix_datatypes(cdesc)
                           # add id columns to rdesc and cdesc
                           .Object@rdesc$id <- rownames(.Object@rdesc)
                           .Object@cdesc$id <- rownames(.Object@cdesc)
                         }
                       }
                       else { 
                         # parse the .gctx
                         message("reading ", src)
                         .Object@src <- src
                         # if the rid's or column id's are .grp files,
                         # read them in
                         if ( length(rid) == 1 && grepl(".grp$", rid) )
                           rid <- parse_grp(rid)
                         if ( length(cid) == 1 && grepl(".grp$", cid) )
                           cid <- parse_grp(cid)
                         # get all the row and column ids
                         all_rid <- read_gctx_ids(src, dim="row")
                         all_cid <- read_gctx_ids(src, dim="col")
                         # if rid or cid specified, read only those rows/columns
                         # if already numeric, use as is
                         # else convert to numeric indices
                         processed_rids <- process_ids(rid, all_rid, type="rid")
                         processed_cids <- process_ids(cid, all_cid, type="cid")
                         # read the data matrix
                         .Object@mat <-
                           rhdf5::h5read(
                             src, name="0/DATA/0/matrix",
                             index=list(processed_rids$idx, processed_cids$idx))
                         # set the row and column ids, casting as characters
                         .Object@rid <- processed_rids$ids
                         .Object@cid <- processed_cids$ids
                         rownames(.Object@mat) <- processed_rids$ids
                         colnames(.Object@mat) <- processed_cids$ids
                         # get the meta data
                         if (!matrix_only) {
                           .Object@rdesc <- read_gctx_meta(
                             src, dim="row",
                             ids=processed_rids$ids)
                           .Object@cdesc <- read_gctx_meta(
                             src, dim="col",
                             ids=processed_cids$ids)
                         }
                         else {
                           .Object@rdesc <- data.frame(id=.Object@rid,
                                                       stringsAsFactors=FALSE)
                           .Object@cdesc <- data.frame(id=.Object@cid,
                                                       stringsAsFactors=FALSE)
                         }
                         # close any open handles and return the object
                         if(utils::packageVersion('rhdf5') < "2.23.0") {
                           rhdf5::H5close()
                         } else {
                           rhdf5::h5closeAll()
                         }
                         message("done")
                       }
                     }
                     # finally, make sure object is valid before returning
                     ok <- methods::validObject(.Object)
                     return(.Object)
                   }
)

#' Initialize an object of class \code{GCT}
#' @param mat a matrix
#' @param rdesc a \code{data.frame} of row metadata
#' @param cdesc a \code{data.frame} of column metadata
#' @param src path to a GCT file to read
#' @param rid vector of character identifiers for rows
#' @param cid vector of character identifiers for columns
#' @param matrix_only logical indicating whether to read just the matrix
#'   data from \code{src}
#'   
#' @details 
#'   If \code{mat} is provided, \code{rid} and \code{cid} are treated as
#'   the row and column identifiers for the matrix and are assigned to the
#'   \code{rid} and \code{cid} slots of the \code{GCT} object.
#'   
#'   If \code{mat} is not provided but \code{src} is provided,
#'   \code{rid} and \code{cid} are treated as filters. Data will be read from
#'   the file path provided to \code{src} and will then be restricted to the
#'   character ids or integer indices provided to \code{rid} and \code{cid}.
#'   In a similar manner, \code{matrix_only} controls whether the
#'   row and column metadata are also read from the \code{src} file path.
#'   
#' @returns a \code{GCT} object
#' @examples 
#' # an empty object
#' (g <- GCT())
#' # with a matrix
#' # note we must specify row and column ids
#' (g <- GCT(mat=matrix(rnorm(100), nrow=10),
#'           rid=letters[1:10], cid=letters[1:10]))
#' # from file
#' gct_file <- system.file("extdata", "modzs_n25x50.gctx", package="cmapR")
#' (g <- GCT(src=gct_file))
#' @family GCTX parsing functions
#' @importFrom methods new
#' @export
GCT <- function(mat=NULL, rdesc=NULL, cdesc=NULL,
                src=NULL, rid=NULL, cid=NULL,
                matrix_only=FALSE) {
  methods::new("GCT", mat=mat, rdesc=rdesc, cdesc=cdesc,
      src=src, rid=rid, cid=cid, matrix_only=matrix_only)
}

###########################################
### accessor functions for GCT objects  ###
###########################################

# set method for displaying a GCT object
# just use the 'str' function to show its structure
setMethod("show", methods::signature("GCT"), function(object) {
  utils::str(object)
})

#' Extract or set the matrix of GCT object
#' @param g the GCT object
#' @param value a numeric matrix
#' @return a matrix
#' @examples 
#' # get the matrix
#' m <- mat(ds)
#' # set the matrix
#' mat(ds) <- matrix(0, nrow=nrow(m), ncol=ncol(m))
#' @family GCT accessor methods
#' @export
methods::setGeneric("mat", function(g) {
  standardGeneric("mat")
})
#' @rdname mat
methods::setMethod("mat", "GCT", function(g) g@mat)
#' @export
#' @rdname mat
methods::setGeneric("mat<-", function(g, value) {
  standardGeneric("mat<-")
})
#' @rdname mat
methods::setMethod("mat<-", "GCT", function(g, value) {
  g@mat <- value
  methods::validObject(g)
  return(g)
})

#' Extract the or set row or column ids of a GCT object
#' @param g the GCT object
#' @param dimension the dimension to extract/update ['row' or 'column']
#' @param value a character vector
#' @return a vector of row ids
#' @examples 
#' # extract rids
#' rids <- ids(ds)
#' # extract column ids
#' cids <- ids(ds, "column")
#' # set rids
#' ids(ds) <- as.character(1:length(rids))
#' # set cids
#' ids(ds, "column") <- as.character(1:length(cids))
#' @family GCT accessor methods
#' @export
methods::setGeneric("ids", function(g, dimension="row")  {
  standardGeneric("ids")
})
#' @rdname ids
methods::setMethod("ids", "GCT", function(g, dimension="row") {
  dimension <- tolower(dimension)
  if (dimension == "col") dimension <- "column"
  stopifnot(dimension %in% c("row", "column"))
  switch(dimension, row=g@rid, column=g@cid)
})
#' @export
#' @rdname ids
methods::setGeneric("ids<-", function(g, dimension="row", value)  {
  standardGeneric("ids<-")
})
#' @rdname ids
methods::setMethod("ids<-", "GCT", function(g, dimension="row", value) {
  dimension <- tolower(dimension)
  if (dimension == "col") dimension <- "column"
  stopifnot(dimension %in% c("row", "column"))
  if (dimension == "row") {
    g@rid <- value
  } else {
    g@cid <- value
  }
  methods::validObject(g)
  return(g)
})

#' Extract the or set metadata of a GCT object
#' @param g the GCT object
#' @param dimension the dimension to extract/update ['row' or 'column']
#' @param value a data.frame
#' @return a data.frame
#' @examples 
#' # extract rdesc
#' rdesc <- meta(ds)
#' # extract cdesc
#' cdesc <- meta(ds, dim="column")
#' # set rdesc
#' meta(ds) <- data.frame(x=sample(letters, nrow(rdesc), replace=TRUE))
#' # set cdesc
#' meta(ds, dim="column") <- data.frame(x=sample(letters, nrow(cdesc),
#'   replace=TRUE))
#' @family GCT accessor methods
#' @export
methods::setGeneric("meta", function(g, dimension="row")  {
  standardGeneric("meta")
})
#' @rdname meta
methods::setMethod("meta", "GCT", function(g, dimension="row") {
  dimension <- tolower(dimension)
  if (dimension == "col") dimension <- "column"
  stopifnot(dimension %in% c("row", "column"))
  switch(dimension, row=g@rdesc, column=g@cdesc)
})
#' @export
#' @rdname meta
methods::setGeneric("meta<-", function(g, dimension="row", value)  {
  standardGeneric("meta<-")
})
#' @rdname meta
methods::setMethod("meta<-", "GCT", function(g, dimension="row", value) {
  dimension <- tolower(dimension)
  if (dimension == "col") dimension <- "column"
  stopifnot(dimension %in% c("row", "column"))
  if (dimension == "row") {
    g@rdesc <- value
  } else {
    g@cdesc <- value
  }
  methods::validObject(g)
  return(g)
})


###########################################
###  cast GCT as SummarizedExperiment   ###
###########################################

#' as("GCT", "SummarizedExperiment")
#' 
#' Create SummarizedExperiment object from GCT object.
#' 
#' @examples
#'
#' se <- as(ds, "SummarizedExperiment")
#' 
#' @importFrom SummarizedExperiment SummarizedExperiment
#' @importFrom methods validObject
setAs("GCT", "SummarizedExperiment", function(from) {
  stopifnot(methods::validObject(from))
  SummarizedExperiment::SummarizedExperiment(
    assays = list(exprs = mat(from)), 
    colData = meta(from, dimension="column"),
    rowData = meta(from))
})

Try the cmapR package in your browser

Any scripts or data that you put into this service are public.

cmapR documentation built on Dec. 12, 2020, 2 a.m.