########################################
### GCT class and method definitions ###
########################################
#' An S4 Class to Represent a GCT Object
#' @name GCT object
#' @slot mat a numeric matrix
#' @slot rid a character vector of row ids
#' @slot cid a character vector of column ids
#' @slot rdesc a \code{data.frame} of row descriptors
#' @slot rdesc a \code{data.frame} of column descriptors
#' @slot src a character indicating the source (usually file path)
#' of the data
#' @description The GCT class serves to represent annotated
#' matrices. The \code{mat} slot contains the numeric matrix data and the
#' \code{rdesc} and \code{cdesc} slots contain data frames with
#' annotations about the rows and columns, respectively
#'
#' @seealso \code{\link{parse_gctx}}
methods::setClass("GCT",
methods::representation(
mat = "matrix",
rid = "character",
cid = "character",
rdesc = "data.frame",
cdesc = "data.frame",
version = "character",
src = "character"
)
)
# set up methods for checking GCT validity
methods::setValidity("GCT",
function(object) {
# check whether dimensions of various
# slots are in sync
nrows <- nrow(object@mat)
ncols <- ncol(object@mat)
if (nrows != length(object@rid)) {
return("rid must be the same length as number of matrix rows")
}
if (ncols != length(object@cid)) {
return("cid must be the same length as number of matrix columns")
}
if (length(object@cid) > length(unique(object@cid))) {
return("cid must be unique")
}
if (length(object@rid) > length(unique(object@rid))) {
return("rid must be unique")
}
if (nrow(object@cdesc) != ncols & nrow(object@cdesc) != 0) {
return("cdesc must either have 0 rows or the same number of rows as
matrix has columns")
}
if (nrow(object@rdesc) != nrows & nrow(object@rdesc) != 0) {
return("rdesc must either have 0 rows or the same number of rows as
matrix has rows")
}
else {
return(TRUE)
}
}
)
#### define some helper methods for parsing gctx files ###
# Adjust the data types for columns of a meta data frame
fix.datatypes <- function(meta) {
for (field.name in names(meta)) {
# get the field values
field <- meta[[field.name]]
# check if it's numeric. data may come in as a string
# but actually contains numeric values. if so, as.numeric
# will not result in a vector of NA values
field.as.numeric <- suppressWarnings(as.numeric(field))
if (!any(is.na(field.as.numeric))) {
field <- field.as.numeric
}
if (is.numeric(field)) {
# check if it's an integer. data may be floats but
# if we coerce to an integer and the difference from
# original values is zero, that means data are actually
# integers. integer conversion will return NA if there
# are any issues.
field.as.integer <- suppressWarnings(as.integer(field))
if (!any(is.na(field.as.integer))) {
# integer conversion was fine, lets see if the
# values are altered
diffs <- field - field.as.integer
if (all(diffs == 0)) {
# converting to integer didn't change value,
# set field to integer values
field <- field.as.integer
}
}
}
# insert back into the annotations
meta[[field.name]] <- field
}
return(meta)
}
# Parse row or column metadata from GCTX files
read.gctx.meta <- function(gctx_path, dimension="row", ids=NULL,
set_annot_rownames=TRUE) {
if (!file.exists(gctx_path)) {
stop(paste(gctx_path, "does not exist"))
}
if (dimension=="column") dimension <- "col"
if (!(dimension %in% c("row", "col"))) {
stop("dimension can be either row or col")
}
if (dimension == "row") {
name <- "0/META/ROW"
} else {
name <- "0/META/COL"
}
raw_annots <- rhdf5::h5read(gctx_path, name=name) # returns a list
fields <- names(raw_annots)
# define an empty data frame of the correct dimensions
annots <- data.frame(matrix(nrow=length(raw_annots[[fields[1]]]),
ncol=length(fields)))
names(annots) <- fields
# loop through each field and fill the annots data.frame
for (i in seq_along(fields)) {
field <- fields[i]
# remove any trailing spaces
# and cast as vector
annots[,i] <- as.vector(gsub("\\s*$", "", raw_annots[[field]], perl=TRUE))
}
annots <- fix.datatypes(annots)
# subset to the provided set of ids, if given
if (is.null(ids)) {
ids <- as.character(annots$id)
} else {
ids <- ids
}
# make sure annots row ordering matches that of ids
annots <- subset_to_ids(annots, ids)
annots$id <- as.character(annots$id)
# use the id field to set the rownames
if (set_annot_rownames) {
rownames(annots) <- annots$id
}
return(annots)
}
# Read GCTX row or column ids
read.gctx.ids <- function(gctx_path, dimension="row") {
if (!file.exists(gctx_path)) {
stop(paste(gctx_path, "does not exist"))
}
if (dimension=="column") dimension <- "col"
if (!(dimension %in% c("row", "col"))) {
stop("dimension can be either row or col")
}
if (dimension == "row") {
name <- "0/META/ROW/id"
} else {
name <- "0/META/COL/id"
}
# remove any spaces
ids <- gsub("\\s*$", "", rhdf5::h5read(gctx_path, name=name), perl=TRUE)
# cast as character
ids <- as.character(ids)
return(ids)
}
# Return a subset of requested GCTX row/colum ids
# out of the universe of all ids
process_ids <- function(ids, all_ids, type="rid") {
if (!is.null(ids)) {
if (is.numeric(ids)) {
# is it numeric?
idx <- ids
is_invalid_idx <- (idx > length(all_ids)) | (idx <= 0)
invalid_idx <- idx[is_invalid_idx]
if (all(is_invalid_idx)) {
stop(paste("none of the requested", type,
"indices were found in the dataset"))
}
if (any(is_invalid_idx)) {
# requested indices are outside of the possible range
warning(paste("the following ", type,
" were are outside possible range and will be ignored:\n",
paste(invalid_idx, collapse="\n"), sep=""))
}
idx <- idx[!is_invalid_idx]
} else {
# assume its a character
idx <- match(ids, all_ids)
if (all(is.na(idx))) {
stop(paste("none of the requested", type, "were found in the dataset"))
}
if (any(is.na(idx))) {
ids_not_found <- ids[is.na(idx)]
warning(paste("the following ", type,
" were not found and will be ignored:\n",
paste(ids_not_found, collapse="\n"), sep=""))
}
idx <- idx[!is.na(idx)]
}
} else {
# ids were null, just return an index vector
# allong all_ids
idx <- seq_along(all_ids)
}
# subset the character ids to the ones we want
id_keep <- as.character(all_ids[idx])
return(list(idx=idx, ids=id_keep))
}
# define the initialization method for the GCT class
methods::setMethod("initialize",
signature = "GCT",
definition = function(.Object, mat=NULL, rdesc=NULL, cdesc=NULL,
src=NULL, rid=NULL, cid=NULL,
set_annot_rownames=FALSE,
matrix_only=FALSE) {
# if we were supplied a matrix and annotations, use them
if (!is.null(mat)) {
.Object@mat <- mat
# if given rid and cid, use those as well
if (!is.null(rid)) {
.Object@rid <- rid
} else {
.Object@rid <- rownames(mat)
}
if (!is.null(cid)) {
.Object@cid <- cid
} else {
.Object@cid <- colnames(mat)
}
}
if (!is.null(rdesc)) {
.Object@rdesc <- rdesc
}
if (!is.null(cdesc)) {
.Object@cdesc <- cdesc
} else if (!is.null(src)) {
# we were not given a matrix, were we given a src file?
# check to make sure it's either .gct or .gctx
if (! (grepl(".gct$", src) || grepl(".gctx$", src) ))
stop("Either a .gct or .gctx file must be given")
if (grepl(".gct$", src)) {
if ( ! is.null(rid) || !is.null(cid) )
warning(paste("rid and cid values may only be given for
.gctx files, not .gct files\n",
"ignoring"))
# parse the .gct
.Object@src = src
# get the .gct version by reading first line
.Object@version = scan(src, what = "", nlines = 1, sep = "\t",
quiet = TRUE)[1]
# get matrix dimensions by reading second line
dimensions = scan(src, what = double(0), nlines = 1, skip = 1,
sep = "\t", quiet = TRUE)
nrmat = dimensions[1]
ncmat = dimensions[2]
if (length(dimensions)==4) {
# a #1.3 file
message("parsing as GCT v1.3")
nrhd <- dimensions[3]
nchd <- dimensions[4]
} else {
# a #1.2 file
message("parsing as GCT v1.2")
nrhd <- 0
nchd <- 0
}
message(paste(src, nrmat, "rows,", ncmat, "cols,", nrhd,
"row descriptors,", nchd, "col descriptors"))
# read in header line
header = scan(src, what = "", nlines = 1, skip = 2, sep = "\t",
quote = NULL, quiet = TRUE)
# construct row header and column id's from the header line
if ( nrhd > 0 ) {
rhd <- header[2:(nrhd+1)]
cid <- header[-(nrhd+1):-1]
col_offset <- 1
}
else {
if (any(grepl("description", header, ignore.case=T))) {
# check for presence of description column in v1.2 files
col_offset <- 2
} else {
col_offset <- col_offset <- 1
}
rhd = NULL
cid = header[(1+col_offset):length(header)]
}
# read in the next set of headers (column annotations) and
# shape into a matrix
if ( nchd > 0 ) {
header = scan(src, what = "", nlines = nchd, skip = 3,
sep = "\t", quote = NULL, quiet = TRUE)
header = matrix(header, nrow = nchd,
ncol = ncmat + nrhd + 1, byrow = TRUE)
# extract the column header and column descriptions
chd = header[,1]
cdesc = header[,-(nrhd+1):-1]
# need to transpose in the case where there's only one column
# annotation
if ( nchd == 1 )
cdesc = t(cdesc)
}
else {
chd = NULL
cdesc = data.frame()
}
# read in the data matrix and row descriptions, shape into a
# matrix
mat = scan(src, what = "", nlines = nrmat, skip = 3 + nchd,
sep = "\t", quote = NULL, quiet = TRUE)
mat = matrix(mat, nrow = nrmat,
ncol = ncmat + nrhd + col_offset,
byrow = TRUE)
# message(paste(dim(mat), collapse="\t"))
# Extract the row id's row descriptions, and the data matrix
rid = mat[,1]
if ( nrhd > 0 ) {
# need as.matrix for the case where there's only one row
# annotation
rdesc = as.matrix(mat[,2:(nrhd + 1)])
mat = matrix(as.numeric(mat[,-(nrhd + 1):-1]),
nrow = nrmat, ncol = ncmat)
}
else {
rdesc = data.frame()
mat = matrix(as.numeric(mat[, (1+col_offset):ncol(mat)]),
nrow = nrmat, ncol = ncmat)
}
# assign names to the data matrix and the row and column
# descriptions
# message(paste(dim(mat), collapse="\t"))
dimnames(mat) = list(rid, cid)
if ( nrhd > 0 ) {
dimnames(rdesc) = list(rid,rhd)
rdesc = as.data.frame(rdesc, stringsAsFactors = FALSE)
}
if ( nchd > 0 ) {
cdesc = t(cdesc)
dimnames(cdesc) = list(cid,chd)
cdesc = as.data.frame(cdesc, stringsAsFactors = FALSE)
}
# assign to the GCT slots
.Object@mat = mat
.Object@rid = rownames(mat)
.Object@cid = colnames(mat)
if (!matrix_only) {
# return annotations as well as matrix
.Object@rdesc = fix.datatypes(rdesc)
.Object@cdesc = fix.datatypes(cdesc)
# add id columns to rdesc and cdesc
.Object@rdesc$id <- rownames(.Object@rdesc)
.Object@cdesc$id <- rownames(.Object@cdesc)
}
}
else {
# parse the .gctx
message(paste("reading", src))
.Object@src = src
# get all the row and column ids
all_rid <- read.gctx.ids(src, dimension="row")
all_cid <- read.gctx.ids(src, dimension="col")
# if rid or cid specified, read only those rows/columns
# if already numeric, use as is
# else convert to numeric indices
processed_rids <- process_ids(rid, all_rid, type="rid")
processed_cids <- process_ids(cid, all_cid, type="cid")
# read the data matrix
.Object@mat <- rhdf5::h5read(src, name="0/DATA/0/matrix",
index=list(processed_rids$idx,
processed_cids$idx))
# set the row and column ids, casting as characters
.Object@rid <- processed_rids$ids
.Object@cid <- processed_cids$ids
rownames(.Object@mat) <- processed_rids$ids
colnames(.Object@mat) <- processed_cids$ids
# get the meta data
if (!matrix_only) {
.Object@rdesc <- read.gctx.meta(src, dimension="row",
ids=processed_rids$ids,
set_annot_rownames=set_annot_rownames)
.Object@cdesc <- read.gctx.meta(src, dimension="col",
ids=processed_cids$ids,
set_annot_rownames=set_annot_rownames)
}
else {
.Object@rdesc <- data.frame(id=.Object@rid,
stringsAsFactors = FALSE)
.Object@cdesc <- data.frame(id=.Object@cid,
stringsAsFactors = FALSE)
}
# close any open handles and return the object
if(utils::packageVersion('rhdf5') < "2.23.0") {
rhdf5::H5close()
} else {
rhdf5::h5closeAll()
}
message("done")
}
}
# finally, make sure object is valid before returning
ok <- methods::validObject(.Object)
return(.Object)
}
)
#' Parse a GCTX file into the R workspace as a GCT object
#' @title Parse GCTX
#' @param fname character(1), path to the GCTX file on disk
#' @param rid either a vector of character or integer
#' row indices or a path to a grp file containing character
#' row indices. Only these indices will be parsed from the
#' file.
#' @param cid either a vector of character or integer
#' column indices or a path to a grp file containing character
#' column indices. Only these indices will be parsed from the
#' file.
#' @param set_annot_rownames boolean indicating whether to set the
#' rownames on the row/column metadata data.frames. Set this to
#' false if the GCTX file has duplicate row/column ids.
#' @param matrix_only boolean indicating whether to parse only
#' the matrix (ignoring row and column annotations)
#' @return gct object
#' @family GCTX parsing functions
#' @examples
#' gctx <- system.file("extdata", "test_sample_n2x12328.gctx",
#' package="signatureSearch")
#' gct <- parse_gctx(gctx)
#' @export
parse_gctx <- function(fname, rid=NULL, cid=NULL, set_annot_rownames=FALSE,
matrix_only=FALSE) {
ds <- methods::new("GCT",
src = fname,
rid = rid,
cid = cid,
set_annot_rownames = set_annot_rownames,
matrix_only = matrix_only)
return(ds)
}
# Do a robust \code{\link{data.frame}} subset to a set of ids
subset_to_ids <- function(df, ids) {
# helper function to do a robust df subset
check_colnames("id", df)
newdf <- data.frame(df[match(ids, df$id), ])
names(newdf) <- names(df)
return(newdf)
}
# Check whether \code{test_names} are columns in the \code{\link{data.frame}} df
check_colnames <- function(test_names, df, throw_error=TRUE) {
# check whether test_names are valid names in df
# throw error if specified
diffs <- setdiff(test_names, names(df))
if (length(diffs) > 0) {
if (throw_error) {
stop(paste("the following column names are not found in",
deparse(substitute(df)), ":",
paste(diffs, collapse=" "), "\n"))
} else {
return(FALSE)
}
} else {
return(TRUE)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.