R/data_manipulation.R
In scmamp: Statistical Comparison of Multiple Algorithms in Multiple Problems

#' @title Expression based row filtering
#'
#' @export
#' @description This is a simple function to filter data based on an expression defined using the colum names
#' @param data A NAMED matrix or data frame to be filtered (column names are required).
#' @param condition A string indicating the condition that the row have to fulfill to be retained. The column names are used as variables in the condition (see examples bellow).
#' @param remove.cols Either a vector of column names or a vector of column indices to be removed from the result
#' @return The original data where the rows for which the condition is \code{FALSE} and the columns in the vector \code{remove.cols}  have been removed
#' @seealso \code{\link{summarizeData}}, \code{\link{writeTabular}} and the vignette \code{vignette(topic="Data_loading_and_manipulation", 
#' package="scmamp")}
#' @examples
#' data(data_gh_2008)
#' names(data.gh.2008)
#' filterData(data.gh.2008, condition="CN2 > 0.7 & Kernel < 0.7", remove.cols=1:2)
#' 
filterData <- function (data, condition="TRUE", remove.cols=NULL) {
  checkRow <- function (row) {
    # Extract columns as variables
    for (i in seq(along.with=row)) {
      assign(names(row)[i], row[i])
    }
    # Evaluate the condition
    cond <- eval(parse(text=condition))
    return(cond)
  }
  
  # Generate the subset of rows
  sub <- apply(data, MARGIN=1, FUN=checkRow)
  
  ## Generate the colums to select
  if (is.character(remove.cols)) {
    id.retain <- which(!(colnames(data) %in% remove.cols))
  } else {
    id.retain <- which(!(1:ncol(data) %in% remove.cols))
  }
  
  # In case there are indices out of range, remove them
  id.retain <- subset(id.retain, 
                      subset=id.retain > 0 & id.retain <= ncol(data))
  
  # Get the subset
  sbst <- subset(data, subset=sub, select=id.retain)
  return(sbst)
}


#' @title Summarization of data
#'
#' @export
#' @description This is a simple function to apply a summarization function to a matrix or data frame.
#' @param data A matrix or data frame to be summarized.
#' @param fun Function to be used in the summarization. It can be any function that, taking as first argument a numeric vector, otuputs a numeric value. Typical examples are \code{\link{mean}}, \code{\link{median}}, \code{\link{min}}, \code{\link{max}} or \code{\link{sd}}.
#' @param group.by A vector of either column names or column indices according to which the data will be grouped to be summarized.
#' @param ignore A vector of either column names or column indices of the columns that have to be removed from the output.
#' @param ... Additional parameters to the summarization function (\code{fun}). For example, \code{na.rm=TRUE} to indicate that the missing values should be ignored.
#' @return A data frame where, for each combination of the values in the columns indicated by \code{group.by}, each column (except those in \code{ignore}) contains the summarization of the values in the original matrix that have that combination of values.
#' #' @seealso \code{\link{filterData}}, \code{\link{writeTabular}} and the vignette \code{vignette(topic="Data_loading_and_manipulation", 
#' package="scmamp")}
#' @examples
#' data(data_blum_2015)
#' # Group by size and radius. Get the mean and variance of only the last two
#' # columns.
#' summarizeData (data.blum.2015, group.by=c("Radius","Size"), ignore=3:8, 
#'                fun=mean, na.rm=TRUE)
#' summarizeData (data.blum.2015, group.by=c("Radius","Size"), ignore=3:8, 
#'                fun=sd, na.rm=TRUE)
#' 
summarizeData <- function (data, fun=mean, group.by=NULL, ignore=NULL, ... ) {
  if (!is.data.frame(data)) {
    data <- data.frame(data)
  }
  # Convert character definitions to colum id
  if (is.character(group.by)) {
    group.by <- which(colnames(data) %in% group.by)
  }
  
  if (is.character(ignore)) {
    ignore <- which(colnames(data) %in% ignore)
  }
  
  ## Only numeric columns can be summarized
  non.numeric <- which(!unlist(lapply(data, is.numeric)))
  if (!all(non.numeric %in% c(group.by, ignore))) {
    warning ("Only numeric columns can be summarized. Character and factor ",
             "columns should be either in the 'group.by' or the 'ignore' list. ",
             "Non numeric columns will be ignored")
    ignore <- unique(c(ignore, non.numeric))
  }
  
  # Remove any index out of bounds
  group.by <- subset(group.by, subset=group.by > 0 & group.by <= ncol(data))
  ignore <- subset(ignore, subset=ignore > 0 & ignore <= ncol(data))
  
  if (length(intersect(group.by,ignore)) > 0) {
    stop("The same column cannot be simultaneously in the 'group.by' and the ",
         "'ignore' list")
  }
  
  if (is.null(group.by)) {
    if (!is.null(ignore)) {
      data <- data[, -ignore]
    }
    summ <- apply(data, MARGIN=2, 
                  FUN=function(x) {
                    fun(x, ...)
                  }) 
  }else{  
    groups <- unique(data[, group.by])
    if(length(group.by)) groups <- data.frame(groups)
    to.summarize <- (1:ncol(data))[-c(ignore, group.by)]
    summGroup <- function (i) {
      sub <- rep(TRUE, nrow(data))
      for (j in seq(along.with=group.by)) {
        sub <- sub & data[, group.by[j]] == groups[i,j]
      }
      m <- subset(data, subset=sub)
      m <- m[, to.summarize]
      if (length(to.summarize) == 1) {
        m <- matrix(m, ncol=1)
      }
      apply(m, MARGIN=2, 
            FUN=function(x) {
                  fun(x , ...)
                })
    }
    aux <- lapply(1:nrow(groups), FUN=summGroup)
    summ <- cbind(groups, do.call(rbind, aux))
  }
  return(summ)
}



#' @title Creation of boolean matrices for highlighting results
#'
#' @export
#' @description A simple function to create boolean matrices to be used when constructing LaTeX tables.
#' @param data It can be a data frame, a matrix or a vector.
#' @param find A string indicating what has to be detected. Possible values are:
#' \itemize{
#'   \item{\code{'eq'}}{ All values equal to the value passed in \code{th}}
#'   \item{\code{'le'}}{ All values lower or equal to the value passed in \code{th}}
#'   \item{\code{'ge'}}{ All values greater or equal to the value passed in \code{th}}
#'   \item{\code{'lw'}}{ All values lower than the value passed in \code{th}}
#'   \item{\code{'gt'}}{ All values greater than the value passed in \code{th}}
#'   \item{\code{'min'}}{ Minimum value in each row / column / matrix}
#'   \item{\code{'max'}}{ Maximum value in each row / column / matrix}
#' }
#' @param th Thershold used when \code{find} is set to \code{'eq'}, \code{'ge'}, \code{'le'}, \code{'gt'} or \code{'lw'}.
#' @param by A string or string vector indicating where the min/max values have to be find. It can be \code{'row'}, \code{'col'} or \code{'mat'} for the row, column and matrix min/max respectively.
#' @return A boolean matrix that matches in dimension the output data and where the identified elements are marked as TRUE.
#' @examples
#' data('data_gh_2008')
#' booleanMatrix(data.gh.2008, find='min', by='row')
#' booleanMatrix(data.gh.2008, find='ge',  th=0.5)
#' 
booleanMatrix <- function (data, find='max', th=0, by='row') {
  
  # Check whether all the values are numeric or not
  if (is.data.frame(data)) {
    numeric.data <- all(apply(data, FUN="is.numeric", MARGIN=c(1,2)))
  } else if (is.matrix(data) | is.vector(data)) {
    numeric.data <- is.numeric(data)
  } else {
    stop("The 'data' argument has to be either a data frame, a matrix or a vector")
  }
  
  if (!numeric.data && find!='eq') {
    stop("For non-numeric matrices the only posible comparison is find='eq'")
  }
  
    if (by=='col') {
      margin <- 2
    } else if (by == 'row') {
      margin <- 1
    } else if (by != 'mat') {
      stop("The 'by' argument can only take values 'col', 'row' and 'mat'")
    }

  matrix <- switch(find,
                   'eq'={
                     data == th
                   },
                   'ge'={
                     data >= th
                   },
                   'le'={
                     data <= th
                   },
                   'gt'={
                     data > th
                   },
                   'lw'={
                     data < th
                   },
                   'min'={
                     if (is.vector(data)) {
                       res <- data == min(data)
                     } else {
                       if(by == 'mat') {
                         res <- data == min(data)
                       } else {
                         res <- apply(data, MARGIN=margin,
                                      FUN=function(x) {
                                        return (x==min(x))
                                      })
                         if (margin == 1) {
                           res <- t(res)
                         }
                       }
                     }
                     res
                   },
                   'max'={
                     if (is.vector(data)) {
                       res <- data == max(data)
                     } else {
                       if(length(margin) > 1) {
                         res <- data == max(data)
                       } else {
                         res <- apply(data, MARGIN=margin,
                                      FUN=function(x) {
                                        return (x==max(x))
                                      })
                         if (margin == 1) {
                           res <- t(res)
                         }
                       }
                     }
                     res
                   }) 
  return(matrix)
}

Any scripts or data that you put into this service are public.

scmamp documentation built on May 1, 2019, 10:10 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

scmamp
Statistical Comparison of Multiple Algorithms in Multiple Problems

R/data_manipulation.R
In scmamp: Statistical Comparison of Multiple Algorithms in Multiple Problems

Try the scmamp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

scmamp Statistical Comparison of Multiple Algorithms in Multiple Problems

R/data_manipulation.R In scmamp: Statistical Comparison of Multiple Algorithms in Multiple Problems

Try the scmamp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

scmamp
Statistical Comparison of Multiple Algorithms in Multiple Problems

R/data_manipulation.R
In scmamp: Statistical Comparison of Multiple Algorithms in Multiple Problems