# Bioconductor

# Implementation registered as the group_by() method for GMQLDataset objects.
# It extracts the value held by the input dataset and forwards it, together
# with the grouping and aggregate arguments, to gmql_group().
# The .add and .drop arguments are accepted but never read here.
group_by.GMQLDateset <- function(
    .data,
    groupBy_meta = conds(),
    groupBy_regions = c(""),
    region_aggregates = NULL,
    meta_aggregates = NULL,
    .add = NULL,
    .drop = NULL
) {
    # Resolve the dataset value up front so that a bad .data fails before
    # any argument validation performed downstream.
    dataset_ptr <- value(.data)

    gmql_group(
        input_data = dataset_ptr,
        group_meta = groupBy_meta,
        group_reg = groupBy_regions,
        region_aggregates = region_aggregates,
        meta_aggregates = meta_aggregates
    )
}

#' Method group_by
#' 
#' @description Wrapper to GMQL GROUP operator
#' @description It performs the grouping of samples and/or sample regions of 
#' the input dataset based on one specified metadata and/or region attribute. 
#' If the metadata attribute is multi-value, i.e., it assumes multiple values 
#' for sample (e.g., both <disease, cancer> and <disease, diabetes>), 
#' the grouping identifies different groups of samples for each attribute value 
#' combination (e.g., group1 for samples that feature the combination 
#' <disease, cancer>, group2 for samples that feature the combination 
#' <disease, diabetes>, and group3 for samples that feature both combinations 
#' <disease, cancer> and <disease, diabetes>). For each obtained group, it is 
#' possible to request the evaluation of aggregate functions on metadata 
#' attributes; these functions consider the metadata contained in all samples 
#' of the group. The regions, their attributes and their values in output are 
#' the same as the ones in input for each sample, and the total number of 
#' samples does not change. All metadata in the input samples are conserved 
#' with their values in the output samples, with the addition of the "_group" 
#' attribute, whose value is the identifier of the group to which the specific 
#' sample is assigned; other metadata attributes can be added as aggregate 
#' functions computed on specified metadata. When used on region attributes, 
#' group_by can group regions of each sample individually, based on their 
#' coordinates (chr, start, stop, strand) and possibly also on other 
#' specified grouping region attributes (when these are present in the schema 
#' of the input dataset). In each sample, regions found in the same group 
#' (i.e., regions with same coordinates and grouping attribute values), 
#' are combined into a single region; this allows merging regions that are 
#' duplicated inside the same sample (based on the values of their coordinates 
#' and of other possible specified region attributes). For each grouped 
#' region, it is possible to request the evaluation of aggregate functions 
#' on other region attributes (i.e., which are not coordinates, or grouping 
#' region attributes). This use is independent of the possible grouping 
#' realised based on metadata. The generated output schema only contains the 
#' original region attributes on which the grouping has been based, and 
#' additionally the attributes possibly calculated as aggregate functions.
#' If the group_by is applied only on regions, the output metadata and their 
#' values are equal to the ones in input. Both when applied on metadata and 
#' on regions, the group_by operation returns a number of output samples 
#' equal to the number of input ones. Note that the two possible uses of 
#' group_by, on metadata and on regions, are perfectly orthogonal, 
#' therefore they can be used in combination or independently.
#' 
#' 
#' @importFrom rJava J .jarray .jnull
#' @importFrom dplyr group_by
#' 
#' @param .data GMQLDataset object
#' @param groupBy_meta \code{\link{conds}} function to support methods with 
#' groupBy or JoinBy input parameter
#' 
#' @param groupBy_regions vector of strings made up by region attribute names
#' @param meta_aggregates It accepts a list of aggregate functions on 
#' metadata attribute.
#' All the elements in the form \emph{key} = \emph{aggregate}.
#' The \emph{aggregate} is an object of class AGGREGATES.
#' The aggregate functions available are: \code{\link{SUM}}, 
#' \code{\link{COUNTSAMP}}, \code{\link{MIN}}, \code{\link{MAX}}, 
#' \code{\link{AVG}}, \code{\link{MEDIAN}}, \code{\link{STD}}, 
#' \code{\link{BAG}}, \code{\link{BAGD}}, \code{\link{Q1}}, 
#' \code{\link{Q2}}, \code{\link{Q3}}.
#' Every aggregate accepts a string value, except for COUNTSAMP, which does 
#' not have any value.
#' Argument of 'aggregate function' must exist among the metadata 
#' attributes of the dataset. Two styles are allowed:
#' \itemize{
#' \item list of key-value pairs: e.g. sum = SUM("cell")
#' \item list of values: e.g. SUM("cell")
#' }
#' "mixed style" is not allowed
#' 
#' @param region_aggregates It accepts a list of aggregate functions on 
#' region attribute. 
#' All the elements in the form \emph{key} = \emph{aggregate}.
#' The \emph{aggregate} is an object of class AGGREGATES.
#' The aggregate functions available are: \code{\link{SUM}}, 
#' \code{\link{COUNT}}, \code{\link{MIN}}, \code{\link{MAX}}, 
#' \code{\link{AVG}}, \code{\link{MEDIAN}}, \code{\link{STD}}, 
#' \code{\link{BAG}}, \code{\link{BAGD}}, \code{\link{Q1}}, 
#' \code{\link{Q2}}, \code{\link{Q3}}.
#' Every aggregate accepts a string value, except for COUNT, which does 
#' not have any value.
#' Argument of 'aggregate function' must exist in schema, i.e. among region 
#' attributes. Two styles are allowed:
#' \itemize{
#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
#' \item list of values: e.g. SUM("pvalue")
#' }
#' "mixed style" is not allowed
#' @param .add parameters inherited, unused with GMQLDataset data object
#' @param .drop parameters inherited, unused with GMQLDataset data object
#' 
#' @return GMQLDataset object. It contains the value to use as input 
#' for the subsequent GMQLDataset method
#'
#' @examples
#' 
#' ## This statement initializes and runs the GMQL server for local execution 
#' ## and creation of results on disk. Then, with system.file() it defines 
#' ## the path to the folder "DATASET" in the subdirectory "example"
#' ## of the package "RGMQL" and opens such file as a GMQL dataset named "exp" 
#' ## using CustomParser
#'
#' init_gmql()
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' exp = read_gmql(test_path)
#' 
#' ## This GMQL statement groups samples of the input 'exp' dataset according 
#' ## to their value of the metadata attribute 'tumor_type' and computes the 
#' ## maximum value that the metadata attribute 'size' takes inside the samples 
#' ## belonging to each group. The samples in the output GROUPS_T dataset 
#' ## have a new _group metadata attribute which indicates which group they 
#' ## belong to, based on the grouping on the metadata attribute tumor_type. 
#' ## In addition, they present the new metadata aggregate attribute 'max_size'. 
#' ## Note that the samples without metadata attribute 'tumor_type' are 
#' ## assigned to a single group with _group value equal 0
#' 
#' GROUPS_T = group_by(exp, conds("tumor_type"), 
#'     meta_aggregates = list(max_size = MAX("size")))
#' 
#' ## This GMQL statement takes as input dataset the same input dataset as 
#' ## the previous example. Yet, it calculates new _group values based on the 
#' ## grouping attribute 'cell', and adds the metadata aggregate attribute 
#' ## 'n_samp', which counts the number of samples belonging to the respective 
#' ## group. It has the following output GROUPS_C dataset samples 
#' ## (note that now no sample has metadata attribute _group with value 
#' ## equal 0 since all input samples include the metadata attribute cell, 
#' ## with different values, on which the new grouping is based)
#' 
#' GROUPS_C = group_by(exp, conds("cell"),
#'     meta_aggregates = list(n_samp = COUNTSAMP()))
#' 
#' ## This GMQL statement groups the regions of each 'exp' dataset sample by 
#' ## region coordinates chr, left, right, strand  (these are implicitly 
#' ## considered) and the additional region attribute score (which is 
#' ## explicitly specified), and keeps only one region for each group. 
#' ## In the output GROUPS dataset schema, the new region attributes 
#' ## avg_pvalue and max_qvalue are added, respectively computed as the 
#' ## average of the values taken by the pvalue and the maximum of the values 
#' ## taken by the qvalue region attributes in the regions grouped together, 
#' ## and the computed value is assigned to each region of each output sample. 
#' ## Note that the region attributes which are not coordinates or score are 
#' ## discarded.
#' 
#' GROUPS = group_by(exp, groupBy_regions = "score", 
#'     region_aggregates = list(avg_pvalue = AVG("pvalue"), 
#'     max_qvalue = MAX("qvalue")))
#' 
#' @name group_by
#' @rdname group_by
#' @aliases group_by,GMQLDataset-method
#' @aliases group_by-method
#' @export
setMethod(
    f = "group_by",
    signature = "GMQLDataset",
    definition = group_by.GMQLDateset
)

# Internal worker behind group_by(): validates the grouping arguments,
# converts them to Java objects through rJava and invokes the `group`
# method of the "it/polimi/genomics/r/Wrapper" Java class.
#
# Arguments:
#   input_data         value extracted from the input dataset; passed to the
#                      wrapper unchanged.
#   group_meta         NULL, or an object having a "condition" name
#                      (presumably the result of conds(); the error message
#                      raised otherwise points the user to that function).
#   group_reg          NULL, or a character vector of region attribute
#                      names; empty strings and duplicates are dropped.
#   region_aggregates  NULL/empty, or aggregates handed to .aggregates().
#   meta_aggregates    NULL/empty, or aggregates handed to .aggregates().
#
# Returns GMQLDataset(<second element of the wrapper response>) when the
# first element of the response parses to 0. Raises an error carrying the
# second element when the status is non-zero, and an explicit error when the
# status cannot be parsed as an integer.
gmql_group <- function(
    input_data, 
    group_meta,
    group_reg, 
    region_aggregates, 
    meta_aggregates
) {
    # Java null placeholder used for every argument that is not supplied.
    null_string <- function() .jnull("java/lang/String")

    # Shared marshalling of metadata and region aggregates.
    aggregates_to_java <- function(aggregates) {
        if (length(aggregates) == 0)
            return(null_string())
        .jarray(.aggregates(aggregates, "AGGREGATES"), dispatch = TRUE)
    }

    # Metadata grouping condition.
    if (is.null(group_meta)) {
        join_matrix <- null_string()
    } else {
        if (!"condition" %in% names(group_meta))
            stop("use function conds()")
        cond <- .join_condition(group_meta)
        if (is.null(cond)) {
            join_matrix <- null_string()
        } else {
            join_matrix <- .jarray(cond, dispatch = TRUE)
        }
    }

    # Region grouping attributes.
    if (is.null(group_reg)) {
        region_names <- null_string()
    } else {
        # This check concerns the region grouping argument, so the message
        # names it instead of the misleading "metadata".
        if (!is.character(group_reg))
            stop("groupBy_regions: no valid input")

        region_names <- unique(group_reg[nzchar(group_reg)])
        if (length(region_names) == 0) {
            region_names <- null_string()
        } else {
            region_names <- .jarray(region_names, dispatch = TRUE)
        }
    }

    metadata_matrix <- aggregates_to_java(meta_aggregates)
    region_matrix <- aggregates_to_java(region_aggregates)

    wrapper <- J("it/polimi/genomics/r/Wrapper")
    response <- wrapper$group(
        join_matrix, 
        metadata_matrix,
        region_names, 
        region_matrix, 
        input_data)

    # First element is a status code, second the payload (dataset value on
    # success, error text on failure).
    status <- strtoi(response[1])
    if (is.na(status))
        stop("unexpected response code from GMQL wrapper: ", response[1])
    if (status != 0)
        stop(response[2])

    GMQLDataset(response[2])
}