# Bioconductor

#' Method cover
#'
#' It takes as input a dataset containing one or more samples and returns 
#' another dataset (with a single sample, if no \emph{groupBy} option is 
#' specified) by "collapsing" the input dataset samples and their regions 
#' according to certain rules specified by the input parameters.
#' The attributes of the output genomic regions are only the region 
#' coordinates, and Jaccard indexes (\emph{JaccardIntersect} and 
#' \emph{JaccardResult}).
#' Jaccard Indexes are standard measures of similarity of the contributing 
#' regions, added as default region attributes.
#' The JaccardIntersect index is calculated as the ratio between the lengths 
#' of the intersection and of the union of the contributing regions; 
#' the JaccardResult index is calculated as the ratio between the lengths 
#' of the result and the union of the contributing regions.
#' If aggregate functions are specified, a new region attribute is added for 
#' each aggregate function specified.
#' Output metadata are the union of the input ones.
#' If \emph{groupBy} clause is specified, the input samples are partitioned 
#' in groups, each with distinct values of the grouping metadata attributes, 
#' and the \emph{cover} operation is separately applied to each group, 
#' yielding one sample in the result for each group.
#' Input samples that do not satisfy the \emph{groupBy} condition 
#' are disregarded.
#' 
#' @include AllClasses.R
#' @importFrom methods is
#' @importFrom rJava J .jnull .jarray
#' 
#' @param .data GMQLDataset class object
#' @param min_acc minimum number of overlapping regions to be considered 
#' during execution. It is an integer number, declared also as string.
#' \code{min_acc} also accepts:
#' \itemize{
#' \item{PARAMETER class object: \code{\link{ALL}}, that represents the number 
#' of samples in the input dataset}
#' \item{an expression built using PARAMETER object: (ALL() + N) / K or
#' ALL() / K, with N and K integer values }
#' }
#' @param max_acc maximum number of overlapping regions to be considered 
#' during execution. It is an integer number, declared also as string.
#' \code{max_acc} also accepts:
#' \itemize{
#' \item{PARAMETER class object: \code{\link{ALL}}, that represents the number 
#' of samples in the input dataset}
#' \item{PARAMETER class object: \code{\link{ANY}}, that acts as a wildcard, 
#' considering any amount of overlapping regions}
#' \item{an expression built using PARAMETER object: (ALL() + N) / K or
#' ALL() / K, with N and K integer values  }
#' }
#' @param groupBy \code{\link{conds}} function to support methods with 
#' groupBy or JoinBy input parameter
#' 
#' @param ... a series of expressions separated by comma in the form 
#' \emph{key} = \emph{aggregate}. The \emph{aggregate} is an object of 
#' class AGGREGATES. The aggregate functions available are: \code{\link{SUM}}, 
#' \code{\link{COUNT}}, \code{\link{MIN}}, \code{\link{MAX}}, 
#' \code{\link{AVG}}, \code{\link{MEDIAN}}, \code{\link{STD}}, 
#' \code{\link{BAG}}, \code{\link{BAGD}}, \code{\link{Q1}}, 
#' \code{\link{Q2}}, \code{\link{Q3}}.
#' Every aggregate accepts a string value, except for COUNT, which does not 
#' have any value.
#' Argument of 'aggregate function' must exist in schema, i.e. among region 
#' attributes. Two styles are allowed:
#' \itemize{
#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
#' \item list of values: e.g. SUM("pvalue")
#' }
#' "mixed style" is not allowed
#'
#' @param variation string identifying the cover GMQL operator variation.
#' The admissible strings are:
#' \itemize{
#' \item{FLAT: It returns the regions that start from the first end and stop 
#' at the last end of the regions which would contribute to each region 
#' of the \emph{cover}.}
#' \item{SUMMIT: It returns regions that start from a position
#' where the number of intersecting regions is not increasing afterwards and
#' stop at a position where either the number of intersecting regions 
#' decreases, or it violates the max accumulation index.}
#' \item{HISTOGRAM: It returns the non-overlapping regions contributing to 
#' the \emph{cover}, each with its accumulation index value, which is assigned 
#' to the \emph{AccIndex} region attribute.}
#' \item{COVER: default value.}
#' }
#' It can be all caps or lowercase
#' 
#' @return GMQLDataset object. It contains the value to use as input 
#' for the subsequent GMQLDataset method
#' 
#' @examples
#' 
#' ## This statement initializes and runs the GMQL server for local execution 
#' ## and creation of results on disk. Then, with system.file() it defines 
#' ## the path to the folder "DATASET" in the subdirectory "example"
#' ## of the package "RGMQL" and opens such file as a GMQL dataset named "exp" 
#' ## using CustomParser
#' 
#' init_gmql()
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' exp = read_gmql(test_path)
#'   
#' ## The following statement produces an output dataset with a single output 
#' ## sample. The COVER operation considers all areas defined by a minimum 
#' ## of two overlapping regions in the input samples, up to any amount of 
#' ## overlapping regions.
#' 
#' res = cover(exp, 2, ANY())
#'
#' ## The following GMQL statement computes the result grouping the input 
#' ## exp samples by the values of their cell metadata attribute, 
#' ## thus one output res sample is generated for each cell value; 
#' ## output regions are produced where at least 2 and at most 3 regions 
#' ## of grouped exp samples overlap, setting as attributes of the resulting 
#' ## regions the minimum pvalue of the overlapping regions (min_pValue) 
#' ## and their Jaccard indexes (JaccardIntersect and JaccardResult).
#' 
#' res = cover(exp, 2, 3, groupBy = conds("cell"), min_pValue = MIN("pvalue"))
#' 
#' @name cover
#' @rdname cover
#' @aliases cover,GMQLDataset-method
#' @aliases cover-method
#' @export
setMethod("cover", "GMQLDataset",
            function(.data, min_acc, max_acc, groupBy = conds(), 
                    variation = "cover", ...)
            {
                # Handle of the input dataset, forwarded to the cover call
                dataset_ref <- value(.data)

                # The accumulation bounds are captured unevaluated, turned
                # into text and stripped of empty call parentheses, so
                # that e.g. ALL() / 2 is forwarded as "ALL/2"
                min_text <- .trasform_cover(deparse(substitute(min_acc)))
                max_text <- .trasform_cover(deparse(substitute(max_acc)))

                # The upper bound is validated before the lower one
                upper_bound <- .check_cover_param(max_text, FALSE)
                lower_bound <- .check_cover_param(min_text, TRUE)

                # The variation name is matched regardless of letter case
                variant <- toupper(variation)
                aggregate_args <- list(...)

                gmql_cover(dataset_ref, lower_bound, upper_bound, groupBy, 
                            aggregate_args, variant)
            })

# Internal worker behind the cover method.
#
# Converts the grouping condition and the aggregates into Java arrays (or
# typed Java nulls when absent), calls the Wrapper method selected by
# `flag` and turns the response into a GMQLDataset.
#
# input_data: value taken from the input GMQLDataset
# min_acc, max_acc: accumulation bounds, already validated by the caller
# groupBy: NULL, or a list holding a "condition" element (see conds())
# aggregates: list of aggregate objects, possibly empty
# flag: one of "COVER", "FLAT", "SUMMIT", "HISTOGRAM"
#
# Stops when groupBy has no "condition" element, when flag matches no
# variation, or when the first element of the response is a non-zero code
# (the second element is then used as the error message).
gmql_cover <- function(input_data, min_acc, max_acc, groupBy,aggregates,flag)
{
    # Typed Java null standing in for an absent String array
    null_string <- function() .jnull("java/lang/String")

    if(is.null(groupBy))
    {
        join_matrix <- null_string()
    }
    else
    {
        if(!("condition" %in% names(groupBy)))
            stop("use function conds()")

        grouping <- .join_condition(groupBy)
        if(is.null(grouping))
        {
            join_matrix <- null_string()
        }
        else
        {
            join_matrix <- .jarray(grouping, dispatch = TRUE)
        }
    }

    if(length(aggregates) > 0)
    {
        aggregate_rows <- .aggregates(aggregates,"AGGREGATES")
        metadata_matrix <- .jarray(aggregate_rows, dispatch = TRUE)
    }
    else
    {
        metadata_matrix <- null_string()
    }

    wrapper <- J("it/polimi/genomics/r/Wrapper")

    # switch() yields NULL for an unknown flag; only the matching branch
    # is evaluated, so exactly one Java method is invoked
    response <- switch(flag,
        "COVER" = wrapper$cover(min_acc, max_acc, join_matrix,
                                metadata_matrix, input_data),
        "FLAT" = wrapper$flat(min_acc, max_acc, join_matrix,
                                metadata_matrix, input_data),
        "SUMMIT" = wrapper$summit(min_acc, max_acc, join_matrix,
                                metadata_matrix, input_data),
        "HISTOGRAM" = wrapper$histogram(min_acc, max_acc, join_matrix,
                                metadata_matrix, input_data))
    if(is.null(response))
        stop("no admissible variation: cover, flat, summit, histogram")

    # response[1] is a status code, response[2] the dataset value or the
    # error message
    status <- strtoi(response[1])
    payload <- response[2]
    if(status)
        stop(payload)

    GMQLDataset(payload)
}

# Validate one accumulation bound of the cover operation.
#
# param: the bound as text
# is_min: TRUE when param is the lower bound, FALSE for the upper bound
#
# Returns param unchanged when it is acceptable. Stops when param has more
# than one element, when it is not a character value, or when the lower
# bound is exactly "ANY".
.check_cover_param <- function(param, is_min)
{
    # Reject vectors first, whatever their type
    if(length(param) > 1)
        stop("length > 1")

    # Only textual bounds are accepted
    if(!is.character(param))
        stop("invalid input data")

    # The ANY wildcard is refused as a lower bound
    if(is_min && identical(param,"ANY"))
        stop("min cannot assume ANY as value")

    param
}

# Remove every empty pair of call parentheses "()" from the deparsed bound,
# so that e.g. "ALL()" becomes "ALL" and "(ALL() + 1)/2" becomes
# "(ALL + 1)/2".
#
# predicate: character vector holding the deparsed expression
#
# Returns the character vector with all "()" occurrences dropped. The value
# is returned visibly: the substitution result is the last expression,
# not an assignment.
.trasform_cover <- function(predicate)
{
    # A literal match is enough here, no regular expression is needed
    gsub("()", "", predicate, fixed = TRUE)
}