R/gmql_join.R

Defines functions gmql_join

#' Method merge
#'
#' @description Wrapper to GMQL JOIN operator
#' 
#' @description It takes in input two datasets, respectively known as anchor 
#' (left) and experiment (right) and returns a dataset of samples consisting 
#' of regions extracted from the operands according to the specified conditions
#' (a.k.a \emph{genometric_predicate} and \emph{region_attribute} predicate).
#' The number of generated output samples is the Cartesian product 
#' of the number of samples in the anchor and in the experiment dataset 
#' (if \emph{joinBy} is not specified).
#' The output metadata are the union of the input metadata, 
#' with their attribute names prefixed with left or right dataset name, 
#' respectively.
#'
#' @importFrom rJava J .jnull .jarray
#' @importFrom S4Vectors merge
#' 
#' @param x GMQLDataset class object
#' @param y GMQLDataset class object
#' 
#' @param genometric_predicate it is a list of DISTAL objects.
#' For details of DISTAL objects see:
#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{DL}}, \code{\link{DG}},
#' \code{\link{MD}}, \code{\link{UP}}, \code{\link{DOWN}}
#' 
#' @param joinBy \code{\link{condition_evaluation}} function to support 
#' methods with groupBy or JoinBy input parameter
#' @param reg_attr vector of strings made up by region field attribute names, 
#' whose values in the paired left and right dataset regions must be equal in 
#' order to consider the two paired regions.
#' If specified, \emph{region_output} cannot be INT or CAT.
#' @param region_output single string that declares which region is given in 
#' output for each input pair of left dataset and right dataset regions 
#' satisfying the genometric predicate and/or the region attribute predicate:
#' \itemize{
#' \item{LEFT: It outputs the anchor regions from 'x' that satisfy the 
#' genometric and/or region attribute predicate}
#' \item{RIGHT: It outputs the experiment regions from 'y' that satisfy the 
#' genometric and/or region attribute predicate}
#' \item{INT (intersection): It outputs the overlapping part (intersection) 
#' of the 'x' and 'y' regions that satisfy the genometric  and/or region 
#' attribute predicate; if the intersection is empty, no output is produced}
#' \item{CAT: It outputs the concatenation between the 'x' and 'y' regions 
#' that satisfy the genometric  and/or region attribute predicate, 
#' (i.e. the output regions defined as having left (right) coordinates equal 
#' to the minimum (maximum) of the corresponding coordinate values in the 
#' 'x' and 'y' regions satisfying the genometric  and/or region attribute 
#' predicate)}
#' \item{LEFT_DIST: It outputs the duplicate elimination of 'x' output 
#' regions with the same coordinates and values, regardless of the 'y' paired 
#' region and its values. In this case, the output region attributes and their 
#' values are all and only those of 'x', and the output metadata are equal 
#' to the 'x' metadata, without additional prefixes}
#' \item{RIGHT_DIST: It outputs the duplicate elimination of 'y' output 
#' regions with the same coordinates and values, regardless of the 'x' paired 
#' region and its values. In this case, the output regions attributes and their 
#' values are all and only those of 'y', and the output metadata are equal 
#' to the 'y' metadata, without additional prefixes}
#' \item{BOTH: It outputs the same regions as LEFT, but it adds in the output 
#' region attributes the coordinates of the 'y' paired region that, 
#' together with the 'x' output region, satisfies the genometric  and/or 
#' region attribute predicate}
#' }
#'
#' @return GMQLDataset object. It contains the value to use as input 
#' for the subsequent GMQLDataset method
#' 
#' @examples
#' 
#' ## This statement initializes and runs the GMQL server for local execution 
#' ## and creation of results on disk. Then, with system.file() it defines 
#' ## the path to the folders "DATASET" and "DATASET_GDM" in the subdirectory 
#' ## "example" of the package "RGMQL" and opens such folders as a GMQL 
#' ## datasets named TSS and HM, respectively, using CustomParser
#' 
#' init_gmql()
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL")
#' TSS = read_gmql(test_path)
#' HM = read_gmql(test_path2)
#' 
#' ## Given a dataset HM and one called TSS with a sample including 
#' ## Transcription Start Site annotations, this statement searches for those 
#' ## regions of HM that are at a minimal distance from a transcription start 
#' ## site (TSS) and takes the first/closest one for each TSS, provided that 
#' ## such distance is less than 120K bases and joined TSS and HM 
#' ## samples are obtained from the same provider (joinby clause).
#' 
#' join_data = merge(TSS, HM, genometric_predicate = list(MD(1), DLE(120000)), 
#'     conds("provider"), region_output = "RIGHT")
#' 
#' 
#' @name merge
#' @aliases merge,GMQLDataset,GMQLDataset-method
#' @aliases merge-method
#' @export
setMethod(
    "merge",
    c("GMQLDataset", "GMQLDataset"),
    function(
        x,
        y,
        genometric_predicate = NULL,
        region_output = "CAT",
        joinBy = conds(),
        reg_attr = c("")
    ) {
        # Extract the underlying values of both operands up front, so that
        # any accessor failure surfaces before the join is attempted.
        anchor_ref <- value(x)
        experiment_ref <- value(y)

        # Hand everything over to the internal worker; arguments are named
        # because gmql_join() orders joinBy and region_output differently
        # from this method's signature.
        gmql_join(
            left_data = anchor_ref,
            right_data = experiment_ref,
            genometric_predicate = genometric_predicate,
            joinBy = joinBy,
            region_output = region_output,
            reg_attributes = reg_attr
        )
    }
)


# Internal worker behind the GMQLDataset `merge` method (GMQL JOIN).
#
# All arguments are validated first; only afterwards are they converted to
# Java references and passed to the `join` method of the Java class
# "it/polimi/genomics/r/Wrapper". Validating before marshalling guarantees
# that invalid input is rejected without touching the JVM.
#
# Arguments:
#   left_data, right_data  values forwarded unchanged to the Java `join` call
#   genometric_predicate   NULL, or a list of at most 4 DISTAL objects
#   joinBy                 NULL, or a value accepted by .join_condition()
#   region_output          single string (case-insensitive), one of CAT, LEFT,
#                          RIGHT, INT, BOTH, RIGHT_DIST, LEFT_DIST
#   reg_attributes         "" (not specified) or a character vector of region
#                          attribute names; empty strings and duplicates are
#                          dropped
#
# Returns GMQLDataset(<second element of the Java response>) when the first
# element of the response parses to 0; otherwise stops with the second
# element as the error message.
gmql_join <- function(
    left_data, 
    right_data, 
    genometric_predicate, 
    joinBy, 
    region_output,
    reg_attributes
) {
    ## ---- validation (no Java calls in this section) ----------------------
    
    # region_output is normalised first because the reg_attributes check 
    # below depends on it.
    if(!is.character(region_output) || length(region_output) != 1L)
        stop("region_output must be a single string")
    
    ouput <- toupper(region_output)
    valid_outputs <- c("CAT", "LEFT", "RIGHT", "INT", "BOTH", "RIGHT_DIST", 
                        "LEFT_DIST")
    if(!ouput %in% valid_outputs)
        stop("region_output must be cat, left, right, both, right_dist, ",
            "left_dist or int (intersection)")
    
    has_predicate <- !is.null(genometric_predicate)
    if(has_predicate) {
        # Type check comes before the length check so that a non-list input
        # always gets the "must be a list" message.
        if(!is.list(genometric_predicate))
            stop("genometric_predicate must be a list")
        
        if(length(genometric_predicate) > 4)
            stop("genometric_predicate: only 4 DISTAL condition")
        
        distal_predicate <- vapply(genometric_predicate, function(x) { 
            is(x,"DISTAL") 
        }, logical(1))
        if(!all(distal_predicate))
            stop("All elements should be DISTAL object")
    }
    
    # Keep reg_attributes as a plain R character vector until marshalling, 
    # so that its length really tells whether any attribute was requested.
    if(identical(reg_attributes, "")) {
        attr_names <- character(0)
    } else {
        if(!is.character(reg_attributes))
            stop("reg_attr: no valid input")
        
        attr_names <- reg_attributes[!reg_attributes %in% ""]
        attr_names <- attr_names[!duplicated(attr_names)]
    }
    
    # NOTE(review): the guard (attributes given, no genometric predicate) is 
    # kept from the original code; only the message was aligned with it.
    if(!has_predicate && length(attr_names) > 0 && 
        ouput %in% c("CAT", "INT"))
        stop("reg_attr defined without genometric_predicate: ",
            "region_output cannot be INT or CAT")
    
    ## ---- marshalling to Java ---------------------------------------------
    
    if(has_predicate) {
        # One row per DISTAL condition; each condition must convert to a 
        # character vector of length 2.
        genomatrix <- t(vapply(genometric_predicate, function(x) {
            c(as.character(x))
        }, character(2)))
        genomatrix <- .jarray(genomatrix, dispatch = TRUE)
    } else
        genomatrix <- .jnull("java/lang/String")
    
    if(!is.null(joinBy)) {
        cond <- .join_condition(joinBy)
        if(is.null(cond))
            join_matrix <- .jnull("java/lang/String")
        else
            join_matrix <- .jarray(cond, dispatch = TRUE)
    } else
        join_matrix <- .jnull("java/lang/String")
    
    if(length(attr_names) > 0)
        java_attrs <- .jarray(attr_names, dispatch = TRUE)
    else
        java_attrs <- .jnull("java/lang/String")
    
    ## ---- Java call -------------------------------------------------------
    
    WrappeR <- J("it/polimi/genomics/r/Wrapper")
    response <- WrappeR$join(
        genomatrix, 
        join_matrix, 
        ouput,
        java_attrs,
        left_data, 
        right_data
    )
    # response[1] is the error flag, response[2] the payload or the message
    error <- strtoi(response[1])
    val <- response[2]
    if(error)
        stop(val)
    else
        GMQLDataset(val)
}
DEIB-GECO/RGMQL documentation built on Feb. 17, 2024, 10:39 p.m.