# R/gmql_join.R

# Defines functions gmql_join

#' Method merge
#'
#' @description Wrapper to GMQL JOIN operator
#' 
#' @description It takes in input two datasets, respectively known as anchor 
#' (left) and experiment (right) and returns a dataset of samples consisting 
#' of regions extracted from the operands according to the specified conditions
#' (a.k.a \emph{genometric_predicate} and \emph{region_attribute} predicate).
#' The number of generated output samples is the Cartesian product 
#' of the number of samples in the anchor and in the experiment dataset 
#' (if \emph{joinBy} is not specified).
#' The output metadata are the union of the input metadata, 
#' with their attribute names prefixed with left or right dataset name, 
#' respectively.
#'
#' @importFrom rJava J .jnull .jarray
#' @importFrom S4Vectors merge
#' 
#' @param x GMQLDataset class object
#' @param y GMQLDataset class object
#' 
#' @param genometric_predicate it is a list of DISTAL objects.
#' For details of DISTAL objects see:
#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{DL}}, \code{\link{DG}},
#' \code{\link{MD}}, \code{\link{UP}}, \code{\link{DOWN}}
#' 
#' @param joinBy \code{\link{condition_evaluation}} function to support 
#' methods with groupBy or JoinBy input parameter
#' @param reg_attr vector of strings made up by region field attribute names, 
#' whose values in the paired left and right dataset regions must be equal in 
#' order to consider the two paired regions.
#' If specified, \emph{region_output} cannot be INT or CAT.
#' @param region_output single string that declares which region is given in 
#' output for each input pair of left dataset and right dataset regions 
#' satisfying the genometric predicate and/or the region attribute predicate:
#' \itemize{
#' \item{LEFT: It outputs the anchor regions from 'x' that satisfy the 
#' genometric and/or region attribute predicate}
#' \item{RIGHT: It outputs the experiment regions from 'y' that satisfy the 
#' genometric and/or region attribute predicate}
#' \item{INT (intersection): It outputs the overlapping part (intersection) 
#' of the 'x' and 'y' regions that satisfy the genometric  and/or region 
#' attribute predicate; if the intersection is empty, no output is produced}
#' \item{CAT: It outputs the concatenation between the 'x' and 'y' regions 
#' that satisfy the genometric  and/or region attribute predicate, 
#' (i.e. the output regions defined as having left (right) coordinates equal 
#' to the minimum (maximum) of the corresponding coordinate values in the 
#' 'x' and 'y' regions satisfying the genometric  and/or region attribute 
#' predicate)}
#' \item{LEFT_DIST: It outputs the duplicate elimination of 'x' output 
#' regions with the same coordinates and values, regardless of the 'y' paired 
#' region and its values. In this case, the output region attributes and their 
#' values are all and only those of 'x', and the output metadata are equal 
#' to the 'x' metadata, without additional prefixes}
#' \item{RIGHT_DIST: It outputs the duplicate elimination of 'y' output 
#' regions with the same coordinates and values, regardless of the 'x' paired 
#' region and its values. In this case, the output regions attributes and their 
#' values are all and only those of 'y', and the output metadata are equal 
#' to the 'y' metadata, without additional prefixes}
#' \item{BOTH: It outputs the same regions as LEFT, but it adds in the output 
#' region attributes the coordinates of the 'y' paired region that, 
#' together with the 'x' output region, satisfies the genometric  and/or 
#' region attribute predicate}
#' }
#'
#' @return GMQLDataset object. It contains the value to use as input 
#' for the subsequent GMQLDataset method
#' 
#' @examples
#' 
#' ## This statement initializes and runs the GMQL server for local execution 
#' ## and creation of results on disk. Then, with system.file() it defines 
#' ## the path to the folders "DATASET" and "DATASET_GDM" in the subdirectory 
#' ## "example" of the package "RGMQL" and opens such folders as GMQL 
#' ## datasets named TSS and HM, respectively, using CustomParser
#' 
#' init_gmql()
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL")
#' TSS = read_gmql(test_path)
#' HM = read_gmql(test_path2)
#' 
#' ## Given a dataset HM and one called TSS with a sample including 
#' ## Transcription Start Site annotations, this statement searches for those 
#' ## regions of HM that are at a minimal distance from a transcription start 
#' ## site (TSS) and takes the first/closest one for each TSS, provided that 
#' ## such distance is less than 120K bases and joined TSS and HM 
#' ## samples are obtained from the same provider (joinby clause).
#' 
#' join_data = merge(TSS, HM, genometric_predicate = list(MD(1), DLE(120000)), 
#'     conds("provider"), region_output = "RIGHT")
#' 
#' 
#' @name merge
#' @aliases merge,GMQLDataset,GMQLDataset-method
#' @aliases merge-method
#' @export
setMethod("merge", c("GMQLDataset","GMQLDataset"),
    function(x, y, genometric_predicate = NULL, 
        region_output = "CAT", joinBy = conds(), reg_attr = c(""))
    {
        # Unwrap both operands up front: 'x' is the anchor (left) dataset
        # and 'y' the experiment (right) dataset of the JOIN.
        anchor_ptr <- value(x)
        experiment_ptr <- value(y)
        
        # Hand everything to the internal worker by name, so the different
        # parameter order of this method and of gmql_join() cannot be mixed
        # up. Note that 'reg_attr' is called 'reg_attributes' there.
        gmql_join(
            left_data = anchor_ptr,
            right_data = experiment_ptr,
            genometric_predicate = genometric_predicate,
            joinBy = joinBy,
            region_output = region_output,
            reg_attributes = reg_attr
        )
    })


# Internal worker behind the `merge` method: validates the JOIN parameters,
# converts them to Java references and calls Wrapper$join.
#
# Arguments:
#   left_data, right_data  values forwarded unchanged to Wrapper$join as the
#                          last two arguments (the `merge` method passes
#                          value(x) and value(y)).
#   genometric_predicate   NULL, or a list of at most 4 DISTAL objects.
#   joinBy                 NULL, or an object accepted by .join_condition().
#   region_output          single string, matched case-insensitively against
#                          CAT, LEFT, RIGHT, INT, BOTH, RIGHT_DIST, LEFT_DIST.
#   reg_attributes         character vector of region attribute names; the
#                          single empty string "" means "not specified".
#                          Empty strings and duplicates are dropped.
#
# Returns GMQLDataset(val), where val is the second element of the wrapper
# response. Stops with val as the message when the first element of the
# response parses (strtoi) to a non-zero integer.
#
# All argument validation happens in plain R before any Java conversion, so
# invalid input is reported without touching the JVM.
gmql_join <- function(left_data, right_data, genometric_predicate, joinBy, 
                        region_output, reg_attributes)
{
    # ---- region_output ---------------------------------------------------
    # Normalised first because the region-attribute check below depends on
    # it (it used to be read before it was assigned).
    if(!is.character(region_output) || length(region_output) != 1)
        stop("region_output must be a single string")
    
    output <- toupper(region_output)
    if(!output %in% c("CAT", "LEFT", "RIGHT", "INT", "BOTH", "RIGHT_DIST", 
                        "LEFT_DIST"))
        stop("region_output must be cat, left, right, both, right_dist, ",
                "left_dist or int (intersection)")
    
    # ---- genometric_predicate --------------------------------------------
    has_predicate <- !is.null(genometric_predicate)
    if(has_predicate)
    {
        # Type check before the length check, so a non-list input always
        # gets the "must be a list" message.
        if(!is.list(genometric_predicate))
            stop("genometric_predicate must be a list")
        
        if(length(genometric_predicate) > 4)
            stop("genometric_predicate: only 4 DISTAL condition")
        
        if(!all(vapply(genometric_predicate, function(x) {is(x,"DISTAL")},
                        logical(1))))
            stop("All elements should be DISTAL object")
    }
    
    # ---- reg_attributes --------------------------------------------------
    if(!identical(reg_attributes,""))
    {
        if(!is.character(reg_attributes))
            stop("metadata: no valid input")
        
        reg_attributes <- reg_attributes[!reg_attributes %in% ""]
        reg_attributes <- unique(reg_attributes)
    }
    else
        reg_attributes <- character(0)
    
    # Decided on the R vector, before it is turned into a Java reference
    # whose length() no longer reflects the number of attribute names.
    has_reg_attr <- length(reg_attributes) > 0
    
    # Same condition as before (region attributes given, no genometric
    # predicate); the message now describes the condition actually tested.
    if(!has_predicate && has_reg_attr && output %in% c("CAT","INT"))
        stop("reg_attributes defined without genometric_predicate: ",
                "output cannot be INT or CAT")
    
    # ---- conversion to Java references -----------------------------------
    if(has_predicate)
    {
        # One row per DISTAL condition; as.character() on each object is
        # expected to yield exactly two strings (enforced by vapply).
        genomatrix <- t(vapply(genometric_predicate, function(x) {
                as.character(x)
        },character(2)))
        
        genomatrix <- .jarray(genomatrix, dispatch = TRUE)
    }
    else
        genomatrix <- .jnull("java/lang/String")
    
    if(!is.null(joinBy))
    {
        cond <- .join_condition(joinBy)
        if(is.null(cond))
            join_matrix <- .jnull("java/lang/String")
        else
            join_matrix <- .jarray(cond, dispatch = TRUE)
    }
    else
        join_matrix <- .jnull("java/lang/String")
    
    if(has_reg_attr)
        reg_attributes <- .jarray(reg_attributes, dispatch = TRUE)
    else
        reg_attributes <- .jnull("java/lang/String")
    
    # ---- call into the wrapper -------------------------------------------
    WrappeR <- J("it/polimi/genomics/r/Wrapper")
    response <- WrappeR$join(genomatrix, join_matrix, output, reg_attributes,
                                left_data, right_data)
    
    # response[1] is the error flag, response[2] the payload: either the
    # error text or the value used to build the resulting GMQLDataset.
    error <- strtoi(response[1])
    val <- response[2]
    if(error)
        stop(val)
    else
        GMQLDataset(val)
}

# Try the RGMQL package in your browser

# Any scripts or data that you put into this service are public.

# RGMQL documentation built on Nov. 8, 2020, 5:59 p.m.