R/io_query.R

Defines functions search_entity_by_info_fields unpivot_join_arr_to_info get_entity

Documented in get_entity search_entity_by_info_fields

#' Query an entity and return the resulting dataframe, optionally including info fields
#'
#' The entity is scanned via `scan_entity()` and then narrowed by applying each
#' element of `semi_join_args` in order as a semi-join.
#'
#' @param con connection object; queries are built with `con$aop_connection$afl_expr()`
#' @param pkg_schema package schema object; `pkg_schema$array[[entitynm]]$info_array` decides whether an info array exists for the entity
#' @param entitynm name of the entity to query
#' @param aop if TRUE, return the arrayop object instead of the data frame.  Only possible if not returning an info array
#' @param include_info_array if TRUE, also query the info array for the entity if one exists
#' @param join_info_array if TRUE and using `include_info_array`, return a single data frame with the info array joined to the main array.  Otherwise, return a list of the two dataframes
#' @param query_attributes if TRUE, filtering/joining on attributes of main array.  Inferred automatically if using the `...` parameter, but needs to be set manually if using `semi_join_args`, particularly if `semi_join_args` contains any arrayop objects
#' @param semi_join_args an ordered list of things to semi_join the queried array by.
#' @param ... If `semi_join_args` is not provided, can provide items to join on here. All items must be named; NULL items are ignored. `get_entity(*, dataset_id=1:10, dataset_version=1)` is equivalent to `get_entity(*, semi_join_args = list(data.frame(dataset_id=1:10), data.frame(dataset_version=1))`
#'
#' @return If `aop` is TRUE, the arrayop object for the final query.  Otherwise the
#'   data frame for the entity; or, when the info array is queried, either the
#'   joined data frame (`join_info_array = TRUE`) or a list with elements named
#'   `entitynm` and `paste0(entitynm, "_INFO")`.
#'
#' @examples
#' \dontrun{
#' gs_query = revealcore:::get_entity(con, revealgenomics:::.ghEnv, "GENE_SYMBOL", gene_symbol=c("BRCA1","BRCA2"), aop = TRUE)
#' revealcore:::get_entity(con, revealgenomics:::.ghEnv, "FEATURE", semi_join_args=list(data.frame("featureset_id"=1:10), gs_query), include_info_array=TRUE, query_attributes=FALSE)
#' revealcore:::get_entity(con, revealgenomics:::.ghEnv, "FEATURE", semi_join_args=list(data.frame("featureset_id"=1:10), gs_query, data.frame("feature_type"="gene")), include_info_array=TRUE, query_attributes=TRUE, join_info_array=FALSE)
#' revealcore:::get_entity(con, revealgenomics:::.ghEnv, "FEATURE", semi_join_args=list(data.frame("featureset_id"=1:10), gs_query, data.frame("feature_type"="gene")), include_info_array=TRUE, query_attributes=TRUE, join_info_array=TRUE)
#' }
#'
#' @export
get_entity <- function(con, pkg_schema, entitynm, aop = FALSE, include_info_array = FALSE, join_info_array = TRUE, query_attributes = NULL, semi_join_args = NULL, ...) {
  # An arrayop object can only represent one array, so it cannot carry the info array too.
  stopifnot(!(aop && include_info_array))
  entity_dims <- get_idname(pkg_schema, entitynm)

  # Collect the `...` filters, ignoring any that were passed as NULL.
  list_args <- list(...)
  list_args <- list_args[!vapply(list_args, is.null, logical(1))]

  if (is.null(semi_join_args) && length(list_args) > 0) {
    arg_names <- names(list_args)
    if (is.null(arg_names) || any(is.na(arg_names) | arg_names == "")) {
      stop("all filter values passed via `...` must be named")
    }
    # Turn each named value into a one-column data frame to semi-join by.
    semi_join_args <- lapply(arg_names, function(x) {
      y <- data.frame("a" = list_args[[x]])
      names(y) <- x
      y
    })
    # Attributes are being queried as soon as one name is not a dimension of the entity.
    query_attributes <- !all(arg_names %in% entity_dims)
  }
  if (!is.null(semi_join_args) && is.null(query_attributes)) stop("query_attributes must be specified with semi_join_args")

  array_query <- scan_entity(con = con, pkg_schema = pkg_schema, entitynm = entitynm)
  # NOTE(review): every intermediate arrayop object is kept in a list rather than
  # overwritten; presumably so they stay referenced until the query runs - confirm
  # before simplifying.
  aop_objs <- list(con$aop_connection$afl_expr(array_query))
  semi_join_args_new <- list()
  # seq_along() makes this a no-op for NULL or an empty list.
  for (i in seq_along(semi_join_args)) {
    if (!query_attributes && grepl("ArrayOp", class(semi_join_args[[i]])[[1]])) {
      # Ensure we join only by the array dimensions: drop anything that isn't a
      # dimension in the main array.
      non_dim_fields <- setdiff(semi_join_args[[i]]$attrs_n_dims, entity_dims)
      semi_join_args_new[[i]] <- semi_join_args[[i]]$drop_dims()$mutate(.dots = sapply(non_dim_fields, function(x) NULL))
    } else {
      semi_join_args_new[[i]] <- semi_join_args[[i]]
    }
    aop_objs[[i + 1]] <- aop_objs[[i]]$semi_join(semi_join_args_new[[i]])
  }

  if (aop) {
    return(aop_objs[[length(aop_objs)]])
  }

  ret <- list()
  ret[[entitynm]] <- aop_objs[[length(aop_objs)]]$to_df_all()
  if (include_info_array && !(is.null(pkg_schema$array[[entitynm]]$info_array)) && pkg_schema$array[[entitynm]]$info_array) {
    entitynm_info <- paste0(entitynm, "_INFO")
    arraynm <- full_arrayname(pkg_schema = pkg_schema, entitynm = entitynm, con = con)
    # Derive the info-array query by renaming the array inside the main query.
    # fixed = TRUE so characters in the array name are never read as a regex.
    array_query_info <- gsub(arraynm, paste0(arraynm, "_INFO"), array_query, fixed = TRUE)
    aop_objs_info <- list(con$aop_connection$afl_expr(array_query_info))
    if (!is.null(semi_join_args)) {
      if (query_attributes) {
        # If we're querying by main array attributes, we can't repeat the same
        # semi_joins on the info array; restrict by the ids already retrieved.
        aop_objs_info[[2]] <- aop_objs_info[[1]]$semi_join(ret[[entitynm]][, entity_dims, drop = FALSE])
      } else {
        # NOTE(review): this uses the caller's original `semi_join_args`, not the
        # dimension-only versions built above for the main array - confirm intended.
        for (i in seq_along(semi_join_args)) {
          aop_objs_info[[i + 1]] <- aop_objs_info[[i]]$semi_join(semi_join_args[[i]])
        }
      }
    }
    ret[[entitynm_info]] <- aop_objs_info[[length(aop_objs_info)]]$to_df_all()
    if (!join_info_array) {
      return(ret)
    }
    return(unpivot_join_arr_to_info(ret[[entitynm]], ret[[entitynm_info]], by = entity_dims))
  }
  ret[[entitynm]]
}

# Widen the key/value info data frame (one column per key) and left-join it
# onto the main data frame.
#
# main: data frame for the entity.
# info: key/value data frame; the key column is whichever of "key" /
#       "metadata_attrkey" is present, the value column whichever of "val" /
#       "metadata_value" is present.
# by:   names of the columns to join on.
#
# Returns `main` unchanged when it has no rows; otherwise `main` with the
# widened info columns appended. Info columns whose names already exist in
# `main` (other than the `by` columns) are dropped before joining.
unpivot_join_arr_to_info <- function(main, info, by) {
  if (nrow(main) == 0) {
    return(main)
  }

  key_cols <- intersect(colnames(info), c("key", "metadata_attrkey"))
  value_cols <- intersect(colnames(info), c("val", "metadata_value"))

  long_info <- info[, c(by, key_cols, value_cols)]
  wide_info <- tidyr::pivot_wider(
    long_info,
    names_from = tidyr::all_of(key_cols),
    values_from = tidyr::all_of(value_cols)
  )
  wide_info <- as.data.frame(wide_info)

  # The main data frame wins on any column name present in both.
  clashing <- setdiff(intersect(colnames(wide_info), colnames(main)), by)
  if (length(clashing) > 0) {
    wide_info[, clashing] <- NULL
  }

  dplyr::left_join(main, wide_info, by = by)
}

#' Search an entity by values in info fields and return results
#'
#' Each element of `attribute_values` is applied in turn, narrowing the info
#' array to the entities that have an info row matching both the key pattern
#' and the value pattern of that element.
#'
#' @param con connection object; uses `con$aop_connection$afl_expr()` and `con$db`
#' @param pkg_schema package schema object, passed on to `get_entity()`, `get_idname()` and `scan_entity()`
#' @param entitynm name of the entity to search. The arrays `<entitynm>_INFO`, `<entitynm>_METADATA_VALUE` and `<entitynm>_METADATA_ATTRKEY` are queried as well
#' @param attribute_values named list (or named vector). Each name is a pattern matched against `metadata_attrkey` with `%like%`; the name `".*"` skips the key filter. The values of an element are collapsed with `"|"` and matched against `metadata_value` with `%like%`. If empty, no info-field filtering is applied
#' @param semi_join_args passed to `get_entity()` to subset the info array before searching
#' @param case_sensitive if FALSE, the `%like%` filters ignore case
#' @param join_info_array if TRUE, return a single data frame with the info fields joined to the main array.  Otherwise, return a list of the two dataframes
#' @param ... passed to `get_entity()` to subset the info array before searching
#'
#' @return A data frame (`join_info_array = TRUE`) or a list of the main and info
#'   data frames named `entitynm` and `paste0(entitynm, "_INFO")`.
#'
#' @export
search_entity_by_info_fields <- function(con,
                              pkg_schema,
                              entitynm,
                              attribute_values,
                              semi_join_args = NULL,
                              case_sensitive = TRUE,
                              join_info_array = TRUE,
                              ...) {
  if (length(attribute_values) > 0 && is.null(names(attribute_values))) {
    stop("attribute_values must be named: names are the info keys to search")
  }

  entitynm_info <- paste0(entitynm, "_INFO")
  metadata_value_entity <- paste0(entitynm, "_METADATA_VALUE")
  metadata_attrkey_entity <- paste0(entitynm, "_METADATA_ATTRKEY")
  entity_dims <- get_idname(pkg_schema, entitynm)
  scan_arraynm <- scan_entity(pkg_schema, entitynm, con = con)
  # TRUE when the caller already restricted the info array before any searching.
  subset_step1 <- !is.null(semi_join_args) || length(list(...)) > 0

  # Each of these lists accumulates successive arrayop objects; the last
  # element is always the current state.
  aop_info <- list(get_entity(con, pkg_schema, entitynm_info, TRUE, query_attributes = FALSE, semi_join_args = semi_join_args, ...))
  aop_metadata_value <- list(get_entity(con, pkg_schema, metadata_value_entity, TRUE))
  aop_metadata_attrkey <- list(get_entity(con, pkg_schema, metadata_attrkey_entity, TRUE))
  aop_info_subset <- list()
  aop_metadata_value_subset <- list()
  aop_metadata_attrkey_subset <- list()

  # seq_along() so that an empty `attribute_values` skips the loop entirely.
  for (i in seq_along(attribute_values)) {
    if (subset_step1 || i != 1) {
      # Restrict the value/key lookup arrays to ids still present in the
      # current info subset. Skipped on the first pass over an unrestricted
      # info array.
      aop_metadata_value[[length(aop_metadata_value) + 1]] <- aop_metadata_value[[length(aop_metadata_value)]]$semi_join(aop_info[[length(aop_info)]]$group_by("metadata_value_id")$summarize(count(metadata_value)))
      aop_metadata_attrkey[[length(aop_metadata_attrkey) + 1]] <- aop_metadata_attrkey[[length(aop_metadata_attrkey)]]$semi_join(aop_info[[length(aop_info)]]$group_by("metadata_attrkey_id")$summarize(count(metadata_attrkey)))
    }
    attrkey_q <- names(attribute_values)[[i]]
    value_q <- paste0(attribute_values[[i]], collapse = "|")
    if (attrkey_q == ".*") {
      # Match-everything key pattern: no filter needed.
      aop_metadata_attrkey_subset[[length(aop_metadata_attrkey_subset) + 1]] <- aop_metadata_attrkey[[length(aop_metadata_attrkey)]]
    } else {
      aop_metadata_attrkey_subset[[length(aop_metadata_attrkey_subset) + 1]] <- aop_metadata_attrkey[[length(aop_metadata_attrkey)]]$filter(metadata_attrkey %like% !!attrkey_q, .ignore_case = !case_sensitive)
    }
    aop_metadata_value_subset[[length(aop_metadata_value_subset) + 1]] <- aop_metadata_value[[length(aop_metadata_value)]]$filter(metadata_value %like% !!value_q, .ignore_case = !case_sensitive)
    # Info rows whose key and value both match this search term.
    aop_info_subset[[length(aop_info_subset) + 1]] <- aop_info[[length(aop_info)]]$semi_join(aop_metadata_attrkey_subset[[length(aop_metadata_attrkey_subset)]])$semi_join(aop_metadata_value_subset[[length(aop_metadata_value_subset)]])
    # Narrow the info array to the entities owning those rows: the matched
    # rows are reduced to the entity dimensions before semi-joining.
    aop_info[[length(aop_info) + 1]] <- aop_info[[length(aop_info)]]$semi_join(aop_info_subset[[length(aop_info_subset)]]$drop_dims()$mutate(.dots = sapply(setdiff(aop_info[[length(aop_info)]]$attrs_n_dims, entity_dims), function(x) NULL)))
  }

  df_info <- aop_info[[length(aop_info)]]$to_df_all()
  if (nrow(df_info) > 0) {
    df_main <- con$aop_connection$afl_expr(scan_arraynm)$semi_join(unique(df_info[, entity_dims, drop = FALSE]))$to_df_all()
  } else {
    # No matches: run the scan with limit 0 to get a zero-row result.
    df_main <- iquery(con$db, paste0("limit(", scan_arraynm, ",0)"), TRUE)
  }
  if (join_info_array) {
    return(unpivot_join_arr_to_info(df_main, df_info, by = entity_dims))
  }
  ret <- list(df_main, df_info)
  names(ret) <- c(entitynm, entitynm_info)
  ret
}
Paradigm4/revealcore documentation built on May 21, 2023, 9:57 a.m.