# Source file: R/functions-search-index.R

# Defines functions: search_entity_flex_fields, find_dataset_id_by_metadata_value

# Documented in: find_dataset_id_by_metadata_value

# Fetch the entity flex-field records of one dataset at one dataset version.
#
# Arguments:
#   dataset_id       a single dataset id (length 1 is enforced)
#   dataset_version  a single dataset version (length 1 is enforced); default 1
#   con              connection object, resolved through use_ghEnv_if_null();
#                    its `db` element is handed to iquery()
#
# Returns the rows of the query result whose `dataset_version` column equals
# the requested version.
search_entity_flex_fields = function(dataset_id, dataset_version = 1, con = NULL) {
  con = use_ghEnv_if_null(con = con)
  stopifnot(length(dataset_id) == 1)
  stopifnot(length(dataset_version) == 1)
  arrnm = full_arrayname(.ghEnv$meta$arrEntityFlexFields)
  # Only dataset_id is filtered inside the query; the version filter is applied
  # afterwards in R on the downloaded result.
  query = paste0("filter(", arrnm, ", dataset_id = ", dataset_id, ")")
  res = iquery(con$db, query, return = TRUE)
  # drop = FALSE: never collapse the result to a bare vector
  res[res$dataset_version == dataset_version, , drop = FALSE]
}

#' Find matching datasets by searching across entities
#' 
#' Looks up the supplied \code{metadata_value_id}-s in the entity flex-fields
#' array, counts the matching records per dataset and entity (optionally also
#' per attribute key and / or metadata value), and attaches the dataset name.
#' 
#' @param metadata_value_df portion of \code{get_metadata_value()}, must contain the column \code{metadata_value_id}
#'   (and the column \code{metadata_value} when \code{return_metadata_value = TRUE})
#' @param return_attribute_col if \code{TRUE}, counts are additionally grouped by
#'   \code{metadata_attrkey_id} and the result carries an \code{attribute_column} column
#' @param return_metadata_value if \code{TRUE}, counts are additionally grouped by
#'   \code{metadata_value_id} and the result carries a \code{metadata_value} column
#'   looked up from \code{metadata_value_df}
#' @param return_metadata_id currently not used by this function
#' @param con connection object; resolved through \code{use_ghEnv_if_null()}
#' 
#' @return a data.frame ordered by \code{dataset_id} and \code{entity_id} with the columns
#'   \code{dataset_id}, \code{dataset_name}, \code{entity}, \code{count} and, when requested,
#'   \code{attribute_column} and \code{metadata_value}. When no
#'   \code{metadata_value_id} is supplied, a zero-row data.frame with the columns
#'   \code{dataset_id}, \code{dataset_name}, \code{entity_id}, \code{entity}, \code{count}
#'   is returned without querying the database.
#' 
#' @export
find_dataset_id_by_metadata_value = function(metadata_value_df, 
                                             return_attribute_col = FALSE, 
                                             return_metadata_value = FALSE, 
                                             return_metadata_id = FALSE,
                                             con = NULL) {
  con = use_ghEnv_if_null(con = con)
  # `[[` instead of `$`: exact column lookup, no partial matching.
  # unique(): a repeated id would appear twice in the build literal below and
  # inflate the per-dataset counts produced by the join.
  metadata_value_id = metadata_value_df[["metadata_value_id"]]
  metadata_value_id = as.integer(sort(unique(metadata_value_id)))
  if (length(metadata_value_id) == 0) {
    # NOTE(review): this empty frame carries `entity_id`, which the non-empty
    # result drops; shape kept as-is for backward compatibility.
    return(data.frame(
      dataset_id = integer(),
      dataset_name = character(),
      entity_id = integer(), 
      entity = character(), 
      count = integer(),
      stringsAsFactors = FALSE
    ))
  }
  # `$metadata_value` would partially match `metadata_value_id` on a plain
  # data.frame and silently hand back ids as values; fail loudly instead.
  if (return_metadata_value && is.null(metadata_value_df[["metadata_value"]])) {
    stop("metadata_value_df must contain the column metadata_value when return_metadata_value = TRUE")
  }
  if (length(metadata_value_id) <= 20) {
    message("Searching by following metadata_value id-s: (", 
            pretty_print(metadata_value_id, prettify_after = 20), ")")
  } else {
    message("Searching by following metadata_value id-s: ", 
            pretty_print(metadata_value_id))
  }
  # One-dimensional literal array holding the ids to search for
  build_literal_query = paste0(
    "build(<metadata_value_id:int64>[idx=0:", 
    length(metadata_value_id) - 1, 
    "], '[", 
    paste0(metadata_value_id, collapse = ", "),
    "]', true)"
  )
  # Restrict the flex-fields array to records carrying one of those ids
  eq_join_query = formulate_equi_join_query(
    left_array_or_query = paste0(
      custom_scan(), "(", 
      full_arrayname(.ghEnv$meta$arrEntityFlexFields), 
      ")"
    ), 
    right_array_or_query = build_literal_query,
    left_fields_to_join_by = 'metadata_value_id',
    right_fields_to_join_by = 'metadata_value_id',
    keep_dimensions = TRUE, 
    con = con
  )
  # Grouping fields: always dataset and entity, optionally attribute key and
  # metadata value in between (a FALSE `if` yields NULL, which c() drops).
  group_by_fields = c(
    'dataset_id',
    if (return_attribute_col) 'metadata_attrkey_id',
    if (return_metadata_value) 'metadata_value_id',
    'entity_id'
  )
  aggregate_query = paste0(
    "grouped_aggregate(",
    eq_join_query, 
    ", count(*), ", 
    paste0(group_by_fields, collapse = ", "), 
    ")"
  )
  # Attach the dataset name to every aggregated row
  join_dataset_name = formulate_equi_join_query(
    left_array_or_query = paste0(
      "project(",
      custom_scan(), "(", 
      full_arrayname(.ghEnv$meta$arrDataset), 
      ")", 
      ", name)"
    ),
    right_array_or_query = aggregate_query,
    left_fields_to_join_by = 'dataset_id',
    right_fields_to_join_by = 'dataset_id', 
    con = con
  )
  res = drop_equi_join_dims(iquery(con$db, join_dataset_name, return = TRUE))
  res$entity = get_entity_from_entity_id(entity_id = res$entity_id)
  res = plyr::rename(res, c('name' = 'dataset_name'))
  res = res[order(res$dataset_id, res$entity_id), ]
  if (return_attribute_col) {
    # Translate attribute key ids into their names
    attribute_columns = get_metadata_attrkey(
      metadata_attrkey_id = unique(res$metadata_attrkey_id), 
      con = con
    )
    attrkey_pos = match(res$metadata_attrkey_id, attribute_columns$metadata_attrkey_id)
    res$attribute_column = attribute_columns$metadata_attrkey[attrkey_pos]
  } 
  if (return_metadata_value) {
    # Translate metadata value ids back into the values supplied by the caller
    m1 = find_matches_and_return_indices(
      res$metadata_value_id, 
      metadata_value_df[["metadata_value_id"]]
    )
    if (length(m1$source_unmatched_idx) > 0) stop("Did not expect metadata_value_id-s outside those supplied by user")
    res$metadata_value = metadata_value_df[["metadata_value"]][m1$target_matched_idx]
  } 
  # Reorder columns (and drop the helper id columns)
  potential_cols = c('dataset_id', 'dataset_name', 'entity', 'attribute_column', 'metadata_value', 'count')
  return(res[, potential_cols[potential_cols %in% colnames(res)], drop = FALSE])
}
# Paradigm4/revealgenomics documentation built on April 7, 2020, 2:01 a.m.