#
# BEGIN_COPYRIGHT
#
# PARADIGM4 INC.
# This file is part of the Paradigm4 Enterprise SciDB distribution kit
# and may only be used with a valid Paradigm4 contract and in accord
# with the terms and conditions specified by that contract.
#
# Copyright (C) 2011 - 2017 Paradigm4 Inc.
# All Rights Reserved.
#
# END_COPYRIGHT
#
#' Calculate statistics on Variant data
calculate_statistics_variant = function(measurementset_id = NULL, con = NULL) {
con = use_ghEnv_if_null(con = con)
db = con$db
if (!is.null(measurementset_id)) {
res = iquery(db,
paste0("aggregate(
filter(",
full_arrayname(.ghEnv$meta$arrVariant),
", measurementset_id=", measurementset_id,
"), count(*) AS variant_count, measurementset_id, biosample_id)"),
return = TRUE)
m = get_measurementsets(measurementset_id = measurementset_id)
} else { # for all measurementsets
res = iquery(db,
paste0("aggregate(",
full_arrayname(.ghEnv$meta$arrVariant),
", count(*) AS variant_count, measurementset_id, biosample_id)"),
return = TRUE)
res = res[order(res$measurementset_id, res$biosample_id), ]
res
m = get_measurementsets(measurementset_id = sort(unique(res$measurementset_id)))
}
if (unique(m$entity) != 'VARIANT') {
if (!is.null(measurementset_id)) stop("entity entries should have been of VARIANT type here")
}
m = plyr::rename(m, c('name' = 'pipeline'))
d = get_datasets(dataset_id = unique(sort(m$dataset_id)))
d = plyr::rename(d, c('name' = 'study'))
md = merge(d[, c('dataset_id', 'dataset_version', 'study')],
m[, c('dataset_id', 'dataset_version', 'measurementset_id', 'pipeline')],
by = c('dataset_id', 'dataset_version'))
# res = plyr::rename(res, c('count' = 'variant_count'))
res = res %>%
group_by(measurementset_id) %>%
dplyr::summarise(mean_variants_per_sample = mean(variant_count),
num_samples = n())
mdc = merge(md, res,
by = 'measurementset_id')
mdc
}
#' Calculate statistics on Gene-Expression data
calculate_statistics_rnaquantification = function(measurementset_id = NULL, decorateResults = TRUE, con = NULL){
con = use_ghEnv_if_null(con)
idname = get_base_idname(.ghEnv$meta$arrMeasurementSet)
if (is.null(measurementset_id)){
c = iquery(con$db,
paste0("aggregate(",
custom_scan(), "(",
full_arrayname(.ghEnv$meta$arrRnaquantification), "),",
"count(*), ",
idname,
", dataset_version)"),
return = TRUE)
} else { #specific measurementset_id is specified
qq = paste0("filter(",
custom_scan(), "(",
full_arrayname(.ghEnv$meta$arrRnaquantification), "), ",
idname , "=", measurementset_id, ")")
qq = paste0("aggregate(", qq, ", count(*), ", idname, ", dataset_version)")
c = iquery(con$db, qq, return = T)
}
c = c[order(c[, idname], c$dataset_version), ]
if (!decorateResults) {
return(c)
} else {
ms = get_measurementsets(measurementset_id = unique(sort(c$measurementset_id)),
mandatory_fields_only = TRUE,
all_versions = TRUE,
con = con)
ms = plyr::rename(ms, c('name' = 'pipeline'))
d = get_datasets(dataset_id = unique(sort(c$dataset_id)),
mandatory_fields_only = TRUE,
all_versions = TRUE,
con = con)
d = plyr::rename(d, c('name' = 'study_name'))
res = merge(ms[, c('measurementset_id', 'dataset_id', 'dataset_version', 'pipeline')],
c,
by = c('measurementset_id', 'dataset_version'))
res = merge(d[, c('dataset_id', 'dataset_version', 'study_name')],
res,
by = c('dataset_id', 'dataset_version'))
return(res[order(res$dataset_id,
res$dataset_version,
res$measurementset_id), ])
}
}
#' Calculate summary statistics across measurementsets
#'
#' @examples calculate_statistics_across_measurementsets(
#' df_measurementset = search_measurementsets(dataset_id = 31))
#' @examples calculate_statistics_across_measurementsets(
#' df_measurementset = get_measurementsets(measurementset_id = c(2:5)))
calculate_statistics_across_measurementsets = function(df_measurementset, con = NULL) {
con = use_ghEnv_if_null(con = con)
db = con$db
idx = df_measurementset$measurementset_id
zz = df_measurementset[, c('measurementset_id', 'name', 'entity')]
L1 = lapply(unique(zz$entity),
function(entitynm) {
counts = iquery(db, paste0("aggregate(filter(", custom_scan(), "(", full_arrayname(entitynm), "), ",
formulate_base_selection_query(fullarrayname = .ghEnv$meta$arrMeasurementSet,
id = idx), "),
count(*), measurementset_id)"), return = TRUE)
if (nrow(counts) == 0) {
counts = data.frame(measurementset_id = idx,
count = rep(NA, length(idx)))
}
merge(zz[zz$entity == entitynm, ],
counts, by = 'measurementset_id', all.x = TRUE)
}
)
# names(L1) = unique(zz$entity)
dplyr::bind_rows(L1)
}
#' Calculate statistics across all MeasurementSets by consulting MeasurementData cache
#'
#' Depends on size estimates being stored in cache array under \code{payload_size_bytes} field
calculate_statistics_across_measurementsets_from_measurementdata_cache = function(con = NULL, dataset_df = NULL, measurementset_df = NULL) {
con = revealgenomics:::use_ghEnv_if_null(con = con)
if (is.null(dataset_df)) dataset_df = get_datasets(mandatory_fields_only = T, con = con)
if (is.null(measurementset_df)) measurementset_df = get_measurementsets(mandatory_fields_only = T, con = con)
ms_size = iquery(
con$db,
paste0("project(",
revealgenomics:::custom_scan(), "(",
revealgenomics:::full_arrayname(.ghEnv$meta$arrMeasurementDataCache),
")",
", payload_size_bytes)"),
return = T
)
measurementset_df$dataset_name = dataset_df$name[match(measurementset_df$dataset_id, dataset_df$dataset_id)]
measurementset_df$payload_size_bytes = ms_size$payload_size_bytes[match(measurementset_df$measurementset_id, ms_size$measurementset_id)]
measurementset_df = measurementset_df[!is.na(measurementset_df$payload_size_bytes), ]
measurementset_df[order(-measurementset_df$payload_size_bytes), ] %>%
select(dataset_version, dataset_id, measurementset_id, dataset_name, entity, name, payload_size_bytes, everything())
}
#' Aggregate project, study info by phenotype column(s)
#'
#' Aggregate project, study info by phenotype column(s)
#'
#' Aggregate project and study information by selected phenotype columns
#' For example, `disease` is phenotype related information that is retained under the
#' `BIOSAMPLE` entity. If we specify `filter_column = 'primary_disease'`, then
#' the function aggregates counts by disease by study. Function also joins in
#' project id and name
#'
#' @param filter_column one or more filte rcolumns to aggregate by
#'
#' @examples aggr_proj_study_by_pheno(filter_column = c('sample_cell_type', 'sample_molecule_type'))
#' returns
#' study_id study_version project_id project_name study_name filter_column filter_value total
#' 528 237 1 1 MMRF CoMMpass MMRF CoMMpass IA13a primary_disease multiple myeloma 2035
#' 529 238 1 1 MMRF CoMMpass MMRF CoMMpass IA14a primary_disease multiple myeloma 2072
aggr_proj_study_by_pheno = function(
filter_column, method = 1, con = NULL
) {
stop("Need to adjust function to not use old ontology implementation -- ",
"old implementation had `get_ontology()` and `search_ontology()` ",
"return `ontology_id` column. Now `search_ontology()` is unavailable",
" and `get_ontology()` returns columns `category` and `term` only",
"\n\nConsult older version of function to get original implementation")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.