#' Produce a subset of data following predefined criteria
#'
#' After running the \code{prepare_ENCODEDb} function, this function will allow
#' you to extract a subset of data encording to the following criteria :
#' accession, assay name, biosample, dataset accession, file accession,
#' file format, laboratory, donor organism, target and treatment.
#'
#' By default, the query can be made on an exact match term. This behaviour can
#' be modified by setting the \code{fixed} argument at \code{TRUE}
#'
#' @param df \code{list} of two \code{data.frame} containing ENCODE
#' experiment and dataset metadata
#' @param set_accession character string to select the experiment or dataset accession
#' @param assay character string to select the assay type
#' @param biosample character string to select the biosample name
#' @param dataset_accession character string to select the dataset accession
#' @param file_accession character string to select the file accesion
#' @param file_format character string to select the file format
#' @param lab character string to select the laboratory
#' @param organism character string to select the donor organism
#' @param target character string to select the experimental target
#' @param treatment character string to select the treatment
#' @param file_status character string to select the file status
#' ("released", "revoked", "all"). Default "released"
#' @param status character string to select the dataset/experiment status
#' @param fixed logical. If TRUE, pattern is a string to be matched as it is.
#'
#' @return a \code{list} of two \code{data.frame}s containing data about ENCODE
#' experiments and datasets
#'
#' @examples
#' queryEncode(biosample = "A549", file_format = "bam")
#'
#' @export
queryEncode <- function(df = NULL, set_accession = NULL, assay = NULL, biosample = NULL,
dataset_accession = NULL, file_accession = NULL, file_format = NULL,
lab = NULL, organism = NULL, target = NULL, treatment = NULL,
file_status = "released", status = "released", fixed = TRUE) {
if(is.null(df)) {data(encode_df, envir = environment())} else {encode_df = df}
if(is.null(set_accession) && is.null(assay) && is.null(biosample) && is.null(dataset_accession) &&
is.null(file_accession) && is.null(file_format) && is.null(lab) && is.null(organism) &&
is.null(target) && is.null(treatment))
{
warning("Please provide at least one valid criteria", call. = FALSE)
NULL
} else {
s1 = encode_df$experiment
s2 = encode_df$dataset
ac = set_accession
as = assay
bs = biosample
da = dataset_accession
fa = file_accession
ff = file_format
lb = lab
og = organism
tg = target
tr = treatment
es = status
fs = file_status
if(fixed) {
if(!is.null(ac)) {
s1 <- subset(s1, s1$accession == ac)
s2 <- subset(s2, s2$accession == ac)
}
if(!is.null(as)) {
s1 <- subset(s1, s1$assay == as)
s2 <- subset(s2, s2$assay == as)
}
if(!is.null(bs)) {
s1 <- subset(s1, s1$biosample_name == bs)
s2 <- subset(s2, s2$biosample_name == bs)
}
if(!is.null(da)) {
s1 <- subset(s1, s1$dataset_accession == da)
s2 <- subset(s2, s2$accession == da)
}
if(!is.null(fa)) {
s1 <- subset(s1, s1$file_accession == fa)
s2 <- subset(s2, s2$file_accession == fa)
}
if(!is.null(ff)) {
s1 <- subset(s1, s1$file_format == ff)
s2 <- subset(s2, s2$file_format == ff)
}
if(!is.null(lb)) {
s1 <- subset(s1, s1$lab == lb)
s2 <- subset(s2, s2$lab == lb)
}
if(!is.null(og)) {
s1 <- subset(s1, s1$organism == og)
s2 <- subset(s2, s2$organism == og)
}
if(!is.null(tg)) {
s1 <- subset(s1, s1$target == tg)
s2 <- subset(s2, s2$target == tg)
}
if(!is.null(tr)) {
s1 <- subset(s1, s1$treatment == tr)
s2 <- subset(s2, s1$treatment == tr)
}
if(fs != "all") {
s1 <- subset(s1, s1$file_status == fs)
s2 <- subset(s2, s2$file_status == fs)
}
if(es != "all") {
s1 <- subset(s1, s1$status == es)
s2 <- subset(s2, s2$status == es)
}
} else {
# retirer ignorer les espaces, les tirets et la casse
# m cf 7 = MCf7 = mcf-7 = MCF-7 ... etc
if(!is.null(ac)) {
query.transfo = query_transform(ac)
select.entries = grepl(x = s1$accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(as)) {
query.transfo = query_transform(as)
select.entries = grepl(x = s1$assay, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$assay, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(bs)) {
query.transfo = query_transform(bs)
select.entries = grepl(x = s1$biosample_name, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$biosample_name, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(da)) {
query.transfo = query_transform(da)
select.entries = grepl(x = s1$dataset_accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(fa)) {
query.transfo = query_transform(fa)
select.entries = grepl(x = s1$file_accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$file_accession, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(ff)) {
query.transfo = query_transform(ff)
select.entries = grepl(x = s1$file_format, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$file_format, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(lb)) {
query.transfo = query_transform(lb)
select.entries = grepl(x = s1$lab, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$lab, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(og)) {
query.transfo = query_transform(og)
select.entries = grepl(x = s1$organism, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$organism, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(tg)) {
query.transfo = query_transform(tg)
select.entries = grepl(x = s1$target, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$target, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(!is.null(tr)) {
query.transfo = query_transform(tr)
select.entries = grepl(x = s1$treatment, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$treatment, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(fs != "all") {
query.transfo = query_transform(fs)
select.entries = grepl(x = s1$file_status, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$file_status, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
if(es != "all") {
query.transfo = query_transform(es)
select.entries = grepl(x = s1$status, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s1 = s1[select.entries,]
select.entries = grepl(x = s2$status, pattern = query.transfo,
ignore.case =TRUE, perl =TRUE)
s2 = s2[select.entries,]
}
}
if((nrow(s1) + nrow(s2)) == 0) {
warning("No result found. You can try the <searchEncode> function or set the fixed option to FALSE", call. = FALSE)
NULL
}
else
{
query_results = list(experiment = s1, dataset = s2)
print(paste0("experiment results : ",nrow(query_results$experiment)," files in ",length(unique(query_results$experiment$accession))," experiments ; dataset results : ",nrow(query_results$dataset), " files"))
query_results
}
}
}
query_transform <- function(my.term) {
my.term = gsub(my.term, pattern = " ", replacement = "", fixed =TRUE)
my.term = gsub(my.term, pattern = "-", replacement = "", fixed =TRUE)
my.term = gsub(my.term, pattern = ",", replacement = "", fixed =TRUE)
my.term = strsplit(my.term, split = "")[[1]]
my.term.4.grep = paste0("^",paste(my.term, collapse = "[ ,-]?"))
my.term.4.grep
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.