Nothing
#' Filter and extract function
#'
#' This function lets user to create a new GRangesList with fixed information:
#' seqnames, ranges and strand, and a variable part made up by the regions
#' defined as input. The metadata and metadata_prefix are used to filter
#' the data and choose only the samples that match at least one metdatata
#' with its prefix. The input regions are shown for each sample obtained
#' from filtering.
#'
#' @import xml2
#' @importFrom dplyr bind_cols
#' @importFrom data.table fread
#' @importFrom rtracklayer import
#'
#' @param data string GMQL dataset folder path or GRangesList
#' object
#' @param metadata vector of strings containing names of metadata attributes
#' to be searched for in metadata files.
#' Data will be extracted if at least one condition is satisfied:
#' this condition is logically "ANDed" with prefix filtering (see below)
#' if NULL no filtering action occures
#' (i.e every sample is taken for region filtering)
#' @param metadata_prefix vector of strings that will support the metadata
#' filtering. If defined, each 'metadata' is concatenated with the
#' corresponding prefix.
#' @param region_attributes vector of strings that extracts only region
#' attributes specified; if NULL no regions attribute is taken and the output
#' is only GRanges made up by the region coordinate attributes
#' (seqnames, start, end, strand)
#' @param suffix name for each metadata column of GRanges. By default it is the
#' value of the metadata attribute named "antibody_target". This string is
#' taken from sample metadata file or from metadata() associated.
#' If not present, the column name is the name of selected regions specified
#' by 'region_attributes' input parameter
#'
#' @details
#' This function works only with datatset or GRangesList all whose samples or
#' Granges have the same region coordinates (chr, ranges, strand) ordered in
#' the same way for each sample
#'
#' In case of GRangesList data input, the function searches for metadata
#' into metadata() function associated to GRangesList.
#'
#' @return GRanges with selected regions
#'
#' @examples
#'
#' ## This statement defines the path to the folder "DATASET" in the
#' ## subdirectory "example" of the package "RGMQL" and filters such folder
#' ## dataset including at output only "pvalue" and "peak" region attributes
#'
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' filter_and_extract(test_path, region_attributes = c("pvalue", "peak"))
#'
#' ## This statement imports a GMQL dataset as GRangesList and filters it
#' ## including at output only "pvalue" and "peak" region attributes, the sort
#' ## function makes sure that the region coordinates (chr, ranges, strand)
#' ## of all samples are ordered correctly
#'
#'
#' grl = import_gmql(test_path, TRUE)
#' sorted_grl = sort(grl)
#' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak"))
#'
#'
#' @export
#'
filter_and_extract <- function(data, metadata = NULL,
metadata_prefix = NULL, region_attributes = NULL,
suffix = "antibody_target")
{
if(is(data,"GRangesList"))
.extract_from_GRangesList(data, metadata, metadata_prefix,
region_attributes, suffix)
else
.extract_from_dataset(data, metadata, metadata_prefix,
region_attributes, suffix)
}
.extract_from_dataset <- function(datasetName, metadata, metadata_prefix,
regions, suffix)
{
datasetName <- sub("/*[/]$","",datasetName)
if(basename(datasetName) !="files")
datasetName <- file.path(datasetName,"files")
if(!dir.exists(datasetName))
stop("Directory does not exists")
gdm_meta_files <- list.files(datasetName, pattern = "*.gdm.meta$",
full.names = TRUE)
gtf_meta_files <- list.files(datasetName, pattern = "*.gtf.meta$",
full.names = TRUE)
if(!length(gdm_meta_files) && !length(gtf_meta_files))
stop("no samples present or no files format supported")
if(length(gdm_meta_files) && length(gtf_meta_files))
stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
vector_field <- .schema_header(datasetName)
if(length(gdm_meta_files))
{
samples_file <- .check_metadata_files(metadata,metadata_prefix,
gdm_meta_files)
samples_meta_to_read <- unlist(samples_file)
if(length(samples_meta_to_read))
samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
else
{
samples_to_read <- gsub(".meta$", "", gdm_meta_files)
samples_meta_to_read <- gtf_meta_files
}
suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
granges <- .parse_gdm_files(vector_field,samples_to_read,regions,
suffix_vec)
}
else
{
samples_file <- .check_metadata_files(metadata,metadata_prefix,
gtf_meta_files)
samples_meta_to_read <- unlist(samples_file)
if(length(samples_meta_to_read))
samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
else
{
samples_to_read <- gsub(".meta$", "", gtf_meta_files)
samples_meta_to_read <- gtf_meta_files
}
suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
granges <- .parse_gtf_files(samples_to_read, regions,suffix_vec)
}
}
.extract_from_GRangesList <- function(rangesList, metadata, metadata_prefix,
regions, suffix)
{
if(!is(rangesList,"GRangesList"))
stop("only GrangesList admitted")
if(!length(rangesList))
stop("rangesList empty")
meta_list <- metadata(rangesList)
samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
if(!length(unlist(samples)))
samples <- rangesList
else
{
index <- unlist(samples)
samples <- rangesList[c(index)]
}
new_meta_list <- metadata(samples)
suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
granges <- .parse_Granges(samples,regions,suffix_vec)
}
.parse_Granges <- function(region_list,regions,suffixes)
{
if(is.null(suffixes))
suffixes = ""
g1 <- region_list[[1]]
elementMetadata(g1) <- NULL
if(!is.null(regions))
{
DF_list <- mapply(function(g_x,h){
meta <- elementMetadata(g_x)[regions]
if(h!="")
names(meta) <- paste(regions,h,sep = ".")
data.frame(meta)
},region_list, suffixes, SIMPLIFY = FALSE)
DF_only_regions <- dplyr::bind_cols(DF_list)
elementMetadata(g1) <- DF_only_regions
}
g1
}
.get_suffix <- function(col_name, from_list, meta_fl)
{
suffix <- paste0(col_name,"$")
if(from_list)
{
meta_list <- mapply(function(x,index){
vec_names <- names(x)
s_index <- grep(suffix,vec_names)
first_index <- s_index[1]
suffix <- unlist(x[first_index]) # ne prendo solo uno
names(suffix) <- NULL
#if found retrieve samples that has at least one choosen metadata
if(first_index && !is.na(first_index))
suffix
else
""
}, meta_fl, seq_along(meta_fl))
}
else
{
meta_list <- vapply(meta_fl, function(x){
list <- .add_metadata(x)
vec_names <- names(list)
index <- grep(suffix,vec_names)
first_index <- index[1]
suffix <- unlist(list[first_index]) # ne prendo solo uno
names(suffix) <- NULL
#if found retrieve samples that has at least one choosen metadata
if(first_index && !is.na(first_index))
suffix
else
""
},character(1))
}
names(meta_list) <- NULL
meta_list
}
.check_metadata_list <- function(metadata,metadata_prefix,meta_list)
{
vec_meta <- paste0(metadata_prefix,metadata)
list <- mapply(function(x,index){
vec_names <- names(x)
a <- lapply(vec_meta, function(y) {
which(y == vec_names)
})
## we would like that manage more index from grep
found <- as.logical(length(unlist(a)))
#if found retrieve samples that has at least one choosen metadata
if(found){index}
}, meta_list, seq_along(meta_list))
}
.check_metadata_files <- function(metadata,metadata_prefix,meta_files)
{
vec_meta <- paste0(metadata_prefix,metadata)
meta_list <- lapply(meta_files, function(x){
list <- .add_metadata(x)
vec_names <- names(list)
a <- lapply(vec_meta, function(y)grep(y,vec_names))
## we would like that manage more index from grep
found <- as.logical(length(unlist(a)))
#if found retrieve samples that has at least one choosen metadata
if(found){x}
})
}
.parse_gtf_files <- function(gtf_region_files, regions, suffixes)
{
g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
elementMetadata(g1) <- NULL
if(is.null(suffixes))
suffixes = ""
if(!is.null(regions))
{
DF_list <- mapply(function(x,h){
g_x <- rtracklayer::import(con = x, format = "gtf")
meta <- elementMetadata(g_x)[regions]
if(h!="")
names(meta) <- paste(regions,h,sep = ".")
data.frame(meta)
},gtf_region_files, suffixes, SIMPLIFY = FALSE)
DF_only_regions <- dplyr::bind_cols(DF_list)
elementMetadata(g1) <- DF_only_regions
}
g1
}
.parse_gdm_files <- function(vector_field,gdm_region_files,regions,
suffixes)
{
#read first sample cause chromosome regions are the same for all samples
df <- data.table::fread(gdm_region_files[1],col.names = vector_field,
header = FALSE,sep = '\t')
col_names <- names(df)
df <- subset(df, TRUE, c("chr","left","right","strand"))
if(!is.null(regions))
{
df_list <- lapply(gdm_region_files,function(x,regions,vector_field){
region_frame <- data.table::fread(x,col.names = vector_field,
header = FALSE,sep = '\t')
col_names <- names(region_frame)
#delete column not choosen by input
if(!is.null(regions))
col_names <- col_names[col_names %in% regions]
if(length(col_names))
r <- subset(region_frame,TRUE,col_names)
},regions,vector_field)
df_only_regions <- dplyr::bind_cols(df_list)
complete_df <- dplyr::bind_cols(df,df_only_regions)
region_names <- names(complete_df)[-(seq_len(4))]
region_names <- gsub('[0-9]+', '',region_names)
region_names <- paste(region_names,suffixes,sep = ".")
region_names <- c(names(complete_df)[(seq_len(4))],region_names )
names(complete_df) <- region_names
g <- GenomicRanges::makeGRangesFromDataFrame(complete_df,
keep.extra.columns = TRUE,
start.field = "left",
end.field = "right")
}
else
g <- GenomicRanges::makeGRangesFromDataFrame(df,
keep.extra.columns = TRUE,
start.field = "left",
end.field = "right")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.