R/GMQLtoGRanges.R

Defines functions .importGMQL.gdm .importGMQL.gtf import_gmql

Documented in import_gmql

#' Create GRangesList from GMQL dataset
#'
#' It creates a GRangesList from GMQL samples in dataset. 
#' It reads sample files in GTF or GDM/tab-delimited format.
#'
#' @importFrom rtracklayer import
#' @importClassesFrom GenomicRanges GRangesList
#' @importFrom S4Vectors metadata
#' @importFrom utils read.delim
#' @import xml2
#'
#' @param dataset_path string with GMQL dataset folder path
#' @param is_gtf logical value indicating if dataset samples are in GTF format;
#' if TRUE and dataset does not contain GTF samples, an error occurs 
#' 
#' @return GRangesList containing all GMQL samples in dataset
#'
#' @seealso \code{\link{export_gmql}}
#'
#' @examples
#' 
#' ## This statement defines the path to the subdirectory "example" of the 
#' ## package "RGMQL" and imports as GRangesList the contained GMQL dataset
#' 
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' grl = import_gmql(test_path, TRUE)
#'
#'
#' @export
#'
import_gmql <- function(dataset_path, is_gtf) {
    if(is_gtf)
        .importGMQL.gtf(dataset_path)
    else
        .importGMQL.gdm(dataset_path)
}

.importGMQL.gtf <- function(datasetName) {
    datasetName <- sub("/*[/]$","",datasetName)
    if(basename(datasetName) !="files")
        datasetName <- file.path(datasetName,"files")
    
    if(!dir.exists(datasetName))
        stop("Directory does not exists")
    
    if(!length(list.files(datasetName)))
        stop("no samples present in this dataset")
    
    attr_col_names <- .schema_header(datasetName)
    attr_col_names <- attr_col_names[
        !attr_col_names %in% c("seqname", "seqid", "start", "end", "strand")]
    regions <- list.files(datasetName, pattern = "*.gtf$",full.names = TRUE)
    if(length(regions)) {
        name_samples <- lapply(regions, function(x) {
            gsub("*.gtf", "", basename(x))})
        
        sampleList <- lapply(regions, function(x) {
            rtracklayer::import(
                con = x, 
                format = "gtf", 
                colnames = attr_col_names
            )
        })
        
        names(sampleList) <- name_samples
        gRange_list <- GenomicRanges::GRangesList(sampleList)
        
    } else
        stop("No GTF files present")
    
    meta <- list.files(datasetName, pattern = "*.gtf.meta$",full.names = TRUE)
    if(length(meta)) {
        meta_list <- lapply(meta, .add_metadata)
        names(meta_list) <- name_samples
        
    } else
        stop("No meta GTF files present")
    
    S4Vectors::metadata(gRange_list) <- meta_list
    return(gRange_list)
}

.importGMQL.gdm <- function(datasetName) {
    datasetName <- sub("/*[/]$","",datasetName)
    if(basename(datasetName) !="files")
        datasetName <- file.path(datasetName,"files")
    
    if(!dir.exists(datasetName))
        stop("Directory does not exists")
    
    if(!length(list.files(datasetName)))
        stop("no samples present in this dataset")
    
    regions <- list.files(datasetName, pattern = "*.gdm$",full.names = TRUE)
    if(length(regions)) {
        name_samples <- lapply(regions, function(x){
            gsub("*.gdm", "",basename(x))})
        vector_field <- .schema_header(datasetName)
        type_and_coord <- .schema_type_coordinate(datasetName)
        names(vector_field) <- NULL
        if(type_and_coord$coordinate_system %in% c("1-based")) {
            sampleList <- lapply(regions,function(x){
                df <- read.delim(x,col.names = vector_field,header = FALSE)
                g <- GenomicRanges::makeGRangesFromDataFrame(
                    df,
                    keep.extra.columns = TRUE,
                    seqnames.field = c("seqnames", "seqname",
                                       "chromosome", "chrom",
                                       "chr", "chromosome_name"),
                    start.field = "left",
                    end.field = "right")
            })
        } else {
            sampleList <- lapply(regions,function(x) {
                df <- read.delim(x,col.names = vector_field,header = FALSE)
                df$left = df$left +1
                g <- GenomicRanges::makeGRangesFromDataFrame(
                    df,
                    keep.extra.columns = TRUE,
                    seqnames.field = c("seqnames", "seqname",
                                       "chromosome", "chrom",
                                       "chr", "chromosome_name"),
                    start.field = "left",
                    end.field = "right")
                })
        }
        
        names(sampleList) <- name_samples
        gRange_list <- GenomicRanges::GRangesList(sampleList)
        
    } else
        stop("No GDM files present")
    
    meta <- list.files(datasetName, pattern = "*.gdm.meta$",full.names = TRUE)
    if(length(meta)) {
        meta_list <- lapply(meta, .add_metadata)
        names(meta_list) <- name_samples
    } else
        stop("No meta GDM files present")
    
    S4Vectors::metadata(gRange_list) <- meta_list
    return(gRange_list)
}
DEIB-GECO/RGMQL documentation built on Feb. 17, 2024, 10:39 p.m.