R/00_importRankings.R

Defines functions getColumnNames getRowNames importRankings

Documented in getColumnNames getRowNames importRankings

#' @title Import the motif databases for RcisTarget.
#' @param dbFile .feather or .parquet file containing the rankings
#' @param columns Columns to load from the .feather or .parquet file
#' (e.g. to read only specific genes or regions)
#' @param dbDescr Description fields (not used internally) e.g.:
#' \code{dbDescr=list(colType="gene", rowType="motif",
#' org="Human", genome="hg19", maxRank=Inf, description="")}
#' @param indexCol Column name containing the feature IDs (e.g. motif names or chip-seq tracks). 
#' @param warnMissingColumns If 'columns' is provided, warn if any ID is not available in the rankings? 
#' @description
#' The rankings are typically loaded from a .feather or .parquet file
#' with \code{importRankings()}.
#' @return
#' rankingRcisTarget object with the following slots:
#' \itemize{
#' \item rankings: data.frame containing the rankings
#' \item colType: 'gene' or 'region'
#' \item nColsInDB: Number of columns (e.g. genes/regions) available
#' in the database (.feather or .parquet file).
#' Note that not all might be loaded in the current object.
#' \item rowType: 'motif' or the type of feature stored (e.g. ChipSeq)
#' \item org: human/mouse/fly
#' \item genome: hg19, mm9, ...
#' \item description: global description, summary, or any other information
#' \item maxRank: Maximum ranking included in the database,
#' higher values are converted to Inf.
#' }

#' @examples
#' ## Loading from a .feather or .parquet file (the .descr file is read automatically):
#' #motifRankings<-importRankings("hg19-500bp-upstream-7species.mc9nr.feather")
#' #motifRankings<-importRankings("hg19-500bp-upstream-7species.mc9nr.parquet")
#'
#' ## The annotations for Motif collection 9 (suffix 'mc9nr')
#' # are already included in RcisTarget, and can be loaded with:
#' data(motifAnnotations_hgnc)
#'
#' ## For other versions, import the appropriate annotation. e.g.:
#' # annotDb <- importAnnotations("motifs-v9-nr.hgnc-m0.001-o0.0.tbl")
#' # optional: motifsInRanking <- getRanking(motifRankings)$features

##### Load/import the ranking from a feather file:
#' @rdname importRankings
#' @import feather
#' @import utils
#' @export
importRankings <- function(dbFile, columns=NULL, dbDescr=NULL, indexCol="features", warnMissingColumns=TRUE)
{
  # Reads a rankings database (.feather or .parquet) and wraps it, together
  # with its description fields, into a 'rankingRcisTarget' object.
  #
  # dbFile:  path to the database file ('~' is expanded).
  # columns: optional subset of columns to load; 'indexCol' is always added.
  #          IDs that getColumnNames() does not report are dropped.
  # dbDescr: optional description fields (list, named vector, one-column
  #          matrix or data.frame with the field names as (row) names).
  #          Field names are matched case-insensitively; fields that are not
  #          supplied keep the defaults defined below.
  # warnMissingColumns: whether to warn about the IDs dropped from 'columns'.
  dbFile <- path.expand(dbFile)
  if(!file.exists(dbFile)) stop("File does not exist: ", dbFile)

  # The extension is the text after the last dot. It is validated up front so
  # that an unsupported file fails with a clear message before anything is read.
  fileParts <- strsplit(dbFile, ".", fixed=TRUE)[[1]]
  extension <- fileParts[length(fileParts)]
  if(!extension %in% c("feather", "parquet"))
    stop("Database format must be feather or parquet.")

  if(!is.null(columns)){
    availableColumns <- getColumnNames(dbFile)
    # The index column is always loaded, so it is never reported as missing
    missingColumns <- setdiff(columns, c(indexCol, availableColumns))
    if(length(missingColumns) > 0 && warnMissingColumns)
      warning("The following columns are missing from the database: ", paste(missingColumns, collapse=", "))
    # Unavailable columns are dropped whether or not the warning is requested
    columns <- unique(c(indexCol, columns[columns %in% availableColumns]))
  }

  if (extension == "feather"){
    rnks <- feather::read_feather(dbFile, columns=columns) # tibble
    #rnks <- data.frame... #to avoid replacing dash in names: check.names=FALSE
    nColsInDB <- feather::feather_metadata(dbFile)[["dim"]][2]-1 # minus the index column
  } else {
    rnks <- arrow::read_parquet(dbFile, columns = columns)
    pq <- arrow::parquet_file_reader(dbFile)
    nColsInDB <- pq$GetSchema()$num_fields()-1 # minus the index column
  }

  # Companion description file: same path with the extension replaced.
  # Only the trailing extension is swapped (dbFile ends with ".<extension>").
  dbFile_descr <- paste0(substr(dbFile, 1, nchar(dbFile) - nchar(extension)), "descr")
  if(!is.null(dbDescr))
  {
    if(file.exists(dbFile_descr))
      warning("Ignoring the DB file description (.descr)")
  } else if(file.exists(dbFile_descr)) {
    dbDescr <- utils::read.table(file=dbFile_descr,
                          sep = "\t", row.names=1, stringsAsFactors=FALSE)
    message("Imported description file:\n",
            paste("\t", unname(vapply(rownames(dbDescr),
          function(x) paste(x, dbDescr[x,1], sep=": "), character(1))), collapse="\n"))
  }

  # Defaults used for any field that is not provided (keys are lower case)
  descr <- list(coltype="column",
                rowtype="row",
                org="",
                genome="",
                maxrank=Inf,
                description="")
  if(!is.null(dbDescr))
  {
    # Flatten whatever was supplied into a named list: field name -> value
    if(is.data.frame(dbDescr) || is.matrix(dbDescr))
    {
      provided <- as.list(dbDescr[, 1])
      names(provided) <- rownames(dbDescr)
    } else {
      provided <- as.list(dbDescr)
    }
    if(!is.null(names(provided)))
    {
      names(provided) <- tolower(names(provided))
      descr[names(provided)] <- provided
    }
  }

  # The number of columns always comes from the file itself
  descr[["ncolsavailable"]] <- nColsInDB
  descr[["description"]] <- paste0(descr[["description"]],
                                   " [Source file: ", basename(dbFile),"]")

  new("rankingRcisTarget",
      rankings=rnks,
      colType=as.character(descr[["coltype"]]),
      rowType=as.character(descr[["rowtype"]]),
      org=as.character(descr[["org"]]),
      genome=as.character(descr[["genome"]]),
      nColsInDB=as.numeric(descr[["ncolsavailable"]]),
      maxRank = as.numeric(descr[["maxrank"]]),
      description=as.character(descr[["description"]]))
}

#' @rdname importRankings
#' @import feather
#' @export
getRowNames <- function(dbFile)
{
  # Returns the values stored in the first column of a feather database
  # (presumably the row/feature identifiers of the rankings), flattened
  # with unlist().
  #
  # dbFile: path to the database file; '~' is expanded before reading.
  # Errors: "Not implemented" for .parquet files, and an explicit format
  #         error for any other extension.
  dbPath <- dbFile
  # The extension is the text after the last dot
  pathParts <- strsplit(dbPath, ".", fixed=TRUE)[[1]]
  extension <- pathParts[length(pathParts)]
  if (identical(extension, "feather")){
    ret <- unlist(feather::read_feather(path.expand(dbPath), columns=1))
  } else if (identical(extension, "parquet")){
    stop("Not implemented") # TODO: add arrow
  } else {
    # Without this branch 'ret' would be undefined at the return below
    stop("Database format must be feather or parquet.")
  }
  return(ret)
}

#' @rdname importRankings
#' @import feather
#' @export
getColumnNames <- function(dbFile) # TODO: Check if they are really genes/regions
{
  # Returns the column names of a feather database, excluding the first
  # column (presumably the index column holding the feature IDs).
  # Only the file metadata is queried (feather::feather_metadata).
  #
  # dbFile: path to the database file; '~' is expanded before reading.
  # Errors: "Not implemented" for .parquet files, and an explicit format
  #         error for any other extension.
  dbPath <- dbFile
  # The extension is the text after the last dot
  pathParts <- strsplit(dbPath, ".", fixed=TRUE)[[1]]
  extension <- pathParts[length(pathParts)]
  if (identical(extension, "feather")){
    ret <- names(feather::feather_metadata(path=path.expand(dbPath))$types)[-1]
  } else if (identical(extension, "parquet")){
    stop("Not implemented") # TODO: add arrow
  } else {
    # Without this branch 'ret' would be undefined at the return below
    stop("Database format must be feather or parquet.")
  }
  return(ret)
}
# 
# getGeneNames <- getRegionNames <- getColumnNames 

Try the RcisTarget package in your browser

Any scripts or data that you put into this service are public.

RcisTarget documentation built on Nov. 8, 2020, 6:57 p.m.