################################################################################
# UTILITY FUNCTIONS
################################################################################
# These are functions copied over from my repository of utilities used
# by this package. They are repeated here simply for portability, so this
# package can be deployed on systems without access to my utilities.
# Any changes should probably be backported to the primary functions rather
# than in these convenience duplications.
#
# These functions should probably remain interior to the package (not exported)
#
#' setSharedDataDir
#' Sets global variable specifying the default data directory.
#'
#' @export
#' @param sharedDataDir directory where the shared data is stored.
#' @return No return value.
#' @examples
#' setSharedDataDir("project/data")
setSharedDataDir = function(sharedDataDir) {
options(SHARE.DATA.DIR=sharedDataDir)
return()
}
countOverlapsAny = function(subj, quer, cores=1) {
setLapplyAlias(cores)
l = unlist(lapplyAlias(subj, function(x) { sum(overlapsAny(x, quer)) } ))
return(l)
}
################################################################################
# INTERNAL FUNCTIONS - not exported
################################################################################
#' cleanws takes multi-line, code formatted strings and just formats them
#' as simple strings
#' @param string string to clean
#' @return A string with all consecutive whitespace characters, including
#' tabs and newlines, merged into a single space.
cleanws = function(string) {
return(gsub('\\s+'," ", string))
}
#' This will change the string in filename to have a new extension
#' @param filename string to convert
#' @param extension new extension
#' @return Filename with original extension deleted, replaced by provided
#' extension
replaceFileExtension = function(filename, extension) {
sub("\\..*$", enforceEdgeCharacter(extension, prependChar="."),
paste0(filename, "."))
}
#' Just a reverser. Reverses the order of arguments and passes them
#' untouched to countOverlapsAny -- so you can use it with lapply.
#'
#' @param subj Subject
#' @param quer Query
#' @return Results from countOverlaps
countOverlapsAnyRev = function(subj, quer) {
countOverlapsAny(quer, subj)
}
#' converts a list of GRanges into a GRangesList; strips all metadata.
#' @param lst a list of GRanges objects
#' @return a GRangesList object
listToGRangesList = function(lst) {
if(!is(lst, "GRangesList")) {
if ("list" %in% class(lst)) {
#strip elementMetadata
lst = lapply(lst, function(x) { values(x) <- NULL; return(x) } )
lst = GRangesList(lst)
} else {
message(cleanws("Converting GRanges to GRangesList."))
lst = GRangesList(list(lst))
}
}
return(lst)
}
#' Wrapper of write.table that provides defaults to write a
#' simple .tsv file. Passes additional arguments to write.table
#'
#' @param ... Additional arguments passed to write.table
#' @return No return value
write.tsv = function(...) {
write.table(..., sep="\t", row.names=FALSE, quote=FALSE)
}
######################################################################
#' Imports bed files and creates GRanges objects, using the fread()
#' function from data.table.
#'
#' @param file File name of bed file.
#' @return GRanges Object
#' @export
#' @examples
#' a = readBed(system.file("extdata", "examples/combined_regions.bed",
#' package="LOLA"))
readBed = function(file) {
DT = fread(file)
# bed specification says:
# 1=chr, 2=start, 3=end, 4=name, 5=score (discarded), 6=strand.
cn = rep(NA, 6)
readCols = colnames(DT)
cn[seq_along(readCols)] = readCols
# BED files are 0-based, but internal bioc representations use 1-based coords.
DT[, (colnames(DT)[2]) := (get(colnames(DT)[2]) + 1)]
# Convert any unknown strands to '*'
DT[, (cn[3])]
tfbsgr = dtToGr(DT, chr=cn[1], start=cn[2], end=cn[3],
name=cn[4], strand=cn[6])
return(tfbsgr)
}
#Two utility functions for converting data.tables into GRanges objects
#genes = dtToGR(gModels, "chr", "txStart", "txEnd", "strand", "geneId")
dtToGrInternal = function(DT, chr, start, end=NA,
strand=NA, name=NA, metaCols=NA) {
if (is.na(end)) {
end = start
}
if (is.na(strand)) {
gr=GRanges(
seqnames=DT[[`chr`]],
ranges=IRanges(start=DT[[`start`]],
end=DT[[`end`]]),
strand="*"
)
} else {
# GRanges can only handle '*' for no strand, so replace any non-accepted
# characters with '*'
DT[,strand:=as.character(strand)]
DT[strand=="1", strand:="+"]
DT[strand=="-1", strand:="-"]
DT[[`strand`]] = gsub("[^+-].*", "*", DT[[`strand`]])
gr=GRanges(
seqnames=DT[[`chr`]],
ranges=IRanges(start=DT[[`start`]],
end=DT[[`end`]]),
strand=DT[[`strand`]]
)
}
if (! is.na(name) ) {
names(gr) = DT[[`name`]]
} else {
names(gr) = seq_along(gr)
}
# This code is not used in LOLA:
#if(! is.na(metaCols)) {
# for(x in metaCols) {
# elementMetadata(gr)[[`x`]]=DT[[`x`]]
# }
#}
gr
}
dtToGr = function(DT, chr="chr", start="start", end=NA, strand=NA, name=NA,
splitFactor=NA, metaCols=NA) {
if(is.na(splitFactor)) {
return(dtToGrInternal(DT, chr, start, end, strand, name,metaCols))
}
if ( length(splitFactor) == 1 ) {
if( splitFactor %in% colnames(DT) ) {
splitFactor = DT[, get(splitFactor)]
}
}
lapply(split(seq_len(nrow(DT)), splitFactor),
function(x) {
dtToGrInternal(DT[x,], chr, start, end, strand, name,metaCols)
}
)
}
# If you want to use the GenomicRanges countOverlaps function, but you want to
# do it in an lapply, that will work... but you can only do it in one direction.
# If you want to lapply on the opposite argument, you can't do it (because
# countOverlaps is not symmetric: it depends on which argument comes first).
# If you want to do an lapply, but countOverlaps with the query as the second
# argument instead of the first, you can use this function to simply reverse
# the order of the arguments.
# This is used in the enrichment calculations (originally from the EWS
# project; 2014, CeMM).
countOverlapsRev = function(query, subject, ...) {
return(countOverlaps(subject, query, ...))
}
# For windows, let's try this.
countFileLinesBackup = function(filename) {
# From https://stackoverflow.com/a/23456450/946721
f = file(filename, open="rb")
nlines <- 0L
while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
nlines = nlines + sum(chunk == as.raw(10L))
}
close(f)
return(as.numeric(nlines))
}
# Parses result of system "wc" wordcount to return the number of lines
# in a file into R.
countFileLines = function(filename) {
if (!file.exists(filename)) {
warning("File does not exist:", filename)
return(0)
}
nlines = tryCatch( {
return(as.numeric(
strsplit(sub("^\\s+", "",
system(paste("wc -l ", filename),
intern=TRUE)), " ")[[1]][1]
)) }, error= function(err) { return(NA) }
)
if (is.na(nlines)) {
return(countFileLinesBackup(filename))
}
}
#' Function to sample regions from a GRangesList object, in specified proportion
#'
#' @param GRL GRangesList from which to sample
#' @param prop vector with same length as GRL, of values between 0-1,
#' proportion of the list to select
#'
#' @return A sampled subset of original GRangesList object.
sampleGRL = function(GRL, prop) {
sampleGRanges = function(GR, prop) {
GR[sample(length(GR), floor(length(GR) * prop))]
}
mapply(sampleGRanges, GRL, prop)
}
#' To make parallel processing a possibility but not required,
#' I use an lapply alias which can point at either the base lapply
#' (for no multicore), or it can point to mclapply,
#' and set the options for the number of cores (what mclapply uses).
#' With no argument given, returns intead the number of cpus currently selected.
#'
#' @param cores Number of cpus
#' @return None
setLapplyAlias = function(cores=0) {
if (cores < 1) {
return(getOption("mc.cores"))
}
if(cores > 1) { #use multicore?
if (requireNamespace("parallel", quietly = TRUE)) {
options(mc.cores=cores)
} else {
warning("You don't have package parallel installed. Setting cores to 1.")
options(mc.cores=1) #reset cores option.
}
} else {
options(mc.cores=1) #reset cores option.
}
}
#' Function to run lapply or mclapply, depending on the option set in
#' getOption("mc.cores"), which can be set with setLapplyAlias().
#'
#' @param ... Arguments passed lapply() or mclapply()
#' @param mc.preschedule Argument passed to mclapply
#' @return Result from lapply or parallel::mclapply
lapplyAlias = function(..., mc.preschedule=TRUE) {
if (is.null(getOption("mc.cores"))) { setLapplyAlias(1) }
if(getOption("mc.cores") > 1) {
return(parallel::mclapply(..., mc.preschedule=mc.preschedule))
} else {
return(lapply(...))
}
}
#check for, and fix, trailing slash. if necessary
enforceTrailingSlash = function(folder) {
enforceEdgeCharacter(folder, appendChar="/")
}
enforceEdgeCharacter = function(string, prependChar="", appendChar="") {
if (string=="" | is.null(string)) {
return(string)
}
if(!is.null(appendChar)) {
if (substr(string,nchar(string), nchar(string)) != appendChar) { # +1 ?
string = paste0(string, appendChar)
}
}
if (!is.null(prependChar)) {
if (substr(string,1,1) != prependChar) { # +1 ?
string = paste0(prependChar, string)
}
}
return(string)
}
#' Named list function.
#'
#' This function is a drop-in replacement for the base list() function,
#' which automatically names your list according to the names of the
#' variables used to construct it.
#' It seemlessly handles lists with some names and others absent,
#' not overwriting specified names while naming any unnamed parameters.
#' Took me awhile to figure this out.
#'
#' @param ... arguments passed to list()
#' @return A named list object.
nlist = function(...) {
fcall = match.call(expand.dots=FALSE)
l = list(...)
if(!is.null(names(list(...)))) {
names(l)[names(l) == ""] = fcall[[2]][names(l) == ""]
} else {
names(l) = fcall[[2]]
}
return(l)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.