R/utility.R

Defines functions nlist enforceEdgeCharacter enforceTrailingSlash lapplyAlias setLapplyAlias sampleGRL countFileLines countFileLinesBackup countOverlapsRev dtToGr dtToGrInternal readBed write.tsv listToGRangesList countOverlapsAnyRev replaceFileExtension cleanws countOverlapsAny setSharedDataDir

Documented in cleanws countOverlapsAnyRev lapplyAlias listToGRangesList nlist readBed replaceFileExtension sampleGRL setLapplyAlias setSharedDataDir write.tsv

################################################################################
# UTILITY FUNCTIONS
################################################################################
# These are functions copied over from my repository of utilities used
# by this package. They are repeated here simply for portability, so this
# package can be deployed on systems without access to my utilities.
# Any changes should probably be backported to the primary functions rather
# than in these convenience duplications.
#
# These functions should probably remain interior to the package (not exported)
#


#' setSharedDataDir
#' Sets global variable specifying the default data directory.
#'
#' @export
#' @param sharedDataDir	directory where the shared data is stored.
#' @return No return value.
#' @examples
#' setSharedDataDir("project/data")
setSharedDataDir = function(sharedDataDir) {
	options(SHARE.DATA.DIR=sharedDataDir)
	return()
}


countOverlapsAny = function(subj, quer, cores=1) {
	setLapplyAlias(cores)
	l = unlist(lapplyAlias(subj, function(x) { sum(overlapsAny(x, quer)) } ))
	return(l)
}
################################################################################
# INTERNAL FUNCTIONS - not exported
################################################################################

#' cleanws takes multi-line, code formatted strings and just formats them
#' as simple strings
#' @param string string to clean
#' @return A string with all consecutive whitespace characters, including
#' tabs and newlines, merged into a single space.
cleanws = function(string) {
	return(gsub('\\s+'," ", string))
}


#' This will change the string in filename to have a new extension
#' @param filename	string to convert
#' @param extension	new extension
#' @return	Filename with original extension deleted, replaced by provided
#' extension
replaceFileExtension = function(filename, extension) {
	sub("\\..*$", enforceEdgeCharacter(extension, prependChar="."),
	paste0(filename, "."))
}



#' Just a reverser. Reverses the order of arguments and passes them
#' untouched to countOverlapsAny -- so you can use it with lapply.
#'
#' @param subj Subject
#' @param quer Query
#' @return Results from countOverlaps
countOverlapsAnyRev = function(subj, quer) {
	countOverlapsAny(quer, subj)
}


#' converts a list of GRanges into a GRangesList; strips all metadata.
#' @param lst	a list of GRanges objects
#' @return	a GRangesList object
listToGRangesList = function(lst) {
	if(!is(lst, "GRangesList")) {
		if ("list" %in% class(lst)) {
			#strip elementMetadata
			lst = lapply(lst, function(x) { values(x) <- NULL; return(x) } )
			lst = GRangesList(lst)
		} else {
			message(cleanws("Converting GRanges to GRangesList."))
			lst = GRangesList(list(lst))
		}
	}
	return(lst)
}


#' Wrapper of write.table that provides defaults to write a
#' simple .tsv file. Passes additional arguments to write.table
#'
#' @param ... Additional arguments passed to write.table
#' @return No return value
write.tsv = function(...) {
	write.table(..., sep="\t", row.names=FALSE, quote=FALSE)
}

######################################################################

#' Imports bed files and creates GRanges objects, using the fread()
#' function from data.table.
#'
#' @param file File name of bed file.
#' @return GRanges Object
#' @export
#' @examples
#' a = readBed(system.file("extdata", "examples/combined_regions.bed",
#' package="LOLA"))
readBed = function(file) {
	DT = fread(file)
	# bed specification says:
	# 1=chr, 2=start, 3=end, 4=name, 5=score (discarded), 6=strand.
	cn = rep(NA, 6)
	readCols = colnames(DT)
	cn[seq_along(readCols)] = readCols
	# BED files are 0-based, but internal bioc representations use 1-based coords.
	DT[, (colnames(DT)[2]) := (get(colnames(DT)[2]) + 1)]

	# Convert any unknown strands to '*'
	DT[, (cn[3])]
	tfbsgr = dtToGr(DT, chr=cn[1], start=cn[2], end=cn[3],
		name=cn[4], strand=cn[6])


	return(tfbsgr)
}


#Two utility functions for converting data.tables into GRanges objects
#genes = dtToGR(gModels, "chr", "txStart", "txEnd", "strand", "geneId")
dtToGrInternal = function(DT, chr, start, end=NA,
	strand=NA, name=NA, metaCols=NA) {
	if (is.na(end)) {
		end = start
	}
	if (is.na(strand)) {
			gr=GRanges(
				seqnames=DT[[`chr`]],
				ranges=IRanges(start=DT[[`start`]],
				end=DT[[`end`]]),
				strand="*"
			)
	} else {
		# GRanges can only handle '*' for no strand, so replace any non-accepted
		# characters with '*'
		DT[,strand:=as.character(strand)]
		DT[strand=="1", strand:="+"]
		DT[strand=="-1", strand:="-"]
		DT[[`strand`]] =  gsub("[^+-].*", "*", DT[[`strand`]])

		gr=GRanges(
			seqnames=DT[[`chr`]],
			ranges=IRanges(start=DT[[`start`]],
			end=DT[[`end`]]),
			strand=DT[[`strand`]]
		)
	}
	if (! is.na(name) ) {
		names(gr) = DT[[`name`]]
	} else {
		names(gr) = seq_along(gr)
	}
	# This code is not used in LOLA:
	#if(! is.na(metaCols)) {
	#	for(x in metaCols) {
	#		elementMetadata(gr)[[`x`]]=DT[[`x`]]
	#	}
	#}
	gr
}

dtToGr = function(DT, chr="chr", start="start", end=NA, strand=NA, name=NA,
	splitFactor=NA, metaCols=NA) {
	if(is.na(splitFactor)) {
		return(dtToGrInternal(DT, chr, start, end, strand, name,metaCols))
	}
	if ( length(splitFactor) == 1 ) {
		if( splitFactor %in% colnames(DT) ) {
			splitFactor = DT[, get(splitFactor)]
		}
	}
	lapply(split(seq_len(nrow(DT)), splitFactor),
			function(x) {
				dtToGrInternal(DT[x,], chr, start, end, strand, name,metaCols)
			}
		)


}

# If you want to use the GenomicRanges countOverlaps function, but you want to
# do it in an lapply, that will work... but you can only do it in one direction.
# If you want to lapply on the opposite argument, you can't do it (because
# countOverlaps is not symmetric: it depends on which argument comes first).
# If you want to do an lapply, but countOverlaps with the query as the second
# argument instead of the first, you can use this function to simply reverse
# the order of the arguments.
# This is used in the enrichment calculations (originally from the EWS
# project; 2014, CeMM).
countOverlapsRev = function(query, subject, ...) {
	return(countOverlaps(subject, query, ...))
}

# For windows, let's try this.
countFileLinesBackup = function(filename) {
	# From https://stackoverflow.com/a/23456450/946721
	f = file(filename, open="rb")
	nlines <- 0L
	while (length(chunk <- readBin(f, "raw", 65536)) > 0) {
	   nlines = nlines + sum(chunk == as.raw(10L))
	}
	close(f)
	return(as.numeric(nlines))
}

# Parses result of system "wc" wordcount to return the number of lines
# in a file into R.
countFileLines = function(filename) {
	if (!file.exists(filename)) {
		warning("File does not exist:", filename)
		return(0)
	}
	nlines = tryCatch( {
		return(as.numeric(
			strsplit(sub("^\\s+", "",
			system(paste("wc -l ", filename),
			intern=TRUE)), " ")[[1]][1]
		)) }, error= function(err) { return(NA) }
	)
	
	if (is.na(nlines)) {
		return(countFileLinesBackup(filename))
	}
}


#' Function to sample regions from a GRangesList object, in specified proportion
#'
#' @param GRL	GRangesList from which to sample
#' @param prop	vector with same length as GRL, of values between 0-1,
#' proportion of the list to select
#'
#' @return A sampled subset of original GRangesList object.
sampleGRL = function(GRL, prop) {
	sampleGRanges = function(GR, prop) {
		GR[sample(length(GR), floor(length(GR) * prop))]
	}
	mapply(sampleGRanges, GRL, prop)
}

#' To make parallel processing a possibility but not required,
#' I use an lapply alias which can point at either the base lapply
#' (for no multicore), or it can point to mclapply,
#' and set the options for the number of cores (what mclapply uses).
#' With no argument given, returns intead the number of cpus currently selected.
#'
#' @param cores	Number of cpus
#' @return None
setLapplyAlias = function(cores=0) {
	if (cores < 1) {
		return(getOption("mc.cores"))
	}
	if(cores > 1) { #use multicore?
		if (requireNamespace("parallel", quietly = TRUE)) {
			options(mc.cores=cores)
		} else {
			warning("You don't have package parallel installed. Setting cores to 1.")
			options(mc.cores=1) #reset cores option.
		}
	} else {
		options(mc.cores=1) #reset cores option.
	}
}

#' Function to run lapply or mclapply, depending on the option set in
#' getOption("mc.cores"), which can be set with setLapplyAlias().
#'
#' @param ... Arguments passed lapply() or mclapply()
#' @param mc.preschedule Argument passed to mclapply
#' @return Result from lapply or parallel::mclapply
lapplyAlias = function(..., mc.preschedule=TRUE) {
	if (is.null(getOption("mc.cores"))) { setLapplyAlias(1) }
	if(getOption("mc.cores") > 1) {
		return(parallel::mclapply(..., mc.preschedule=mc.preschedule))
	} else {
		return(lapply(...))
	}
}


#check for, and fix, trailing slash. if necessary
enforceTrailingSlash = function(folder) {
	enforceEdgeCharacter(folder, appendChar="/")
}
enforceEdgeCharacter = function(string, prependChar="", appendChar="") {
	if (string=="" | is.null(string)) {
		return(string)
	}
	if(!is.null(appendChar)) {
		if (substr(string,nchar(string), nchar(string)) != appendChar) { # +1 ?
			string = paste0(string, appendChar)
			}
	}
	if (!is.null(prependChar)) {
		if (substr(string,1,1) != prependChar) { # +1 ?
			string = paste0(prependChar, string)
		}
	}
	return(string)
}


#' Named list function.
#'
#' This function is a drop-in replacement for the base list() function,
#' which automatically names your list according to the names of the
#' variables used to construct it.
#' It seemlessly handles lists with some names and others absent,
#' not overwriting specified names while naming any unnamed parameters.
#' Took me awhile to figure this out.
#'
#' @param ...	arguments passed to list()
#' @return A named list object.
nlist = function(...) {
	fcall = match.call(expand.dots=FALSE)
	l = list(...)
	if(!is.null(names(list(...)))) {
		names(l)[names(l) == ""] = fcall[[2]][names(l) == ""]
	} else {
		names(l) = fcall[[2]]
	}
	return(l)
}
sheffien/LOLA documentation built on Sept. 3, 2020, 8:29 p.m.