R/common_utils.R

Defines functions set_counter diameter is.file format_sample_id_and_subgroup readRDS_or_readRData find_neighbours generate_diff_color_fun generate_color_fun add_transparency is.not.null qq_message qq_stop qq_warning return2

Documented in find_neighbours set_counter

# == title
# Set a counter which represents the percentage finished in a loop
#
# == param
# -n number of iteration in a loop.
# -fmt format of the message, pass to `base::sprintf`. "\%s" will be replaced with the percent string generated by the counter.
# 
# == details
# The counter function should be executed in every iteration in the loop. The message only shows in interactive R session.
#
# == value
# A counter function.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# counter = set_counter(1000)
# for(i in 1:1000) {counter()}
# counter = set_counter(1000, fmt = "processing \%s")
# for(i in 1:1000) {counter()}
set_counter = function(n, fmt = "%s") {

	n = as.integer(n)
	i = 1

	f = function() {
		if(interactive()) {
			pct = round(i/n*100, 1)
			str = paste0(i, "/", n, " (", pct, "%)")
			str = sprintf(fmt, str)

			message(paste(rep("\b", 200), collapse=""), appendLF = FALSE)
			message(str, appendLF = FALSE)
			if(i == n) message("\n", appendLF = FALSE)

			i = i + 1
			assign("i", i, envir = parent.env(environment()))
			return(invisible(i))
		}
	}
}

diameter = function(x) {
	r = range(x)
	r[2] - r[1]
}

is.file = function(path) {
	if(length(path) == 1) {
		is.atomic(path) && is.character(path) && file.exists(path)
	} else {
		return(FALSE)
	}
}

format_sample_id_and_subgroup = function(sample_id, subgroup) {
	if(is.null(names(subgroup))) {
		if(length(subgroup) != length(sample_id)) {
			stop("length of `subgroup` should be same as the number of samples.")
		}
	} else {
		sample_id = intersect(names(subgroup), sample_id)
		subgroup = subgroup[sample_id]
	}
	list(sample_id = sample_id, subgroup = subgroup)
}


readRDS_or_readRData = function(file) {
	if(grepl("\\.rds$", file, ignore.case = TRUE)) {
		cr = readRDS(file)
	} else if(grepl("\\.rdata", file, ignore.case = TRUE)) {
		var_name = load(file)
		eval(parse(text = paste0("cr = ", var_name)))
	}
	cr
}


# == title
# Find neighbour regions
#
# == param
# -query a `GenomicRanges::GRanges` object
# -reference a `GenomicRanges::GRanges` object 
# -upstream upstream that ``query`` is extended
# -downstream downstream that ``query`` is extended
#
# == details
# With a certain extension of ``query``, this funciton looks for ``reference`` which intersects the extended regions.
#
# == value
# A `GenomicRanges::GRanges` object which contains regions in ``query`` for which the extended regisons are overlapped with ``reference``.
# There are three meta columns added:
#
# -distance: distance from the ``query`` to corresponding ``reference``
# -query_index: index of regions in ``query``
# -reference_index: index of regions in ``reference`` 
#
# Note one ``reference`` can correspond to multiple ``query`` regions.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# gr1 = GRanges(seqname = "chr1", ranges = IRanges(start = c(4, 10), end = c(6, 16)))
# gr2 = GRanges(seqname = "chr1", ranges = IRanges(start = c(7, 13), end = c(8, 20)))
# find_neighbours(gr1, gr2, upstream = 3, downstream = 3)
# find_neighbours(gr1, gr2, upstream = 10, downstream = 10)
find_neighbours = function(query, reference, upstream = 1000, downstream = 1000) {

	query_extended = query
	strd = strand(query)
	start(query_extended) = ifelse(strd == "-", start(query) - downstream, start(query) - upstream)
	end(query_extended) = ifelse(strd == "-", end(query) + upstream, end(query) + downstream)

	mtch = findOverlaps(query_extended, reference)

	mtch = as.matrix(mtch)
	neighbours = query[mtch[, 1]]
	neighbours$distance = distance(query[mtch[,1]], reference[mtch[,2]])

	neighbours$query_index = mtch[, 1]
	neighbours$reference_index = mtch[, 2]
	return(neighbours)
}



# == title
# Number of columns which are highly correlated to other columns
#
# == param
# -x a matrix, correlation is calculated by columns
# -abs_cutoff cutoff of absolute correlation. It can be a numeric vector with more than one cutoffs.
# -size size of blocks
# -mc number of cores
# -... pass to `stats::cor`
#
# == details
# For each column, it looks for number of other columns which correlate with absolute correlation coefficient larger tham ``abs_cutoff``.
# The calculation involves pair-wise correlation of all columns in the matrix.
# When number of columns is huge in the matrix, it is out of ability of R to store such long vector. This function
# solves this problem by splitting the columns into k blocks and looks at each block sequentially or in parallel.
#
# The code is partially adapted from https://rmazing.wordpress.com/2013/02/22/bigcor-large-correlation-matrices-in-r/
#
# == value
# A matrix that represents how many other columns correlate to current column under the correlation cutoff.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# \dontrun{
# mat = matrix(rnorm(20000 * 10), ncol = 20000, nrow = 20)
# cor_columns(mat, abs_cutoff = c(0.5, 0.6, 0.7))
# }
# NULL
cor_columns = function (x, abs_cutoff = 0.5, size = 1000, mc = 1, ...) {
    
    split_by_block = function(n, size) {
    	size = min(c(n, size))
    	REST <- n%%size
	    LARGE <- n - REST
	    NBLOCKS <- n%/%size
	    GROUP <- rep(1:NBLOCKS, each = size)
	    if (REST > 0)
	        GROUP <- c(GROUP, rep(NBLOCKS + 1, REST))
	    split(1:n, GROUP)
    }

    NCOL <- ncol(x)
    SPLIT = split_by_block(NCOL, size)
    COMBS <- expand.grid(1:length(SPLIT), 1:length(SPLIT))
    COMBS <- t(apply(COMBS, 1, sort))
    COMBS <- unique(COMBS)

    nr = nrow(COMBS)

    count_list = mclapply(split_by_block(nr, floor(nr/mc)), function(ind) {

	    count = matrix(0, nrow = NCOL, ncol = length(abs_cutoff))
	    for (i in ind) {
	        COMB <- COMBS[i, ]

	        qq_message("block @{COMB[1]}/@{max(COMBS[,1])} (row @{(COMB[1]-1)*size+1}~@{COMB[1]*size}) and @{COMB[2]}/@{max(COMBS[,2])} (row @{(COMB[2]-1)*size+1}~@{COMB[2]*size})")
	        G1 <- SPLIT[[COMB[1]]]
	        G2 <- SPLIT[[COMB[2]]]
	        
	        RES <- cor(x[, G1], x[, G2], ...)
	        RES[is.na(RES)] = 0
	        for(k in seq_along(abs_cutoff)) {
	        	tmp_mat = RES
	        	tmp_mat[abs(tmp_mat) > abs_cutoff[k]] = 1
		        tmp_mat[abs(tmp_mat) < abs_cutoff[k]] = 0

		        count[G1, k] = count[G1, k] + rowSums(tmp_mat)
		        if(COMB[1] != COMB[2]) {
		        	count[G2, k] = count[G2, k] + colSums(tmp_mat)
		        }
		    }
	    }
	    return(count)
	}, mc.cores = mc)

    count = matrix(0, nrow = NCOL, ncol = length(abs_cutoff))
    for(i in seq_along(count_list)) {
    	count = count + count_list[[i]]
    }
	dim(count) = c(NCOL, length(abs_cutoff))
	rownames(count) = colnames(x)
	colnames(count) = abs_cutoff
	return(count)
}

generate_diff_color_fun = function(x, quantile = 0.95, col = c("#3794bf", "#FFFFFF", "#df8640")) {
	x = x[abs(x) > 1e-6]
	if(length(x) == 0) {
		return(colorRamp2(c(-1, 0, 1), c("#3794bf", "#FFFFFF", "#df8640")))
	}
	q = quantile(abs(x), quantile)
	if(q == 0) {
		colorRamp2(c(-1, 0, 1), c("#3794bf", "#FFFFFF", "#df8640"))
	} else if(sum(x < 0)) {
		colorRamp2(c(-q, 0, q), c("#3794bf", "#FFFFFF", "#df8640"))
	} else {
		if(length(col) == 3) {
			colorRamp2(c(0, q), col[2:3])
		} else {
			colorRamp2(c(0, q), col)
		}
	}
}

generate_color_fun = function(x, col = c("white", "purple")) {
	q = quantile(x, c(0, 0.95, 0.99, 1))
	if(q[2] == 0) {
		if(q[3] == 0) {
			if(q[4] == 0) {
				colorRamp2(c(0, 1), col)
			} else {
				colorRamp2(q[c(1, 4)], col)
			}
		} else {
			colorRamp2(q[c(1, 3)], col)
		}
	} else {
		colorRamp2(q[c(1, 2)], col)
	}
}


add_transparency = function(col, transparency = 0) {
	rgb(t(col2rgb(col)/255), alpha = 1 - transparency)
}

is.not.null = function(...) {
	!is.null(...)
}

qq_message = function(text, envir = parent.frame(), ...) {
	text = paste(strwrap(qq(text, envir = envir)), collapse = "\n")
	message(text, ...)
}

qq_stop = function(text, envir = parent.frame(), ...) {
	text = paste(strwrap(qq(text, envir = envir)), collapse = "\n")
	stop(text, ...)
}

qq_warning = function(text, envir = parent.frame(), ...) {
	text = paste(strwrap(qq(text, envir = envir)), collapse = "\n")
	warning(text, ...)
}


return2 = function(expr, invisible = FALSE) {
	env = parent.frame()
	
	if(identical(env, .GlobalEnv)) {
		base::return(NULL)
	}
	
	# check on.exit
	if(exists(".on.exit.expression", envir = env)) {
		.on.exit.expression = get(".on.exit.expression", envir = env)
		for(i in seq_along(.on.exit.expression)) {
			eval(.on.exit.expression[[i]], envir = env)
		}
	}

	value = eval(substitute(expr), envir = env)
	
	obj = ls(envir = env)
	
	all_obj = ls(envir = env, all.names = TRUE)
	rm(list = all_obj, envir = env)
	gc(verbose = FALSE)
	
	if(invisible) {
		base::return(invisible(value))
	} else {
		base::return(value)
	}
}
jokergoo/epik documentation built on Sept. 28, 2019, 9:20 a.m.