R/do_baits.R

Defines functions assign_n_per_seq do_baits

Documented in assign_n_per_seq do_baits

#' Generates random baits
#'
#' Function inspired by this comment in StackOverflow: https://stackoverflow.com/questions/49149839/simulate-random-positions-from-a-list-of-intervals
#'
#' @param n Number of baits to generate (distributed across the various sequences).
#' @param n.per.seq Number of baits to generate per sequence. Ignored if n is set.
#' @param size The size of each bait
#' @param database A database of sequences
#' @param lengths Optional: the lengths of the sequences, compiled through \code{\link{load_lengths}}. If missing, lengths will be compiled from the database on-the-fly.
#' @param exclusions A file containing regions to exclude
#' @param regions A file containing regions of interest.
#' @param regions.prop The proportion of baits that should overlap the regions of interest.
#' @param regions.tiling The minimum number of baits to distribute per region of interest.
#' @param targets A file containing bp's to target.
#' @param targets.prop The proportion of baits that should overlap the targets.
#' @param targets.tiling The minimum number of baits to distribute per target.
#' @param seed A number to fix the randomization process, for reproducibility
#' @param restrict A vector of chromosome names OR position numbers to which the analysis should be restricted.
#' @param gc A vector of two values between 0 and 1, specifying the minimum and maximum GC percentage allowed in the output baits.
#' @param min.per.seq Minimum number of baits per sequence. Defaults to 1.
#' @param verbose Logical: Should detailed bait processing messages be displayed per sequence?
#' @param force Logical: Proceed even if the number of baits requested is very large?
#' 
#' @return A dataframe of baits
#' 
#' @export
#' 
do_baits <- function(n, n.per.seq, size, database, lengths, exclusions = NULL, 
	regions = NULL, regions.prop = 0, regions.tiling = 1,
	targets = NULL, targets.prop = 0, targets.tiling = 1,
	seed = NULL, restrict, gc = c(0, 1), min.per.seq = 1,
	verbose = FALSE, force = FALSE){

	if (getOption("supeRbaits_debug", default = FALSE)) {
    message("!!!--- Debug mode has been activated ---!!!")
		on.exit(save(list = ls(), file = "supeRbaits_debug.RData"), add = TRUE)
	}
	if (getOption("supeRbaits_show_times", default = FALSE))
		message("Start time: ", Sys.time())
	
	flush.console()

	if(!is.null(seed)) {
		set.seed(seed)
		on.exit(set.seed(NULL))
	}

	if (missing(n) & missing(n.per.seq))
		stop("One of 'n' or 'n.per.seq' must be set.", call. = FALSE)

	if (!missing(n) & !missing(n.per.seq))
		warning("Both 'n' and 'n.per.seq' were set. Ignoring 'n.per.seq'.", immediate. = TRUE, call. = FALSE)

	if (!missing(n) && !is.numeric(n))
		stop("'n' must be numeric.", call. = FALSE)
	if (!missing(n) && length(n) != 1)
		stop("Length of 'n' must be 1.", call. = FALSE)
	if (!missing(n) && n < min.per.seq)
		stop("'n' cannot be lower than min.per.seq.", call. = FALSE)
	if (!missing(n) && n > 100000) {
		if (force)
			warning("You have requested a rather large number of baits to be extracted (", n, ").\n         Your machine may run ouf of memory attempting to extract this many baits.\n         Attempting to continue because force = TRUE.", immediate. = TRUE, call. = FALSE)
		else
			stop("You have requested a rather large number of baits to be extracted (", n, ").\n       Your machine may run ouf of memory attempting to extract this many baits.\n       If you are sure you want to proceed, restart the function with 'force = TRUE'.", call. = FALSE)
	}

	if ((missing(n) & !missing(n.per.seq)) && !is.numeric(n.per.seq))
		stop("'n.per.seq' must be numeric.", call. = FALSE)
	if ((missing(n) & !missing(n.per.seq)) && length(n.per.seq) != 1)
		stop("Length of 'n.per.seq' must be 1.", call. = FALSE)
	if ((missing(n) & !missing(n.per.seq)) && n.per.seq < min.per.seq)
		stop("'n.per.seq' cannot be lower than min.per.seq.", call. = FALSE)

	if (!is.numeric(size))
		stop("'size' must be numeric.", call. = FALSE)
	if (length(size) != 1)
		stop("Length of 'size' must be 1.", call. = FALSE)
	size <- as.integer(size)

	if (!is.numeric(regions.prop) | (regions.prop > 1 | regions.prop < 0))
		stop("'regions.prop' must be numeric and between 0 and 1.", call. = FALSE)
	if (!is.null(targets.prop) && (!is.numeric(targets.prop) | (targets.prop > 1 | targets.prop < 0)))
		stop("'targets.prop' must be numeric and between 0 and 1.", call. = FALSE)
	if (!is.numeric(regions.tiling))
		stop("'regions.tiling' must be numeric.", call. = FALSE)
	if (!is.numeric(targets.tiling))
		stop("'targets.tiling' must be numeric.", call. = FALSE)

	if (regions.prop == 0 & !is.null(regions))
		warning("Regions were included but regions.prop = 0. No region baits will be produced.", call. = FALSE, immediate. = TRUE)
	if (targets.prop == 0 & !is.null(targets))
		warning("Regions were included but targets.prop = 0. No region baits will be produced.", call. = FALSE, immediate. = TRUE)

	if (sum(regions.prop, targets.prop) > 1)
		stop("The sum of 'regions.prop' and 'targets.prop' must not be greater than one.\n", call. = FALSE)
	
	if (length(gc) != 2)
		stop("Please provide two values in 'gc' (minimum and maximum percentage).\n", call. = FALSE)
	if (!is.numeric(gc))
		stop("'gc' must be numeric and contain two values between 0 and 1.", call. = FALSE)
	if (any(gc > 1) | any(gc < 0))
		stop("'gc' ranges must be between 0 and 1.", call. = FALSE)
	if (gc[1] > gc[2])
		stop("The first value of 'gc' must be smaller or equal to the second value.", call. = FALSE)


	# figure out lengths
	if (missing(lengths)) {
		message("M: Compiling the sequences' lengths. This operation can take some time."); flush.console()
		the.lengths <- load_lengths(database)
	}	else {
		if (check_lengths(lengths)) {
			message("M: Compiling the sequences' lengths. This operation can take some time."); flush.console()
			the.lengths <- load_lengths(database)
		}
		else {
			the.lengths <- lengths
			getlengths.time <- 0
			names(getlengths.time) <- "elapsed"
		}
	}
	
	# apply restrictions, if present
	if (!missing(restrict)) {
		restrict <- check_restrict(restrict, sequences = the.lengths$name)
		the.lengths <- the.lengths[restrict, ]
	}

	# Remove sequences that are too small
	link <- the.lengths$size >= (min.per.seq + size - 1)
	if (all(!link))
		stop("No sequences have enough space to fit the minimum desired number of baits.", call. = FALSE)
	if (verbose & any(!link))
		warning(sum(!link), " sequence", ifelse(sum(!link) > 1, "s are", " is"), " too small to fit the minimum desired number of baits and will be discarded.", immediate. = TRUE, call. = FALSE)

	# prepare to allocate n's
	the.lengths <- the.lengths[link, ]
	the.lengths$prop <- the.lengths$size / sum(the.lengths$size)
	the.lengths$n <- 0
	
	# allocate n's
	if (!missing(n)) {
		the.lengths <- assign_n_per_seq(the.lengths, n, min.per.seq, size)
	} else {
		the.lengths$n <- n.per.seq
		the.lengths$n[the.lengths$size - (size - 1) < n.per.seq] <- the.lengths$size[the.lengths$size - (size - 1) < n.per.seq] - (size - 1)
		if (sum(the.lengths$n) < n.per.seq * nrow(the.lengths))
			warning("Some sequences are not long enough to allocate the desired number of baits.", immediate. = TRUE, call. = FALSE)
		if (sum(the.lengths$n) > 100000) {
			if (force)
				warning("You have requested a rather large number of baits to be extracted (", sum(the.lengths$n), ").\n         Your machine may run ouf of memory attempting to extract this many baits.\n         Attempting to continue because force = TRUE.", immediate. = TRUE, call. = FALSE)
			else
				stop("You have requested a rather large number of baits to be extracted (", sum(the.lengths$n), ").\n       Your machine may run ouf of memory attempting to extract this many baits.\n       If you are sure you want to proceed, restart the function with 'force = TRUE'.", call. = FALSE)
		}
	}

	# remove unneeded prop column from the lengths
	the.lengths$prop <- NULL

	# load additional parameters
	if (any(!is.null(exclusions), !is.null(regions), !is.null(targets)))
	message("M: Loading exclusions/regions/targets."); flush.console()

	load.extras.time <- system.time({
		if(!is.null(exclusions))
			exclusions <- load_exclusions(file = exclusions)
		if(!is.null(regions))
			regions <- load_regions(file = regions)
		if(!is.null(targets))
			targets <- load_targets(file = targets)
	})
	if (getOption("supeRbaits_show_times", default = FALSE))
		print(load.extras.time)

	# Compatibility checks
	if (any(!is.null(exclusions), !is.null(regions), !is.null(targets)))
	message("M: Checking exclusions/regions/targets quality."); flush.console()

	check.input.time <- system.time({
		recipient <- check_chr_names(exclusions = exclusions, regions = regions, targets = targets, the.lengths = the.lengths)
		exclusions <- recipient$exclusions
		regions <- recipient$regions
		targets <- recipient$targets
		rm(recipient)

		check_chr_boundaries(exclusions = exclusions, regions = regions, targets = targets, the.lengths = the.lengths)
	})
	if (getOption("supeRbaits_show_times", default = FALSE))
		print(check.input.time)

	message("M: Finding bait positions for each sequence."); flush.console()

	sample.baits.time <- system.time({
		bait.points <- tryCatch(
			callr::r(function(sampleBaits, chrom_lens, exclusions, regions, 
							 					targets, size, regions_tiling, targets_tiling, 
							 					regions_prop, targets_prop) 
				{
					sampleBaits(chrom_lens, exclusions, regions, 
											targets, size, regions_tiling, targets_tiling, 
											regions_prop, targets_prop)
				},
				args = list(sampleBaits = sampleBaits,
										chrom_lens = the.lengths, 
										exclusions = exclusions, 
										regions = regions, 
										targets = targets, 
										size = size, 
										regions_tiling = regions.tiling, 
										targets_tiling = targets.tiling, 
										regions_prop = regions.prop, 
										targets_prop = targets.prop),
				spinner = !verbose,
				show = verbose),
			error = function(e, missing_n = missing(n)) {
				if (grepl("Ran out of memory", e)) {
					if (missing_n)
						stop("Not enough available memory to contain all sequences' samples.\n       If your database has many sequences, consider using 'n', rather than 'n.per.seq'.\n       Alternatively, you can use 'restrict' to select a subset of sequences to be analysed.", call. = FALSE)
					else
						stop("Not enough available memory to contain all sequences' samples. Consider lowering 'n'.", call. = FALSE)
				} else {
					stop(e)
				}
			}
		)
	})

	if (getOption("supeRbaits_show_times", default = FALSE))
		print(sample.baits.time)

	message("M: Retrieving bait base pairs. This operation can take some time."); flush.console()

	getbaits.time <- system.time({
		baits <- tryCatch(
			callr::r(function(getBaits, db, df)
			{ getBaits(db_path = db, df = df) }, 
			args = list(getBaits = getBaits,
									db = database,
									df = bait.points), 
			spinner = TRUE,
			show = TRUE),
			error = function(e, missing_n = missing(n)) {
				if (grepl("Ran out of memory", e)) {
					if (missing_n)
						stop("Not enough available memory to contain all the sampled baits.\n       If your database has many sequences, consider using 'n', rather than 'n.per.seq'.\n       Alternatively, you can use 'restrict' to select a subset of sequences to be analysed.", call. = FALSE)
					else
						stop("Not enough available memory to contain all the sampled baits. Consider lowering 'n'.", call. = FALSE)
				} else {
					stop(e)
				}
			}
		)			
	})

	# failsafe for r-oldrel
	to.convert <- which(unlist(lapply(baits, class)) == "factor")

	if (length(to.convert) > 0) {
		for (i in to.convert) {
			baits[, i] <- as.character(baits[, i])
		}
	}
	# ---

	if (getOption("supeRbaits_show_times", default = FALSE))
		print(getbaits.time)

	if (nrow(baits) == 0)
		stop("No baits could be generated for any of the sequences. Aborting.\n", call. = FALSE)

	message("M: Calculating GC content in the baits."); flush.console()

	calc.baits.time <- system.time({
		baits$pGC <- baits$no_GC / size
		baits <- split(baits, baits$bait_chrom_name)
	})

	if (getOption("supeRbaits_show_times", default = FALSE))
		print(calc.baits.time)

	if (gc[1] > 0 | gc[2] < 1) {
		message("M: Examining GC content in the baits."); flush.console()

		good.baits <- list()
		bad.baits <- list()

		assess.baits.time <- system.time({
			capture <- lapply(seq_along(baits), function(i) {
				link <- baits[[i]]$pGC > gc[1] & baits[[i]]$pGC < gc[2]
				if (verbose) {
					if (all(!link)) {
						message(paste0("M: No baits passed the GC percentage test for sequence ", names(baits)[i], "."))
						return(NULL)
					}
					if (any(!link)) {
						if (sum(!link) == 1)
							message(paste0("M: ", sum(!link), " bait was excluded from sequence ", names(baits)[i], " due to its GC percentage."))
						else
							message(paste0("M: ", sum(!link), " baits were excluded from sequence ", names(baits)[i], " due to their GC percentage."))
					}
				}
				good.baits[[i]] <<- baits[[i]][link, ]
				bad.baits[[i]] <<- baits[[i]][!link, ]
			})
			names(good.baits) <- names(baits)
			names(bad.baits) <- names(baits)
		})

		remove.empty.good <- sapply(good.baits, nrow) > 0
		good.baits <- good.baits[remove.empty.good]

		remove.empty.bad <- sapply(bad.baits, nrow) > 0
		bad.baits <- bad.baits[remove.empty.bad]
		
		if (getOption("supeRbaits_show_times", default = FALSE))
			print(assess.baits.time)
	} else {
		good.baits <- baits
		bad.baits <- NULL
	}

	message("M: Analysis completed."); flush.console()

	input.summary <- list(chr.lengths = data.table::as.data.table(the.lengths), 
												exclusions  = data.table::as.data.table(exclusions), 
												targets     = data.table::as.data.table(targets), 
												regions     = data.table::as.data.table(regions),
												size        = size)

	if (getOption("supeRbaits_show_times", default = FALSE)) {
		times <- data.frame(
			getLengths = getlengths.time["elapsed"],
			sampleBaits = sample.baits.time["elapsed"],
			getBaits = getbaits.time["elapsed"],
			stringsAsFactors = FALSE)
		output <- list(baits = good.baits, excluded.baits = bad.baits, input.summary = input.summary, times = times)
	}	else {
		output <- list(baits = good.baits, excluded.baits = bad.baits, input.summary = input.summary)
	}

	attributes(output)$database <- normalizePath(database)

	return(output)
}

#' Distribute the overall number of baits by the multiple sequences
#' 
#' @param the.lengths a dataframe with the length of each sequence, and the respective proportion of the overall size.
#' @inheritParams do_baits
#' 
#' @return An updated the.lengths dataframe, with the number of baits to be extracted per sequence.
#' 
#' @keywords internal
#' 
assign_n_per_seq <- function(the.lengths, n, min.per.seq, size) {
	# if the number of baits to distribute is lower than the number of sequences
	if (n < (nrow(the.lengths) * min.per.seq)) {
		warning("The desired n is smaller than the number of sequences times min.per.seq. Only a subset of the sequences will be used.", immediate. = TRUE, call. = FALSE)
		# decide how many sequences we can pick
		n.seqs <- floor(n / min.per.seq)
		# pick sequences
		which.seqs <- sample(1:nrow(the.lengths), n.seqs, prob = the.lengths$prop)
		# distribute min.per.seq for the chosen sequences
		the.lengths$n[which.seqs] <- min.per.seq
		# if there are baits missing
		if (sum(the.lengths$n) < n) {
			# find out which sequences still have space
			seqs.with.space <- which(the.lengths$size[which.seqs] - min.per.seq - size > 0)
			# choose which from the above will receive + 1
			add.here <- sample(which.seqs[seqs.with.space], n - sum(the.lengths$n))
			# add one bait on the above
			the.lengths$n[add.here] <- the.lengths$n[add.here] + 1
		}
	# if the number of baits to assign is bigger than the number of sequences
	} else {
		# start by assigning the minimum necessary (sequences that are too small have already been excluded)
		the.lengths$n <- min.per.seq
		# distribute the rest of the baits
		while (sum(the.lengths$n) < n) {
			n.to.distribute <- n - sum(the.lengths$n)
			# find which sequences have space
			seqs.with.space <- which(the.lengths$size - the.lengths$n - (size - 1) > 0)
			if (length(seqs.with.space) == 0) {
				warning("Ran out of space in the sequences where to allocate baits. Could not allocate ", n.to.distribute, " baits.", immediate. = TRUE, call. = FALSE)
				break()
			}
			# recalculate weights
			free.space <- the.lengths$size - the.lengths$n - (size - 1)
			the.lengths$prop <- free.space / sum(free.space)
			# draw new places where to add baits based on new weights
			add.here <- sample(seqs.with.space, n.to.distribute, replace = TRUE, prob = the.lengths$prop)
			sort.results <- table(add.here)
			# add new baits in their respective places
			the.lengths$n[as.numeric(names(sort.results))] <- the.lengths$n[as.numeric(names(sort.results))] + sort.results
			# if any of the sequences went over the limit, drop extra baits to be redistributed
			the.lengths$n <- sapply(1:nrow(the.lengths), function(i) min(the.lengths$size[i] - (size - 1), the.lengths$n[i]))
			# If the last line changed anything, then the while will be triggered again
		}
	}
	return(the.lengths)
}
BelenJM/supeRbaits documentation built on Jan. 28, 2022, 1:44 a.m.