#' A partial copying of the getNrNb() function from rtisan.
#'
#' This function takes in the tags data structure I have been using
#' and passes the data to a combination of the Nb calculator I have
#' along with the resiliency filtered Nb values.
#'
#' I do not yet fully understand the various functions/operations
#' performed for the resiliency filtering, as a result, these
#' functions are mostly just transcriptions of the originals with some
#' variables renamed and changes to make it easier to read.
#'
#' @param tags The data structure generated by read_idx or
#' read_csv_tags or read_qiime_otus. E.g. the list of metadata, tags
#' by sample, and some administrativia.
#' @param generations Used by calculate_nb
#' @param metadata_column Used by nb_from_tags to add the pre-rtisan
#' Nb values to the metadata.
#' @param zero_filter Used by calculate_nb to leave off tags not
#' observed in the reference samples. I think this is slightly broken
#' though.
#' @param write_metadat When not null, write out the changes to the
#' metadata to the provided file.
#' @param innoculation_cfu Used by rtisan, defines a constant for the
#' innoculation cfu in case it was not provided in the metadata.
#' @param chosen_seed Set the seed for extraDistr so that can get the
#' same result two times in a row.
#' @param min_weight The minimum weight constant, not sure what it does yet.
#' @param noise_correction Noise correction constant, not sure what it
#' does yet.
#' @param input_calibration Input csv file containing calibration
#' constants. I do not have one, so I cannot say how it works yet.
#' @param output_csv Write the new Nb values to this file: FIXME Move
#' this to the metadata.
#' @param output_noisless Write the noiseless values to this file,
#' FIXME: also put this into the metadata.
#' @param do_plots Print the diagnostic plots? FIXME: ggplot-ify
#' these and potentially get them into the metadata.
#' @param verbose Print some diagnostics while running?
#' @export
rtisan_nb_from_tags <- function(tags, generations = 1,
metadata_prefix = "rtisan",
zero_filter = FALSE, write_metadata = NULL,
innoculation_cfu = 1e8, chosen_seed = 1,
min_weight = 0.03, noise_correction = 0.005,
input_calibration = NULL,
output_csv = "rtisan_values.csv",
output_noiseless = "rtisan_noiseless.csv",
do_plots = FALSE, verbose = FALSE, parallel = TRUE, write = NULL) {
read_csv <- tags[["input_csv"]]
input_metadata <- tags[["modified_metadata"]]
metadata_backup <- input_metadata
cfu_vector <- input_metadata[["cfu"]]
names(cfu_vector) <- rownames(input_metadata)
reference_samples_idx <- input_metadata[["type"]] == "reference"
pre_metadata_column <- paste0("pre_", metadata_prefix, "_Nb")
reference_samples <- rownames(input_metadata)[reference_samples_idx]
original_result <- nb_from_tags(tags, generations = generations,
metadata_column = pre_metadata_column,
zero_filter = FALSE, write = write_metadata)
rtisan_result <- rtisan_nr_nb(read_csv, cfu_vector, innoculation_cfu = innoculation_cfu,
chosen_seed = chosen_seed, reference_samples = reference_samples,
min_weight = min_weight, noise_correction = noise_correction,
input_calibration = input_calibration,
output_csv = output_csv, do_plots = do_plots, verbose = verbose,
parallel = parallel)
estimates <- rtisan_result[["estimate_table"]]
nb_vector <- estimates[["Nb"]]
nr_vector <- estimates[["Nr"]]
ns_vector <- estimates[["Ns"]]
rtisan_nb_column <- paste0(metadata_prefix, "_Nb")
rtisan_nr_column <- paste0(metadata_prefix, "_Nr")
rtisan_ns_column <- paste0(metadata_prefix, "_Ns")
input_metadata[[rtisan_nb_column]] <- NA
input_metadata[[rtisan_nr_column]] <- NA
input_metadata[[rtisan_ns_column]] <- NA
for (r in 1:nrow(estimates)) {
sample <- rownames(estimates)[r]
input_metadata[sample, rtisan_nb_column] <- estimates[sample, "Nb"]
input_metadata[sample, rtisan_nr_column] <- estimates[sample, "Nr"]
input_metadata[sample, rtisan_ns_column] <- estimates[sample, "Ns"]
}
if (!is.null(write)) {
extension <- tools::file_ext(write)
if (extension == "xlsx") {
written <- hpgltools::write_xlsx(data = input_metadata, excel = write)
} else if (extension == "csv") {
written <- write.csv(x = input_metadata, file = write)
} else if (extension == "tsv") {
writen <- write.tsv(x = input_metadata, file = write)
} else {
message("Dunno what to write.")
}
}
retlist <- list(
"previous_metadata" = metadata_backup,
"modified_metadata" = input_metadata,
"Nb_column" = rtisan_nb_column,
"Nb_result" = nb_vector,
"Nr_result" = nr_vector,
"Ns_result" = ns_vector,
"sample_counts" = tags[["sample_counts"]])
return(retlist)
}
#' Internal Resiliency Function.
#'
#' This is run for every experimental or calibration sample in the
#' metadata.
#'
#' @param reference_vector Set of reference reads/tag.
#' @param reads_df Table of the reads/tag for the samples of interest.
#' @param cfu_vector Set of cfu values from which to acquire the
#' target cfu for this particular sample.
#' @param noiseless_table Input table of noisless data, currently
#' unused.
#' @param sample_name Name of this samples, used to extract some data.
#' @param step_df Used to calculate the computed_bottleneck by
#' stats::NLSstClosestX and generate the Ns value.
#' @param min_weight Constant of the minimum weight!
#' @param input_calibration Input calibration file, currently unused.
#' @param output_csv Output csv file for writing new values.
#' @param innoculation_cfu Assumed cfu when not provided.
#' @param do_plots Print plots?
#' @param noise_correction Constant for correcting noise!
#' @param verbose Print text while running?
#' @export
calculate_resiliency <- function(reference_vector, reads_df, cfu_vector, noiseless_table,
sample_name, step_df, min_weight = 0.03,
input_calibration = NULL,
output_csv = "rtisan_values.csv",
output_noiseless = "rtisan_noiseless.csv",
innoculation_cfu = 1e8, do_plots = FALSE,
noise_correction = 0.005, verbose = FALSE) {
## Specifies input vector (which is the average of your inputs),
## output (the row which corresponds to the particular sample
## name), and CFU (a single value corresponding to the name of the
## sample)
input_vector <- reference_vector
wanted_column <- colnames(reads_df) == sample_name
output_vector <- reads_df[, wanted_column]
cfu <- innoculation_cfu
if (!is.null(cfu_vector)) {
wanted_sample <- names(cfu_vector) == sample_name
cfu <- cfu_vector[wanted_sample]
}
## Noise adjustment
if (noise_correction != 0) {
resampled_input <- extraDistr::rmvhyper(1, round(input_vector),
round(noise_correction * sum(output_vector)))
output_vector <- as.numeric(output_vector - resampled_input)
output_vector[output_vector < 0] <- 0
}
## Times specifies how many iterations the resiliency function is
## run. Too many or too few isn't ideal, but we can constrain it
## by CFU. So if you have 2 CFU, it will only run 2 times, because
## it's pointless to try to look farther than that. If you have a
## lot of CFU, this variable is set to the number of barcodes that
## have > 1 read. 1 is arbitrary, but we can be really quite
## confident that anything 1 or below is noise.
times <- min(round(cfu + 1), sum(output_vector > 1))
## Combines input and output vectors into one data frame, and then
## orders everything by the output vector. The barcodes with the
## most reads in the output will be at the bottom of the data
## frame
bind <- as.data.frame(cbind(input_vector, output_vector))
bindsorted <- bind[order(bind[, 2]), ]
## This logical puts in a 0 for all barcodes in the output where
## we initially define noise.
# if(times < length(output_vector)) {
# bindsorted[, 2][1:(length(output_vector) - times)] <- 0
# }
## As the script is run, bindsorted itself will be changed, so we
## make a few copies of it for later use
bindsortedcopy <- bindsorted
bindsortedcopy2 <- bindsortedcopy
## minusone is run for the number of times specified by times. The
## initial 0 is removed from nb_vector and both nb_vector.
nb_vector <- c()
for (t in 1:times) {
partial_nb <- sorted_nb(bindsorted, input_vector, output_vector)
nb_vector <- c(nb_vector, partial_nb)
bindsorted <- bindsorted[1:nrow(bindsorted) - 1, ]
}
## Here we define p, which first requires defining q, which is set
## to be 1/15 of the total number of barcodes. So if this is is
## 1500 numbers long, q will be 1500/15 = 100 numbers long (1, 15,
## 30, 45 ... 1500). Correspondingly, p is the value of nb_vector
## at each of these positions.
q <- round(seq(1, length(na.omit(nb_vector)),
length.out = length(na.omit(nb_vector)) / 15))
p <- nb_vector[q]
## The guesses variable is defined as all the final
## decisionsreached from starting across all the values of p. The
## unique guesses are taken and sorted.
guesses <- map_dbl(p, nb_minimum_search, nb_vector = nb_vector)
sorted_guesses <- sort(unique(c(guesses)))
sorted_guesses <- c(sorted_guesses, length(na.omit(nb_vector)))
## There is sometimes some weirdness at the end of the run, and it
## makes decisions really close together. This part takes anything
## within 5 values of the end of the run and sets it to the
## end. This new sorted list is uniqued again.
sorted_guesses[sorted_guesses > max(sorted_guesses) - 5] <- max(sorted_guesses)
sorted_guesses <- unique(sorted_guesses)
## Identifies greatest log change
xdif <- c(log(nb_vector[1]), log(nb_vector))
xdif <- xdif[1:length(xdif) - 1]
xsub <- (log(nb_vector) - xdif)
greatest_delta <- which(xsub == max(xsub)) - 1
sorted_guesses <- sort(unique(c(sorted_guesses, greatest_delta)))
sorted_guesses <- sorted_guesses[sorted_guesses != 0]
## Sometimes, guesses will be within one unit and this can be
## problematic when it sees this as a big weight jump (because
## only one barcode is in that group). This part makes sure that
## each guess is at least 2 barcodes apart
staggered_guesses <- c(-10000, sorted_guesses)
difference <- c(sorted_guesses, 10000) - staggered_guesses
if (min(difference) == 1) {
removed_guess <- max(which(difference == 1)) - 1
sorted_guesses <- sorted_guesses[-removed_guess]
}
sorted_guesses <- sorted_guesses
## Once we have the guesses, we need to make the table that
## describes the weights assigned to each guess. This function
## takes as an input the sorted_guesses list above and identifies
## the fraction of reads acounded for by all barcodes that have
## more reads than specific by each value in sorted_guesses. For
## example, if one of the guesses was 15, and position 15 was a
## barcode with 100 reads, this function will determine the
## fraction of all reads acounted for by barcodes with at least
## 100 reads.
fraction_accounted <- map_dbl(sorted_guesses, vector_proportion,
output_vector = output_vector)
## Here we adjust the fractionaccounted variable so that it takes
## in the fraction of accounted reads in between guesses, not just
## from that guess upwards
staggered <- c(0, fraction_accounted)[1:length(fraction_accounted)]
subtracted <- fraction_accounted - staggered
## Here we obtain the manb_vector Nb up to each location defined
## in sorted_guesses
nb_intervals <- map_dbl(sorted_guesses, get_interval_nb,
nb_vector = nb_vector)
## And finally we combine these into the indices table. The
## headings explain what each value means. We have now subseted
## our data into descrete segments separated by local minima
indices <- data.frame(nb_intervals, subtracted, sorted_guesses)
colnames(indices) <- c("Nr", "Accounts for", "Number of barcodes")
## These logicals define the start of noise (initial_noise) as the
## location of greatest log change in the "accounts for" section,
## with a few more housekeeping stuff
weights <- log(indices[, 2])
values <- (indices[, 1])
subtracted_weights <- c(weights[2:length(values)], 0)
weights_delta <- (weights - subtracted_weights)[1:length(weights) - 1]
if (length(weights_delta) == 0) {
weights_delta <- values
}
initial_noise <- indices[, 3][which(weights_delta == max(weights_delta))]
initial_noisecopy <- initial_noise
## if there are no populations that are below the minimum weight
## threshold, set noise to be the end of the last detected
## population
if (is.na(indices[, 3][min(which(indices[, 2] < min_weight))])) {
initial_noise <- max(indices[, 3])
}
## if the sum of the weights of the population after the start of
## noise is greater than the miniumum weight threshold, set the
## start to be where the the rest of the reads after are under the
## minimum weight
cutoff_position <- min(which(cumsum(indices[, 2]) > (1 - min_weight)))
if (sum(indices[, 2][which(indices[, 3] > initial_noise)]) > min_weight) {
initial_noise <- indices[, 3][cutoff_position]
}
##Plots nb_vector and barcodes if specified
if (isTRUE(do_plots)) {
par(mfrow = c(3, 1))
plot(1:times, nb_vector, log = "y", ylim = c(.01, 2E6),
xlim = c(1, length(as.numeric(na.omit(nb_vector)))),
main = "Resiliency", ylab = "Nb", xlab = "Iteration")
abline(v = sorted_guesses)
plot(output_vector / sum(output_vector), log = "y",
ylim = c(1E-6, 1), main = "Barcodes", ylab = "Frequency", xlab = "Barcode")
}
## This function takes the values in guessesunquesorted and
## identifies their location in the plot of barcode frequencies vs
## barcode, which is defined by cutoffpositions
cutoff_positions <- map_dbl(sorted_guesses, convert_cutoffs,
output_vector = output_vector, bind_df = bindsortedcopy)
## Now we redefine our output vector such that everything after
## the start of noise is set to 0. Note that this is done on
## bindsortedcopy2, and not bindsorted or bindsortedcopy
noise_length <- dim(bindsortedcopy2)[1] - initial_noise
bindsortedcopy2[1:noise_length, ][2] <- 0
output_vectorwithoutnoise <- as.numeric(bindsortedcopy2[, 2])
## The resiliency function is run again, this time with the new
## noise-less output vector and where the resulting output is
## defined as z. z is functionally the same as x, except z is done
## after the noise correction.
final_nb_vector <- c()
iterations <- sum(bindsortedcopy2[2] != 0)
for (i in 1:iterations) {
partial_nb <- sorted_nb(bindsortedcopy2, input_vector, output_vector)
final_nb_vector <- c(final_nb_vector, partial_nb)
bindsortedcopy2 <- bindsortedcopy2[1:nrow(bindsortedcopy2) - 1, ]
}
## Nr is the defined by final value. It is further constrained to
## ensure that that it is greater than Nb (recall that the
## original Nb estimate is nb_vector[1])
final_value <- max(final_nb_vector)
if (final_value < nb_vector[1]) {
final_value <- nb_vector[1]
}
## Adjustment for when Nb < the identified number of barcodes
## above noise. Here, the number of barcodes is a better
## estimate. So we change Nr to reflect this.
min_nearest_times <- max(indices[, 3][(indices[, 3] <= initial_noise)])
computed_bottleneck <<- stats::NLSstClosestX(step_df, min_nearest_times) ## this is Ns
if (final_value < computed_bottleneck) {
final_value <<- computed_bottleneck
}
## Plots the new resiliency function if needed
if (isTRUE(do_plots)) {
abline(h = cutoff_positions)
print(indices)
print(final_value)
plot(final_nb_vector, log = "y", ylim = c(.01, 2E6),
xlim = c(1, length(as.numeric(na.omit(final_nb_vector)))),
main = "Resiliency without noise", ylab = "Nb", xlab = "Iteration")
}
if (!is.null(input_calibration)) {
lookup_table <- read.csv(input_calibration, row.names = 1)
}
if (nb_vector[1] < 1) {
nb_vector[1] <- 1
}
if (final_value < 1) {
final_value <- 1
}
if (!is.null(input_calibration)) {
nbcalibrated <- as.numeric(lookup_table[as.character(
round(log10(nb_vector[1]), digits = 2)), ])
nrcalibrated <- as.numeric(lookup_table[as.character(
round(log10(final_value), digits = 2)), ])
}
if (is.null(input_calibration)) {
nbcalibrated <- NA
nrcalibrated <- NA
}
number_zeros <- sum(output_vectorwithoutnoise == 0)
output_vector[order(output_vector)][1:number_zeros] <- 0
noiseless_table <- data.frame(noiseless_table, output_vector)
colnames(noiseless_table)[length(colnames(noiseless_table))] <- sample_name
if (is.null(input_calibration)) {
row <- c(nb_vector[1], final_value, computed_bottleneck)
names(row) <- c("Nb", "Nr" , "Ns")
} else {
row <- c(nb_vector[1], final_value, nbcalibrated, nrcalibrated, computed_bottleneck)
names(row) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")
}
retlist <- list(
"noiseless_table" = noiseless_table,
"output_vector" = output_vector,
"input_vectorNoiseCorrected" = input_vector,
"input_vector" = reference_vector,
"row" = row)
return(retlist)
}
#' I do not really get this function, here is the author's text:
#'
#' This function takes the values in guessesunquesorted and
#' identifies their location in the plot of barcode frequencies vs
#' barcode, which is defined by cutoffpositions
#'
#' @param p One less than q.
#' @param output_vector Definitely not the input_vector
#' @param bind_df the thing from which we will take the next item...
convert_cutoffs <- function(p, output_vector, bind_df) {
cutoff_position <- length(output_vector) - p
cutoff_value <- bind_df[cutoff_position + 1, 2]
if (cutoff_value == 0) {
cutoff_value <- 1
}
ret <- cutoff_value / sum(output_vector)
return(ret)
}
#' Given a vector and set of steps, use rmvhyper to make a new vector.
#'
#' This takes an input vector and for every element of it provides a
#' new element which is calculated by doing the following:
#' 1. Run rmvhyper on the original vector in order to generate a set
#' of random numbers using the multivariate hypergeometric
#' distribution.
#' 2. Finding the number of values from #1 which are not 0.
#' 3. Putting the number from #2 into the new element.
#' 4. Finally, returns the new vector.
#'
#' It seems to me that there must be a much more efficient way to get
#' this information?
#'
#' @param vector input vector
#' @param steps Set of steps over which to iterate.
#' @return New vector containing the number of times the rhvhyper
#' distribution was not zero for each element of the original vector.
get_bottom_vector <- function(vector, steps) {
new_vector <- rep(0, length(vector))
for (i in 1:length(new_vector)) {
step <- steps[i]
distribution <- as.numeric(extraDistr::rmvhyper(1, vector, step))
new_vector[i] <- sum(distribution != 0)
}
return(new_vector)
}
#' Grab the maximum value from a portion of the nb vector.
get_interval_nb <- function(y, nb_vector = NULL) {
if (is.null(nb_vector)) {
return(NULL)
}
max(nb_vector[1:y])
}
#' Search for minima... Another function I don't understand.
#'
#' Here is the text from the authors:
#' Outer layer of the function that scans for minima. The input to
#' this, p, is later defined. p is a specific set of numbers that
#' specifiy all the poisitions for which the minimum finder will
#' start. scanformin is applied over each of these positions.
#'
#' Hmm yeah...
nb_minimum_search <- function(p, nb_vector) {
start <- p
## This is finds minima. It takes p, the initial guess, and
## looks around with a specified sd (the newlocation
## variable). It asks if this new value (newstart) is less than
## start. If so, start is set to newstart. Repeating this
## function iteratively changes the value of start if the
## newstart guess is smaller
findmin <- function() {
where <- which(nb_vector == start)
newlocation <- abs(rnorm(1, mean = where, sd = length(na.omit(nb_vector)) / 10))
if (newlocation > length(na.omit(nb_vector))) {
newlocation <- where
}
if (newlocation < 1) {
newlocation <- where
}
newstart <- nb_vector[round(newlocation)]
if (newstart < start) {
start <- newstart
}
return(start)
}
## startvector is the resulting output of the findmin function
## run 1000 times. The start variable also changes to be the
## last guess. The position of this guess in the nb_vector
## variable is defined as decision. Put another way, decision is
## the nb_vector coordinate of the resiliency graph, and start
## is the y coordinate.
startvector <- replicate(1000, findmin())
decision <- which(nb_vector == start)
return(decision)
} ## End of scanformin
#' This is the top-level function from the authors. I have it
#' wrapped.
#'
#' It sets up the data and iterates over every experimental sample in
#' order to run the calculate resliency function upon it.
#'
#' @param read_csv CSV file containing the reads/tag, FIXME: use the
#' tag data structure here to simplify things.
#' @param cfu_vector CFU values for each sample of interest.
#' @param innoculation_cfu Assumed cfu if they are not provided.
#' @param chosen_seed Set the seed you maniacs!
#' @param reference_samples Vector of reference samples, provided by
#' the metadata via the parent function.
#' @param min_weight Constant of minimum weight...
#' @param noise_correction Constant of noise correction...
#' @param input_calibration CSV of input calibrations.
#' @param do_plots Plot the results?
#' @param verbose Be verbose?
#' @param output_csv Print the new Nb values here.
#' @param output_noiseless Print the noiseless data here...
rtisan_nr_nb <- function(read_csv, cfu_vector, innoculation_cfu = 1e8, chosen_seed = 1,
reference_samples = c(1, 2, 3), min_weight = 0.03,
noise_correction = 0.005,
input_calibration = NULL, do_plots = FALSE, verbose = FALSE,
output_csv = "rtisan_values.csv",
output_noiseless = "rtisan_noiseless.csv", cpus=NULL, parallel = TRUE) {
## This will eventually be pulled in from the metadata if I incorporate this into my library.
set.seed(chosen_seed)
reads_df <- read.csv(read_csv, row.names = 1)
colnames(reads_df) <- tolower(gsub(pattern = "\\.txt", replacement = "", x=colnames(reads_df)))
if (class(cfu_vector)[1] == "character") {
cfu_vector <- read.csv(cfu_table)[[2]]
}
reference_df <- reads_df[, reference_samples]
reference_vector <- rowMeans(reference_df)
experimental_ids <- ! colnames(reads_df) %in% reference_samples
experimental_df <- reads_df[, experimental_ids]
sample_names <- colnames(experimental_df)
reference_names <- colnames(reference_df)
noiseless_table <- data.frame(row.names=rownames(reads_df))
reference_rounded <- unname(round(round(reference_vector) *
innoculation_cfu / sum(round(reference_vector))))
steps <- seq(from=1, to=length(reference_rounded) * 10, by = 10)
rounded_steps <- get_bottom_vector(reference_rounded, steps)
step_df <- stats::sortedXyData(steps, rounded_steps)
colnames(step_df) <- c("x", "y")
## TableOfEstimates is what the final table is called, and it will
## have different dimentions depending on if you specify a
## calibration curve. This variable is written into your directory
## and important metadata is spit out of the function so you can run
## the resiliency script to stand alone if needed (remembering to
## set plots = TRUE)
estimate_table <- data.frame()
res <- NULL
results <- list()
if (isTRUE(parallel)) {
if (is.null(cpus)) {
total_cpus <- parallel::detectCores()
cpus <- total_cpus - 2
}
message("Starting cluster with: ", cpus, " cores.")
cl <- parallel::makeCluster(cpus)
doParallel::registerDoParallel(cl)
tt <- requireNamespace("parallel")
tt <- requireNamespace("doParallel")
tt <- requireNamespace("iterators")
tt <- requireNamespace("foreach")
res <- foreach(s = 1:length(sample_names),
.packages = c("stampr")) %dopar% {
sample_name <- sample_names[s]
results[[sample_name]] <- calculate_resiliency(
reference_vector, reads_df, cfu_vector, noiseless_table, sample_name, step_df,
do_plots = do_plots, innoculation_cfu = innoculation_cfu,
input_calibration = input_calibration, output_csv = output_csv,
output_noiseless = output_noiseless,
min_weight = min_weight, noise_correction = noise_correction,
verbose = verbose)
}
stopped <- parallel::stopCluster(cl)
## End running multi-threaded
} else {
message("Running samples sequentially one at a time.")
for (s in 1:length(sample_names)) {
sample_name <- sample_names[s]
message("Starting: ", sample_name, ".")
res[[s]] <- calculate_resiliency(
reference_vector, reads_df, cfu_vector, noiseless_table, sample_name, step_df,
do_plots = do_plots, innoculation_cfu = innoculation_cfu,
input_calibration = input_calibration, output_csv = output_csv,
output_noiseless = output_noiseless,
min_weight = min_weight, noise_correction = noise_correction,
verbose = verbose)
}
} ## End single threaded mode
for (s in 1:length(sample_names)) {
sample_estimate <- res[[s]]
sample_name <- sample_names[s]
noiseless_table <- sample_estimate[["noiseless_table"]]
output_vector <- sample_estimate[["output_vector"]]
input_vectorNoiseCorrected = sample_estimate[["input_vectorNoiseCorrected"]]
input_vector <- sample_estimate[["input_vector"]]
table_row <- sample_estimate[["row"]]
estimate_table <- rbind(estimate_table, table_row)
colnames(estimate_table) <- names(table_row)
message("Collected result from ", sample_name, ":")
print(table_row)
}
rownames(estimate_table) <- sample_names
if (isTRUE(verbose)) {
print(estimate_table)
}
if (is.null(input_calibration)) {
colnames(estimate_table) <- c("Nb", "Nr", "Ns")
} else {
colnames(estimate_table) <- c("Nb", "Nr", "NbLower_Cal",
"NbMedian_Cal", "NbUpper_Cal",
"NrLower_Cal", "NrMedian_Cal",
"NrUpper_Cal")
}
if (!is.null(output_csv)) {
write.csv(estimate_table, output_csv)
}
if (!is.null(output_noiseless)) {
write.csv(noiseless_table, output_noiseless)
}
retlist <- list(
"reads_df" = reads_df,
"estimate_table" = estimate_table,
"filtered_table" = noiseless_table,
"reference_vector" = reference_vector,
"sample_names" = sample_names,
"minimum_weight" = min_weight,
"step_df" = step_df)
return(retlist)
}
#' Calculate Nb values on a set of sorted read/tag vectors.
#'
#' This should trivially be able to be deleted in favor of calculate_nb().
sorted_nb <- function(bindsorted, input_vector, output_vector, generations = 1) {
input <- bindsorted[, 1]
out <- bindsorted[, 2]
inputprop <- na.omit(input / sum(input))
outprop <- na.omit(out / sum(out))
num <- (outprop - inputprop) ^ 2
den <- inputprop * (1 - inputprop)
sigma <- num / den
sigma <- sigma[sigma != Inf]
F <- mean(na.omit(sigma))
## note that this Nb is assuming that the total number of reads
## as bindsorted gets trimmed is always equal to the total
## number of reads in the initial sample = otherwise you get
## negative Nbs and the nb_vector variable gets thrown off
nb <- generations / (F - generations / sum(input_vector) - generations / sum(output_vector))
return(nb)
}
#' Calculate the proprotion of each value in the vector with a twist!
#'
#' This is _almost_ a vector version of make_frequency_df, except it
#' takes only the last t values of the vector for the sum. Why? I am
#' not certain.
#'
#' @param t Number of elements on the bottom of the sorted vector from
#' which to calculate the sum which is used as the denominator of the
#' frequencies.
#' @param output_vector The input of reads/tag from which to calculate
#' this peculiar variant of frequencies.
vector_proportion <- function(t, output_vector = NULL) {
if (is.null(output_vector)) {
return(NULL)
}
output_vectorsorted <- sort(output_vector)
topnumbers <- tail(output_vectorsorted, t)
proportion <- sum(topnumbers) / sum(output_vector)
return(proportion)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.