R/rtisan.r


#' A partial copying of the getNrNb() function from rtisan.
#'
#' This function takes in the tags data structure I have been using
#' and passes the data to a combination of the Nb calculator I have
#' along with the resiliency filtered Nb values.
#'
#' I do not yet fully understand the various functions/operations
#' performed for the resiliency filtering; as a result, these
#' functions are mostly transcriptions of the originals with some
#' variables renamed and changes to make them easier to read.
#'
#' @param tags The data structure generated by read_idx or
#'  read_csv_tags or read_qiime_otus.  E.g. the list of metadata, tags
#'  by sample, and some administrivia.
#' @param generations Used by calculate_nb
#' @param metadata_prefix Prefix used to build the new metadata column
#'  names (e.g. pre_<prefix>_Nb, <prefix>_Nb, <prefix>_Nr, <prefix>_Ns).
#' @param zero_filter Used by calculate_nb to leave off tags not
#'  observed in the reference samples. I think this is slightly broken
#'  though.
#' @param write_metadata When not null, write out the changes to the
#'  metadata to the provided file.
#' @param innoculation_cfu Used by rtisan, defines a constant for the
#'  inoculation CFU in case it was not provided in the metadata.
#' @param chosen_seed Set the seed for extraDistr so that one can get
#'  the same result two times in a row.
#' @param min_weight The minimum weight constant, not sure what it does yet.
#' @param noise_correction Noise correction constant, not sure what it
#'  does yet.
#' @param input_calibration Input csv file containing calibration
#'  constants.  I do not have one, so I cannot say how it works yet.
#' @param output_csv Write the new Nb values to this file: FIXME Move
#'  this to the metadata.
#' @param output_noiseless Write the noiseless values to this file,
#'  FIXME: also put this into the metadata.
#' @param do_plots Print the diagnostic plots?  FIXME: ggplot-ify
#'  these and potentially get them into the metadata.
#' @param verbose Print some diagnostics while running?
#' @param parallel Run the per-sample resiliency calculations in parallel?
#' @param write When not null, write the final metadata to this file
#'  (xlsx, csv, or tsv depending on the extension).
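#' @return List containing the previous and modified metadata, the new
#'  Nb column name, the Nb/Nr/Ns estimate vectors, and the sample
#'  counts from the tags structure.
#' @examples
#' ## A minimal usage sketch, not from the original source; the csv
#' ## filename is hypothetical and read_csv_tags is assumed to accept
#' ## a bare path to a tag-count csv.
#' \dontrun{
#'  tags <- read_csv_tags("tag_counts.csv")
#'  rtisan <- rtisan_nb_from_tags(tags, generations = 1,
#'                                metadata_prefix = "rtisan",
#'                                output_csv = "rtisan_values.csv")
#'  head(rtisan[["modified_metadata"]])
#' }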
#' @export
rtisan_nb_from_tags <- function(tags, generations = 1,
                                metadata_prefix = "rtisan",
                                zero_filter = FALSE, write_metadata = NULL,
                                innoculation_cfu = 1e8, chosen_seed = 1,
                                min_weight = 0.03, noise_correction = 0.005,
                                input_calibration = NULL,
                                output_csv = "rtisan_values.csv",
                                output_noiseless = "rtisan_noiseless.csv",
                                do_plots = FALSE, verbose = FALSE,
                                parallel = TRUE, write = NULL) {
  read_csv <- tags[["input_csv"]]
  input_metadata <- tags[["modified_metadata"]]
  metadata_backup <- input_metadata
  cfu_vector <- input_metadata[["cfu"]]
  names(cfu_vector) <- rownames(input_metadata)
  reference_samples_idx <- input_metadata[["type"]] == "reference"
  pre_metadata_column <- paste0("pre_", metadata_prefix, "_Nb")
  reference_samples <- rownames(input_metadata)[reference_samples_idx]
  original_result <- nb_from_tags(tags, generations = generations,
                                  metadata_column = pre_metadata_column,
                                  zero_filter = zero_filter, write = write_metadata)
  rtisan_result <- rtisan_nr_nb(read_csv, cfu_vector, innoculation_cfu = innoculation_cfu,
                                chosen_seed = chosen_seed, reference_samples = reference_samples,
                                min_weight = min_weight, noise_correction = noise_correction,
                                input_calibration = input_calibration,
                                output_csv = output_csv, do_plots = do_plots, verbose = verbose,
                                parallel = parallel)
  estimates <- rtisan_result[["estimate_table"]]
  nb_vector <- estimates[["Nb"]]
  nr_vector <- estimates[["Nr"]]
  ns_vector <- estimates[["Ns"]]
  rtisan_nb_column <- paste0(metadata_prefix, "_Nb")
  rtisan_nr_column <- paste0(metadata_prefix, "_Nr")
  rtisan_ns_column <- paste0(metadata_prefix, "_Ns")
  input_metadata[[rtisan_nb_column]] <- NA
  input_metadata[[rtisan_nr_column]] <- NA
  input_metadata[[rtisan_ns_column]] <- NA

  for (r in 1:nrow(estimates)) {
    sample <- rownames(estimates)[r]
    input_metadata[sample, rtisan_nb_column] <- estimates[sample, "Nb"]
    input_metadata[sample, rtisan_nr_column] <- estimates[sample, "Nr"]
    input_metadata[sample, rtisan_ns_column] <- estimates[sample, "Ns"]
  }

  if (!is.null(write)) {
    extension <- tools::file_ext(write)
    if (extension == "xlsx") {
      written <- hpgltools::write_xlsx(data = input_metadata, excel = write)
    } else if (extension == "csv") {
      written <- write.csv(x = input_metadata, file = write)
    } else if (extension == "tsv") {
      written <- write.table(x = input_metadata, file = write, sep = "\t")
    } else {
      message("Dunno what to write.")
    }
  }
  retlist <- list(
    "previous_metadata" = metadata_backup,
    "modified_metadata" = input_metadata,
    "Nb_column" = rtisan_nb_column,
    "Nb_result" = nb_vector,
    "Nr_result" = nr_vector,
    "Ns_result" = ns_vector,
    "sample_counts" = tags[["sample_counts"]])
  return(retlist)
}

#' Internal Resiliency Function.
#'
#' This is run for every experimental or calibration sample in the
#' metadata.
#'
#' @param reference_vector Set of reference reads/tag.
#' @param reads_df Table of the reads/tag for the samples of interest.
#' @param cfu_vector Set of cfu values from which to acquire the
#'  target cfu for this particular sample.
#' @param noiseless_table Input table of noiseless data, currently
#'  unused.
#' @param sample_name Name of this sample, used to extract some data.
#' @param step_df Used to calculate the computed_bottleneck by
#'  stats::NLSstClosestX and generate the Ns value.
#' @param min_weight Constant of the minimum weight!
#' @param input_calibration Input calibration file, currently unused.
#' @param output_csv Output csv file for writing new values.
#' @param output_noiseless Output csv file for the noiseless table.
#' @param innoculation_cfu Assumed cfu when not provided.
#' @param do_plots Print plots?
#' @param noise_correction Constant for correcting noise!
#' @param verbose Print text while running?
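#' @return List containing the updated noiseless table, the
#'  noise-adjusted output vector, the reference vector, and a named
#'  row of Nb/Nr/Ns (and optionally calibrated) estimates for this
#'  sample.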
#' @export
calculate_resiliency <- function(reference_vector, reads_df, cfu_vector, noiseless_table,
                                 sample_name, step_df, min_weight = 0.03,
                                 input_calibration = NULL,
                                 output_csv = "rtisan_values.csv",
                                 output_noiseless = "rtisan_noiseless.csv",
                                 innoculation_cfu = 1e8, do_plots = FALSE,
                                 noise_correction = 0.005, verbose = FALSE) {
  ## Specifies input vector (which is the average of your inputs),
  ## output (the row which corresponds to the particular sample
  ## name), and CFU (a single value corresponding to the name of the
  ## sample)
  input_vector <- reference_vector
  wanted_column <- colnames(reads_df) == sample_name
  output_vector <- reads_df[, wanted_column]

  cfu <- innoculation_cfu
  if (!is.null(cfu_vector)) {
    wanted_sample <- names(cfu_vector) == sample_name
    cfu <- cfu_vector[wanted_sample]
  }

  ## Noise adjustment
  if (noise_correction != 0) {
    resampled_input <- extraDistr::rmvhyper(1, round(input_vector),
                                            round(noise_correction * sum(output_vector)))
    output_vector <- as.numeric(output_vector - resampled_input)
    output_vector[output_vector < 0] <- 0
  }

  ## Times specifies how many iterations the resiliency function is
  ## run. Too many or too few isn't ideal, but we can constrain it
  ## by CFU. So if you have 2 CFU, it will only run 2 times, because
  ## it's pointless to try to look farther than that. If you have a
  ## lot of CFU, this variable is set to the number of barcodes that
  ## have > 1 read. 1 is arbitrary, but we can be really quite
  ## confident that anything 1 or below is noise.
  times <- min(round(cfu + 1), sum(output_vector > 1))

  ## Combines input and output vectors into one data frame, and then
  ## orders everything by the output vector. The barcodes with the
  ## most reads in the output will be at the bottom of the data
  ## frame
  bind <- as.data.frame(cbind(input_vector, output_vector))
  bindsorted <- bind[order(bind[, 2]), ]

  ## This logical puts in a 0 for all barcodes in the output where
  ## we initially define noise.
  # if(times < length(output_vector)) {
  #   bindsorted[, 2][1:(length(output_vector) - times)] <- 0
  # }

  ## As the script is run, bindsorted itself will be changed, so we
  ## make a few copies of it for later use
  bindsortedcopy <- bindsorted
  bindsortedcopy2 <- bindsortedcopy

  ## The original minusone() loop: sorted_nb() is run for the number
  ## of iterations specified by times, appending each partial Nb
  ## estimate to nb_vector and trimming the last row of bindsorted on
  ## each pass.
  nb_vector <- c()
  for (t in 1:times) {
    partial_nb <- sorted_nb(bindsorted, input_vector, output_vector)
    nb_vector <- c(nb_vector, partial_nb)
    bindsorted <- bindsorted[1:nrow(bindsorted) - 1, ]
  }

  ## Here we define p, which first requires defining q, which is set
  ## to be 1/15 of the total number of barcodes. So if this is
  ## 1500 numbers long, q will be 1500/15 = 100 numbers long (1, 15,
  ## 30, 45 ... 1500). Correspondingly, p is the value of nb_vector
  ## at each of these positions.
  q <- round(seq(1, length(na.omit(nb_vector)),
                 length.out = length(na.omit(nb_vector)) / 15))
  p <- nb_vector[q]

  ## The guesses variable is defined as all the final decisions
  ## reached after starting the minimum search at each value of p.
  ## The unique guesses are taken and sorted.
  guesses <- map_dbl(p, nb_minimum_search, nb_vector = nb_vector)
  sorted_guesses <- sort(unique(c(guesses)))
  sorted_guesses <- c(sorted_guesses, length(na.omit(nb_vector)))

  ## There is sometimes some weirdness at the end of the run, and it
  ## makes decisions really close together. This part takes anything
  ## within 5 values of the end of the run and sets it to the
  ## end. This new sorted list is uniqued again.
  sorted_guesses[sorted_guesses > max(sorted_guesses) - 5] <- max(sorted_guesses)
  sorted_guesses <- unique(sorted_guesses)

  ## Identifies greatest log change
  xdif <- c(log(nb_vector[1]), log(nb_vector))
  xdif <- xdif[1:length(xdif) - 1]
  xsub <- (log(nb_vector) - xdif)
  greatest_delta <- which(xsub == max(xsub)) - 1
  sorted_guesses <- sort(unique(c(sorted_guesses, greatest_delta)))
  sorted_guesses <- sorted_guesses[sorted_guesses != 0]

  ## Sometimes, guesses will be within one unit and this can be
  ## problematic when it sees this as a big weight jump (because
  ## only one barcode is in that group). This part makes sure that
  ## each guess is at least 2 barcodes apart

  staggered_guesses <- c(-10000, sorted_guesses)
  difference <- c(sorted_guesses, 10000) - staggered_guesses
  if (min(difference) == 1) {
    removed_guess <- max(which(difference == 1)) - 1
    sorted_guesses <- sorted_guesses[-removed_guess]
  }

  ## Once we have the guesses, we need to make the table that
  ## describes the weights assigned to each guess. This function
  ## takes as an input the sorted_guesses list above and identifies
  ## the fraction of reads accounted for by all barcodes that have
  ## more reads than specified by each value in sorted_guesses. For
  ## example, if one of the guesses was 15, and position 15 was a
  ## barcode with 100 reads, this function will determine the
  ## fraction of all reads accounted for by barcodes with at least
  ## 100 reads.
  fraction_accounted <- map_dbl(sorted_guesses, vector_proportion,
                                output_vector = output_vector)

  ## Here we adjust the fraction_accounted variable so that it takes
  ## in the fraction of accounted reads in between guesses, not just
  ## from that guess upwards
  staggered <- c(0, fraction_accounted)[1:length(fraction_accounted)]
  subtracted <- fraction_accounted - staggered

  ## Here we obtain the maximum Nb up to each location defined
  ## in sorted_guesses
  nb_intervals <- map_dbl(sorted_guesses, get_interval_nb,
                          nb_vector = nb_vector)

  ## And finally we combine these into the indices table. The
  ## headings explain what each value means. We have now subsetted
  ## our data into discrete segments separated by local minima
  indices <- data.frame(nb_intervals, subtracted, sorted_guesses)
  colnames(indices) <- c("Nr", "Accounts for", "Number of barcodes")

  ## These logicals define the start of noise (initial_noise) as the
  ## location of greatest log change in the "accounts for" section,
  ## along with a bit more housekeeping
  weights <- log(indices[, 2])
  values <- (indices[, 1])

  subtracted_weights <- c(weights[2:length(values)], 0)
  weights_delta <- (weights - subtracted_weights)[1:length(weights) - 1]
  if (length(weights_delta) == 0) {
    weights_delta <- values
  }
  initial_noise <- indices[, 3][which(weights_delta == max(weights_delta))]
  initial_noisecopy <- initial_noise

  ## if there are no populations that are below the minimum weight
  ## threshold, set noise to be the end of the last detected
  ## population
  if (is.na(indices[, 3][min(which(indices[, 2] < min_weight))])) {
    initial_noise <- max(indices[, 3])
  }
  ## if the sum of the weights of the population after the start of
  ## noise is greater than the minimum weight threshold, set the
  ## start to be where the rest of the reads after are under the
  ## minimum weight
  cutoff_position <- min(which(cumsum(indices[, 2]) > (1 - min_weight)))
  if (sum(indices[, 2][which(indices[, 3] > initial_noise)]) > min_weight) {
    initial_noise <- indices[, 3][cutoff_position]
  }

  ## Plots nb_vector and barcodes if specified
  if (isTRUE(do_plots)) {
    par(mfrow = c(3, 1))
    plot(1:times, nb_vector, log = "y", ylim = c(.01, 2E6),
         xlim = c(1, length(as.numeric(na.omit(nb_vector)))),
         main = "Resiliency", ylab = "Nb", xlab = "Iteration")
    abline(v = sorted_guesses)
    plot(output_vector / sum(output_vector), log = "y",
         ylim = c(1E-6, 1), main = "Barcodes", ylab = "Frequency", xlab = "Barcode")
  }

  ## This takes the values in sorted_guesses and identifies their
  ## location in the plot of barcode frequencies vs barcode; the
  ## results are stored in cutoff_positions
  cutoff_positions <- map_dbl(sorted_guesses, convert_cutoffs,
                              output_vector = output_vector, bind_df = bindsortedcopy)

  ## Now we redefine our output vector such that everything after
  ## the start of noise is set to 0. Note that this is done on
  ## bindsortedcopy2, and not bindsorted or bindsortedcopy
  noise_length <- dim(bindsortedcopy2)[1] - initial_noise
  bindsortedcopy2[1:noise_length, ][2] <- 0
  output_vectorwithoutnoise <- as.numeric(bindsortedcopy2[, 2])

  ## The resiliency function is run again, this time with the new
  ## noise-less output vector, producing final_nb_vector. It is
  ## functionally the same as nb_vector, except it is computed after
  ## the noise correction.
  final_nb_vector <- c()
  iterations <- sum(bindsortedcopy2[2] != 0)
  for (i in 1:iterations) {
    partial_nb <- sorted_nb(bindsortedcopy2, input_vector, output_vector)
    final_nb_vector <- c(final_nb_vector, partial_nb)
    bindsortedcopy2 <- bindsortedcopy2[1:nrow(bindsortedcopy2) - 1, ]
  }

  ## Nr is defined by the final value. It is further constrained to
  ## ensure that it is greater than Nb (recall that the
  ## original Nb estimate is nb_vector[1])
  final_value <- max(final_nb_vector)
  if (final_value < nb_vector[1]) {
    final_value <- nb_vector[1]
  }

  ## Adjustment for when Nb < the identified number of barcodes
  ## above noise. Here, the number of barcodes is a better
  ## estimate. So we change Nr to reflect this.
  min_nearest_times <- max(indices[, 3][(indices[, 3] <= initial_noise)])
  computed_bottleneck <- stats::NLSstClosestX(step_df, min_nearest_times) ## this is Ns
  if (final_value < computed_bottleneck) {
    final_value <- computed_bottleneck
  }

  ## Plots the new resiliency function if needed
  if (isTRUE(do_plots)) {
    abline(h = cutoff_positions)
    print(indices)
    print(final_value)
    plot(final_nb_vector, log = "y", ylim = c(.01, 2E6),
         xlim = c(1, length(as.numeric(na.omit(final_nb_vector)))),
         main = "Resiliency without noise", ylab = "Nb", xlab = "Iteration")
  }

  if (!is.null(input_calibration)) {
    lookup_table <- read.csv(input_calibration, row.names = 1)
  }
  if (nb_vector[1] < 1) {
    nb_vector[1] <- 1
  }
  if (final_value < 1) {
    final_value <- 1
  }
  if (!is.null(input_calibration)) {
    nbcalibrated <- as.numeric(lookup_table[as.character(
        round(log10(nb_vector[1]), digits = 2)), ])
    nrcalibrated <- as.numeric(lookup_table[as.character(
        round(log10(final_value), digits = 2)), ])
  }
  if (is.null(input_calibration)) {
    nbcalibrated <- NA
    nrcalibrated <- NA
  }
  number_zeros <- sum(output_vectorwithoutnoise == 0)
  if (number_zeros > 0) {
    output_vector[order(output_vector)][seq_len(number_zeros)] <- 0
  }
  noiseless_table <- data.frame(noiseless_table, output_vector)
  colnames(noiseless_table)[length(colnames(noiseless_table))] <- sample_name

  if (is.null(input_calibration)) {
    row <- c(nb_vector[1], final_value, computed_bottleneck)
    names(row) <- c("Nb", "Nr" , "Ns")
  } else {
    row <- c(nb_vector[1], final_value, nbcalibrated, nrcalibrated, computed_bottleneck)
    names(row) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")
  }

  retlist <- list(
      "noiseless_table" = noiseless_table,
      "output_vector" = output_vector,
      "input_vectorNoiseCorrected" = input_vector,
      "input_vector" = reference_vector,
      "row" = row)
  return(retlist)
}

#' I do not really get this function, here is the author's text:
#'
#' This function takes the values in sorted_guesses (guessesunquesorted
#' in the original) and identifies their location in the plot of
#' barcode frequencies vs barcode, which is defined by cutoff_positions.
#'
#' @param p A single guess, i.e. the number of barcodes above the
#'  putative noise.
#' @param output_vector The reads/tag for this sample (definitely not
#'  the input_vector).
#' @param bind_df The sorted data frame of reference/sample reads from
#'  which the cutoff read count is taken.
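#' @examples
#' ## A toy sketch with made-up reads, not from the original source;
#' ## sorted_df mimics the bindsortedcopy frame built in
#' ## calculate_resiliency.
#' \dontrun{
#'  reads <- c(0, 2, 5, 40, 100)
#'  sorted_df <- data.frame(input = rep(10, 5), output = sort(reads))
#'  convert_cutoffs(2, output_vector = reads, bind_df = sorted_df)  ## 40 / 147
#' }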
convert_cutoffs <- function(p, output_vector, bind_df) {
  cutoff_position <- length(output_vector) - p
  cutoff_value <- bind_df[cutoff_position + 1, 2]
  if (cutoff_value == 0) {
    cutoff_value <- 1
  }
  ret <- cutoff_value / sum(output_vector)
  return(ret)
}

#' Given a vector and set of steps, use rmvhyper to make a new vector.
#'
#' This takes an input vector and, for each value in steps, provides a
#' new element which is calculated by doing the following:
#' 1.  Run rmvhyper on the original vector in order to draw that many
#' reads using the multivariate hypergeometric distribution.
#' 2.  Count the number of values from #1 which are not 0.
#' 3.  Put the number from #2 into the corresponding new element.
#' 4.  Finally, return the new vector.
#'
#' It seems to me that there must be a much more efficient way to get
#' this information?
#'
#' @param vector input vector
#' @param steps Set of steps over which to iterate.
#' @return New vector containing, for each step, the number of draws
#'  from the rmvhyper distribution that were not zero.
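#' @examples
#' ## A toy sketch, not from the original source: count how many
#' ## distinct tags are hit when drawing 1, 10, or 50 reads from a
#' ## pool of three tags totalling 100 reference reads.
#' \dontrun{
#'  set.seed(1)
#'  ref_counts <- c(50, 30, 20)
#'  get_bottom_vector(ref_counts, steps = c(1, 10, 50))
#' }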
get_bottom_vector <- function(vector, steps) {
  new_vector <- rep(0, length(vector))
  for (i in 1:length(new_vector)) {
    step <- steps[i]
    distribution <- as.numeric(extraDistr::rmvhyper(1, vector, step))
    new_vector[i] <- sum(distribution != 0)
  }
  return(new_vector)
}

#' Grab the maximum value from a portion of the nb vector.
#'
#' @param y Position up to which the maximum of nb_vector is taken.
#' @param nb_vector Vector of Nb estimates produced by the iterative
#'  trimming.
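#' @examples
#' ## A toy sketch with made-up Nb values.
#' \dontrun{
#'  get_interval_nb(3, nb_vector = c(120, 80, 250, 40))  ## 250
#' }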
get_interval_nb <- function(y, nb_vector = NULL) {
  if (is.null(nb_vector)) {
    return(NULL)
  }
  max(nb_vector[1:y])
}

#' Search for minima...  Another function I don't understand.
#'
#' Here is the text from the authors:
#' Outer layer of the function that scans for minima. The input to
#' this, p, is later defined. p is a specific set of numbers that
#' specify all the positions at which the minimum finder will
#' start. scanformin is applied over each of these positions.
#'
#' Hmm yeah...
#'
#' @param p Starting value, an element of nb_vector from which the
#'  stochastic search begins.
#' @param nb_vector Vector of Nb estimates produced by the iterative
#'  trimming.
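#' @examples
#' ## A sketch using a made-up, monotonically decreasing Nb curve; the
#' ## search is stochastic, so set a seed for reproducibility.
#' \dontrun{
#'  set.seed(1)
#'  nb_curve <- rev(sort(rlnorm(100, meanlog = 6)))
#'  nb_minimum_search(nb_curve[10], nb_vector = nb_curve)
#' }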
nb_minimum_search <- function(p, nb_vector) {
  start <- p
  ## This finds minima. It takes p, the initial guess, and
  ## looks around with a specified sd (the newlocation
  ## variable). It asks if this new value (newstart)  is less than
  ## start. If so, start is set to newstart. Repeating this
  ## function iteratively changes the value of start if the
  ## newstart guess is smaller
  findmin <- function() {
    where <- which(nb_vector == start)
    newlocation <- abs(rnorm(1, mean = where, sd = length(na.omit(nb_vector)) / 10))
    if (newlocation > length(na.omit(nb_vector))) {
      newlocation <- where
    }
    if (newlocation < 1) {
      newlocation <- where
    }
    newstart <- nb_vector[round(newlocation)]
    if (newstart < start) {
      ## Assign with <<- so the start in the enclosing
      ## nb_minimum_search frame is updated across replications.
      start <<- newstart
    }
    return(start)
  }

  ## startvector is the resulting output of the findmin function
  ## run 1000 times. The start variable also changes to be the
  ## last guess. The position of this guess in the nb_vector
  ## variable is defined as decision. Put another way, decision is
  ## the x coordinate of the resiliency graph, and start
  ## is the y coordinate.
  startvector <- replicate(1000, findmin())
  decision <- which(nb_vector == start)
  return(decision)
} ## End of scanformin

#' This is the top-level function from the authors.  I have it
#' wrapped.
#'
#' It sets up the data and iterates over every experimental sample in
#' order to run the calculate_resiliency function on it.
#'
#' @param read_csv CSV file containing the reads/tag, FIXME: use the
#'  tag data structure here to simplify things.
#' @param cfu_vector CFU values for each sample of interest.
#' @param innoculation_cfu Assumed cfu if they are not provided.
#' @param chosen_seed Set the seed you maniacs!
#' @param reference_samples Vector of reference samples, provided by
#'  the metadata via the parent function.
#' @param min_weight Constant of minimum weight...
#' @param noise_correction Constant of noise correction...
#' @param input_calibration CSV of input calibrations.
#' @param do_plots Plot the results?
#' @param verbose Be verbose?
#' @param output_csv Print the new Nb values here.
#' @param output_noiseless Print the noiseless data here...
#' @param cpus Number of cpus to use when running in parallel;
#'  defaults to the number of detected cores minus two.
#' @param parallel Run the per-sample resiliency calculations in parallel?
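#' @return List containing the reads table, the Nb/Nr/Ns estimate
#'  table, the noiseless (filtered) table, the reference vector, the
#'  sample names, the minimum weight, and the step data frame.
#' @examples
#' ## A minimal sketch, not from the original source; the csv path and
#' ## sample/reference names are hypothetical.
#' \dontrun{
#'  cfus <- c("sample_a" = 1e6, "sample_b" = 5e5)
#'  nr_nb <- rtisan_nr_nb("reads_by_tag.csv", cfus,
#'                        reference_samples = c("ref_1", "ref_2", "ref_3"))
#'  nr_nb[["estimate_table"]]
#' }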
rtisan_nr_nb <- function(read_csv, cfu_vector, innoculation_cfu = 1e8, chosen_seed = 1,
                         reference_samples = c(1, 2, 3), min_weight = 0.03,
                         noise_correction = 0.005,
                         input_calibration = NULL, do_plots = FALSE, verbose = FALSE,
                         output_csv = "rtisan_values.csv",
                         output_noiseless = "rtisan_noiseless.csv", cpus=NULL, parallel = TRUE) {
  ## This will eventually be pulled in from the metadata if I incorporate this into my library.
  set.seed(chosen_seed)
  reads_df <- read.csv(read_csv, row.names = 1)
  colnames(reads_df) <- tolower(gsub(pattern = "\\.txt", replacement = "", x=colnames(reads_df)))
  if (is.character(cfu_vector)) {
    ## Assume a character cfu_vector is a path to a csv of CFU values.
    cfu_vector <- read.csv(cfu_vector)[[2]]
  }

  reference_df <- reads_df[, reference_samples]
  reference_vector <- rowMeans(reference_df)
  experimental_ids <- ! colnames(reads_df) %in% reference_samples
  experimental_df <- reads_df[, experimental_ids]
  sample_names <- colnames(experimental_df)
  reference_names <- colnames(reference_df)

  noiseless_table <- data.frame(row.names=rownames(reads_df))
  reference_rounded <- unname(round(round(reference_vector) *
                                    innoculation_cfu / sum(round(reference_vector))))
  steps <- seq(from=1, to=length(reference_rounded) * 10, by = 10)

  rounded_steps <- get_bottom_vector(reference_rounded, steps)
  step_df <- stats::sortedXyData(steps, rounded_steps)
  colnames(step_df) <- c("x", "y")

  ## TableOfEstimates is what the final table is called, and it will
  ## have different dimentions depending on if you specify a
  ## calibration curve. This variable is written into your directory
  ## and important metadata is spit out of the function so you can run
  ## the resiliency script to stand alone if needed (remembering to
  ## set plots = TRUE)
  estimate_table <- data.frame()
  res <- NULL
  results <- list()
  if (isTRUE(parallel)) {
    if (is.null(cpus)) {
      total_cpus <- parallel::detectCores()
      cpus <- total_cpus - 2
    }
    message("Starting cluster with: ", cpus, " cores.")
    cl <- parallel::makeCluster(cpus)
    doParallel::registerDoParallel(cl)
    tt <- requireNamespace("parallel")
    tt <- requireNamespace("doParallel")
    tt <- requireNamespace("iterators")
    tt <- requireNamespace("foreach")
    res <- foreach(s = 1:length(sample_names),
                   .packages = c("stampr")) %dopar% {
                     sample_name <- sample_names[s]
                     results[[sample_name]] <- calculate_resiliency(
                         reference_vector, reads_df, cfu_vector, noiseless_table, sample_name, step_df,
                         do_plots = do_plots, innoculation_cfu = innoculation_cfu,
                         input_calibration = input_calibration, output_csv = output_csv,
                         output_noiseless = output_noiseless,
                         min_weight = min_weight, noise_correction = noise_correction,
                         verbose = verbose)
                   }
    stopped <- parallel::stopCluster(cl)
    ## End running multi-threaded
  } else {
    message("Running samples sequentially one at a time.")
    for (s in 1:length(sample_names)) {
      sample_name <- sample_names[s]
      message("Starting: ", sample_name, ".")
      res[[s]] <- calculate_resiliency(
          reference_vector, reads_df, cfu_vector, noiseless_table, sample_name, step_df,
          do_plots = do_plots, innoculation_cfu = innoculation_cfu,
          input_calibration = input_calibration, output_csv = output_csv,
          output_noiseless = output_noiseless,
          min_weight = min_weight, noise_correction = noise_correction,
          verbose = verbose)
    }
  } ## End single threaded mode

  for (s in 1:length(sample_names)) {
    sample_estimate <- res[[s]]
    sample_name <- sample_names[s]
    noiseless_table <- sample_estimate[["noiseless_table"]]
    output_vector <- sample_estimate[["output_vector"]]
    input_vectorNoiseCorrected <- sample_estimate[["input_vectorNoiseCorrected"]]
    input_vector <- sample_estimate[["input_vector"]]
    table_row <- sample_estimate[["row"]]
    estimate_table <- rbind(estimate_table, table_row)
    colnames(estimate_table) <- names(table_row)
    message("Collected result from ", sample_name, ":")
    print(table_row)
  }
  rownames(estimate_table) <- sample_names
  if (isTRUE(verbose)) {
    print(estimate_table)
  }
  if (is.null(input_calibration)) {
    colnames(estimate_table) <- c("Nb", "Nr", "Ns")
  } else {
    colnames(estimate_table) <- c("Nb", "Nr", "NbLower_Cal",
                                  "NbMedian_Cal", "NbUpper_Cal",
                                  "NrLower_Cal", "NrMedian_Cal",
                                  "NrUpper_Cal")
  }

  if (!is.null(output_csv)) {
    write.csv(estimate_table, output_csv)
  }
  if (!is.null(output_noiseless)) {
    write.csv(noiseless_table, output_noiseless)
  }

  retlist <- list(
      "reads_df" = reads_df,
      "estimate_table" = estimate_table,
      "filtered_table" = noiseless_table,
      "reference_vector" = reference_vector,
      "sample_names" = sample_names,
      "minimum_weight" = min_weight,
      "step_df" = step_df)
  return(retlist)
}

#' Calculate Nb values on a set of sorted read/tag vectors.
#'
#' This should trivially be able to be deleted in favor of calculate_nb().
#'
#' @param bindsorted Two-column data frame of reference (input) and
#'  sample (output) reads, sorted by the output column.
#' @param input_vector Full reference read vector, used for the
#'  read-depth correction terms.
#' @param output_vector Full sample read vector, used for the
#'  read-depth correction terms.
#' @param generations Number of generations assumed between input and
#'  output.
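#' @details A sketch of the estimator as implemented below: with tag
#'  frequencies f_in and f_out, F is the mean over tags of
#'  (f_out - f_in)^2 / (f_in * (1 - f_in)), and
#'  Nb = generations / (F - generations/sum(input_vector) -
#'  generations/sum(output_vector)).
#' @examples
#' ## A toy sketch with made-up reference and sample reads, sorted by
#' ## the sample (output) column.
#' \dontrun{
#'  toy <- data.frame(input = c(100, 100, 100), output = c(10, 50, 200))
#'  sorted_nb(toy, input_vector = toy$input, output_vector = toy$output)
#' }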
sorted_nb <- function(bindsorted, input_vector, output_vector, generations = 1) {
  input <- bindsorted[, 1]
  out <- bindsorted[, 2]
  inputprop <- na.omit(input / sum(input))
  outprop <- na.omit(out / sum(out))
  num <- (outprop - inputprop) ^ 2
  den <- inputprop * (1 - inputprop)
  sigma <- num / den
  sigma <- sigma[sigma != Inf]
  F <- mean(na.omit(sigma))
  ## note that this Nb is assuming that the total number of reads
  ## as bindsorted gets trimmed is always equal to the total
  ## number of reads in the initial sample; otherwise you get
  ## negative Nbs and the nb_vector variable gets thrown off
  nb <- generations / (F - generations / sum(input_vector) - generations / sum(output_vector))
  return(nb)
}

#' Calculate the proportion of each value in the vector with a twist!
#'
#' This is _almost_ a vector version of make_frequency_df, except it
#' takes only the top t values of the sorted vector for the
#' numerator.  Why?  I am not certain.
#'
#' @param t Number of elements at the top of the sorted vector whose
#'  sum is used as the numerator of the proportion; the denominator is
#'  the sum of the whole vector.
#' @param output_vector The input of reads/tag from which to calculate
#'  this peculiar variant of frequencies.
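#' @examples
#' ## A toy sketch: fraction of all reads held by the two most
#' ## abundant tags.
#' \dontrun{
#'  reads <- c(5, 1, 100, 40, 3)
#'  vector_proportion(2, output_vector = reads)  ## (100 + 40) / 149
#' }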
vector_proportion <- function(t, output_vector = NULL) {
  if (is.null(output_vector)) {
    return(NULL)
  }
  output_vectorsorted <- sort(output_vector)
  topnumbers <- tail(output_vectorsorted, t)
  proportion <- sum(topnumbers) / sum(output_vector)
  return(proportion)
}