library("hpgltools")
library("reticulate")
tt <- devtools::load_all("~/hpgltools")
knitr::opts_knit$set(width=120,
                     progress=TRUE,
                     verbose=TRUE,
                     echo=TRUE)
knitr::opts_chunk$set(error=TRUE,
                      dpi=96)
old_options <- options(digits=9,
                       stringsAsFactors=FALSE,
                       knitr.duplicate.label="allow")
ggplot2::theme_set(ggplot2::theme_bw(base_size=10))
rundate <- format(Sys.Date(), format="%Y%m%d")
previous_file <- ""
ver <- format(Sys.Date(), "%Y%m%d")

##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz")))
rmd_file <- "test_rtisanv9.Rmd"

A query from Nour

The following is an email from Nour regarding her use of the STAMPR implementation from a recent paper with her calibration curve data. The implementation from that paper is on github:

https://github.com/hullahalli/stampr_rtisan

I forked a copy of it in the hopes that I can understand what they did and see where Nour's problems are originating. With that in mind, here is her email:

"Here is the data I'm using to run the STAMPR script below. The script runs fine with the ExPEC data in the stampr_rtisan repo, but once I run the getNrNb function on our Pseudomonas calibration curve data, the Nb values do not match what we previously had with the STAMP (version 1) script. I made a few edits in the code to print out the Ns, Nr, and Nb values individually."

In her email, she attached a copy of the complete CFU csv file, the reordered calibration-curve file, and an R script which I believe came out of their repository (except for some changes at the top to use the two other files). I am downloading those three files into the directory 'rtisan_inputs':
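(The download itself happened by hand from the email; the sketch below just sets up the directory and checks for the two csv inputs whose names are used later in this document.)

if (!dir.exists("rtisan_inputs")) {
  dir.create("rtisan_inputs")
}
file.exists("rtisan_inputs/20220228_Complete-CFU.csv")
file.exists("rtisan_inputs/20220228_STAMP_CAUTI_calcurve_OrderedFrequencies.csv")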

My understanding is that it should calculate the same Nb values as the reference STAMP implementation (and therefore the same values as my own implementation), followed by a series of values which should be more accurate due to the filtering applied by the authors.

With all that in mind, I am going to insert the script in the following R block and play with it until I think I understand what is going on. Oh, one more caveat: the csv files are in a weirdo encoding, so I ran dos2unix on them.
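For the record, the line-ending cleanup was nothing more than the stock dos2unix invocation, something along these lines (assuming dos2unix is installed and on the PATH):

system("dos2unix rtisan_inputs/*.csv")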

My current assumption is that I can find this function in the stampr_rtisan directory. I am going to strip out the function and just run each line with comments/modifications as I deem them interesting/necessary.

The following block is just my reference.

Version 8

Final internal function.

## START
library(tidyverse)
get_bottom_vector <- function(vector, steps) {
  new_vector <- rep(0, length(vector))
  for (i in 1:length(new_vector)) {
    step <- steps[i]
    distribution <- as.numeric(extraDistr::rmvhyper(1, vector, step))
    new_vector[i] <- sum(distribution != 0)
  }
  return(new_vector)
}
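
## (Illustration, not part of the original script.) For a made-up reference
## vector, get_bottom_vector() reports how many distinct barcodes show up in
## progressively larger hypergeometric draws. Note that steps needs at least as
## many entries as the vector, since the loop indexes steps by barcode
## position; extraDistr must be installed for rmvhyper().
toy_reference <- c(50, 30, 10, 5, 5)
toy_steps <- c(1, 5, 20, 50, 100)
get_bottom_vector(toy_reference, toy_steps)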

## This function takes the values in sorted_guesses and identifies their
## location in the plot of barcode frequencies vs barcode; the results are
## collected as cutoff_positions.
convert_cutoffs <- function(p, output_vector, bind_df) {
  cutoff_position <- length(output_vector) - p
  cutoff_value <- bind_df[cutoff_position + 1, 2]
  if (cutoff_value == 0) {
    cutoff_value <- 1
  }
  ret <- cutoff_value / sum(output_vector)
  return(ret)
}


sorted_nb <- function(bindsorted, input_vector, output_vector) {
  input <- bindsorted[, 1]
  out <- bindsorted[, 2]
  inputprop <- na.omit(input / sum(input))
  outprop <- na.omit(out / sum(out))
  num <- (outprop - inputprop) ^ 2
  den <- inputprop * (1 - inputprop)
  sigma <- num / den
  sigma <- sigma[sigma != Inf]
  F <- mean(na.omit(sigma))
  ## note that this Nb is assuming that the total number of reads
  ## as bindsorted gets trimmed is always equal to the total
  ## number of reads in the initial sample = otherwise you get
  ## negative Nbs and the nb_vector variable gets thrown off
  nb <- 1 / (F - 1 / sum(input_vector) - 1 / sum(output_vector))
  return(nb)
}
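
## (Illustration, not part of the original script.) sorted_nb() implements
## Nb = 1 / (F - 1/sum(input_vector) - 1/sum(output_vector)), where F is the
## mean over barcodes of (p_out - p_in)^2 / (p_in * (1 - p_in)). The numbers
## below are made up, just to show the shape of the inputs:
toy_input <- c(100, 100, 100, 100)
toy_output <- c(10, 0, 200, 190)
toy_bind <- as.data.frame(cbind(toy_input, toy_output))
sorted_nb(toy_bind, toy_input, toy_output)  ## roughly 3.4 founders for these numbers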

get_interval_nb <- function(y, nb_vector=NULL) {
  if (is.null(nb_vector)) {
    return(NULL)
  }
  max(nb_vector[1:y])
}

## Outer layer of the search that scans for minima. The input to this, p, is
## defined later: it is one of a set of Nb values, taken at evenly spaced
## positions along nb_vector, from which the minimum finder starts.
## nb_minimum_search is mapped over each of these starting values.
nb_minimum_search <- function(p, nb_vector) {
  start <- p
  ## This finds minima. It takes p, the initial guess, and looks around it
  ## with a specified sd (the newlocation variable). It asks whether the value
  ## at this new location (newstart) is less than start; if so, start is set
  ## to newstart. Repeating this function iteratively moves start downward
  ## whenever the newstart guess is smaller.
  findmin <- function() {
    where <- which(nb_vector == start)
    newlocation <- abs(rnorm(1, mean=where, sd=length(na.omit(nb_vector)) / 10))
    if (newlocation > length(na.omit(nb_vector))) {
      newlocation <- where
    }
    if (newlocation < 1) {
      newlocation <- where
    }
    newstart <- nb_vector[round(newlocation)]
    if (newstart < start) {
      ## Superassignment is needed here: a plain <- would only rebind a local
      ## copy of start inside findmin(), so the enclosing start (and the
      ## decision below) would never move.
      start <<- newstart
    }
    return(start)
  }

  ## startvector is the resulting output of the findmin function run 1000
  ## times; start also ends up holding the last (smallest) guess. The position
  ## of this guess in nb_vector is defined as decision. Put another way,
  ## decision is the index along the resiliency curve, and start is the Nb
  ## value at that index.
  startvector <- replicate(1000, findmin())
  decision <- which(nb_vector == start)
  return(decision)
} ## End of scanformin
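
## (Illustration, not part of the original script.) With the superassignment
## above in place, starting from the first value of a made-up resiliency curve
## the stochastic walk should settle on the index of the lowest nearby value
## (position 6 in this toy curve); the search is random, so results can vary.
toy_nb <- c(900, 700, 500, 80, 60, 55, 400, 350, 300, 250, 200, 150, 100, 90, 85)
nb_minimum_search(toy_nb[1], toy_nb)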


vector_proportion <- function(t, output_vector=NULL) {
  if (is.null(output_vector)) {
    return(NULL)
  }
  output_vectorsorted <- sort(output_vector)
  topnumbers <- tail(output_vectorsorted, t)
  proportion <- sum(topnumbers) / sum(output_vector)
  return(proportion)
}
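
## (Illustration, not part of the original script.) vector_proportion() gives
## the fraction of all reads held by the top t barcodes; made-up counts:
vector_proportion(2, output_vector=c(5, 1, 200, 40, 80))  ## (200 + 80) / 326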

## Internal resiliency function - this is the bulk of the script. It gets
## applied over the vector of sample names (sample_names in calculate_NrNb
## below), so it is run once for every non-input sample in your dataset.
## cfu_df maps each sample (the filename which was the source of its column
## of tags) to a CFU value.
calculate_resiliency <- function(reference_vector, reads_df, cfu_df, noiseless_table,
                                 sample_name, step_df, do_plots=FALSE, noise_correction=0.005) {
  ## Specifies the input vector (the average of your inputs), the output
  ## vector (the column of reads_df corresponding to this sample name), and
  ## the CFU count (a single value looked up by sample name).
  input_vector <- reference_vector
  wanted_column <- colnames(reads_df) == sample_name
  output_vector <- reads_df[, wanted_column]

  cfu <- 1E8
  if (!is.null(cfu_df)) {
    wanted_sample <- cfu_df[, 1] == sample_name
    cfu <- cfu_df[wanted_sample, 2]
  }

  ## Noise adjustment
  if (noise_correction != 0) {
    resampled_input <- extraDistr::rmvhyper(1, round(input_vector),
                                            round(noise_correction * sum(output_vector)))
    output_vector <- as.numeric(output_vector - resampled_input)
    output_vector[output_vector < 0] <- 0
  }

  ## Times specifies how many iterations the resiliency function is
  ## run. Too many or too few isn't ideal, but we can constrain it
  ## by CFU. So if you have 2 CFU, it will only run 2 times, because
  ## it's pointless to try to look farther than that. If you have a
  ## lot of CFU, this variable is set to the number of barcodes that
  ## have > 1 read. 1 is arbitrary, but we can be really quite
  ## confident that anything 1 or below is noise.
  times <- min(round(cfu + 1), sum(output_vector > 1))

  ## Combines input and output vectors into one data frame, and then
  ## orders everything by the output vector. The barcodes with the
  ## most reads in the output will be at the bottom of the data
  ## frame
  bind <- as.data.frame(cbind(input_vector, output_vector))
  bindsorted <- bind[order(bind[, 2]), ]

  ## This logical puts in a 0 for all barcodes in the output where
  ## we initially define noise.
  # if(times < length(output_vector)) {
  #   bindsorted[, 2][1:(length(output_vector) - times)] <- 0
  # }

  ## As the script is run, bindsorted itself will be changed, so we
  ## make a few copies of it for later use
  bindsortedcopy <- bindsorted
  bindsortedcopy2 <- bindsortedcopy

  ## sorted_nb() is run for the number of iterations given by times; after
  ## each pass the most-abundant remaining barcode (the last row of the
  ## ascending-sorted bindsorted) is dropped before the next estimate.
  nb_vector <- c()
  for (t in 1:times) {
    partial_nb <- sorted_nb(bindsorted, input_vector, output_vector)
    nb_vector <- c(nb_vector, partial_nb)
    bindsorted <- bindsorted[-nrow(bindsorted), ]
  }

  ## Here we define p, which first requires defining q: q is a set of evenly
  ## spaced positions covering nb_vector, roughly one per 15 barcodes. So if
  ## nb_vector is 1500 values long, q will be about 100 positions long
  ## (roughly 1, 16, 31, ... 1500). Correspondingly, p is the value of
  ## nb_vector at each of these positions.
  q <- round(seq(1, length(na.omit(nb_vector)),
                 length.out=length(na.omit(nb_vector)) / 15))
  p <- nb_vector[q]

  ## The guesses variable holds the final decisions reached after starting
  ## from each value of p. The unique guesses are then sorted.
  guesses <- map_dbl(p, nb_minimum_search, nb_vector=nb_vector)
  sorted_guesses <- sort(unique(c(guesses)))
  sorted_guesses <- c(sorted_guesses, length(na.omit(nb_vector)))

  ## There is sometimes some weirdness at the end of the run, and it
  ## makes decisions really close together. This part takes anything
  ## within 5 values of the end of the run and sets it to the
  ## end. This new sorted list is uniqued again.
  sorted_guesses[sorted_guesses > max(sorted_guesses) - 5] <- max(sorted_guesses)
  sorted_guesses <- unique(sorted_guesses)

  ## Identifies greatest log change
  xdif <- c(log(nb_vector[1]), log(nb_vector))
  xdif <- xdif[-length(xdif)]
  xsub <- (log(nb_vector) - xdif)
  greatest_delta <- which(xsub == max(xsub)) - 1
  sorted_guesses <- sort(unique(c(sorted_guesses, greatest_delta)))
  sorted_guesses <- sorted_guesses[sorted_guesses != 0]

  ## Sometimes, guesses will be within one unit and this can be
  ## problematic when it sees this as a big weight jump (because
  ## only one barcode is in that group). This part makes sure that
  ## each guess is at least 2 barcodes apart

  staggered_guesses <- c(-10000, sorted_guesses)
  difference <- c(sorted_guesses, 10000) - staggered_guesses
  if (min(difference) == 1) {
    removed_guess <- max(which(difference == 1)) - 1
    sorted_guesses <- sorted_guesses[-removed_guess]
  }

  ## Once we have the guesses, we need to make the table that describes the
  ## weights assigned to each guess. This takes the sorted_guesses list above
  ## and identifies the fraction of reads accounted for by all barcodes that
  ## have more reads than the one specified by each value in sorted_guesses.
  ## For example, if one of the guesses was 15, and position 15 was a barcode
  ## with 100 reads, this determines the fraction of all reads accounted for
  ## by barcodes with at least 100 reads.
  fraction_accounted <- map_dbl(sorted_guesses, vector_proportion, output_vector=output_vector)

  ## Here we adjust fraction_accounted so that it holds the fraction of reads
  ## accounted for between consecutive guesses, not from each guess upwards.
  staggered <- c(0, fraction_accounted)[1:length(fraction_accounted)]
  subtracted <- fraction_accounted - staggered

  ## Here we obtain the maximum Nb up to each location defined in
  ## sorted_guesses.
  nb_intervals <- map_dbl(sorted_guesses, get_interval_nb, nb_vector=nb_vector)

  ## And finally we combine these into the indices table. The headings explain
  ## what each value means. We have now subsetted our data into discrete
  ## segments separated by local minima.
  indices <- data.frame(nb_intervals, subtracted, sorted_guesses)
  colnames(indices) <- c("Nr", "Accounts for", "Number of barcodes")

  ## These logicals define the start of noise (initial_noise) as the location
  ## of greatest log change in the "accounts for" column, plus a bit of
  ## housekeeping.
  weights <- log(indices[, 2])
  values <- (indices[, 1])

  subtracted_weights <- c(weights[2:length(values)], 0)
  weights_delta <- (weights - subtracted_weights)[-length(weights)]
  if (length(weights_delta) == 0) {
    weights_delta <- values
  }
  initial_noise <- indices[, 3][which(weights_delta == max(weights_delta))]
  initial_noisecopy <- initial_noise

  ## if there are no populations that are below the minimum weight
  ## threshold, set noise to be the end of the last detected
  ## population
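  ## Note (observation while auditing): minimum_weight is not an argument of
  ## calculate_resiliency(); R finds it lexically in the global environment,
  ## so the value set near the bottom of this block is used, and the
  ## minimum_weight argument of calculate_NrNb() below is not forwarded here.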
  if (is.na(indices[, 3][min(which(indices[, 2] < minimum_weight))])) {
    initial_noise <- max(indices[, 3])
  }
  ## If the sum of the weights of the populations after the start of noise is
  ## greater than the minimum weight threshold, move the start of noise to
  ## where the remaining reads fall under the minimum weight.
  cutoff_position <- min(which(cumsum(indices[, 2]) > (1 - minimum_weight)))
  if (sum(indices[, 2][which(indices[, 3] > initial_noise)]) > minimum_weight) {
    initial_noise <- indices[, 3][cutoff_position]
  }

  ## Plots nb_vector and barcodes if specified.
  if (isTRUE(do_plots)) {
    par(mfrow=c(3, 1))
    plot(1:times, nb_vector, log="y", ylim=c(.01, 2E6),
         xlim=c(1, length(as.numeric(na.omit(nb_vector)))),
         main="Resiliency", ylab="Nb", xlab="Iteration")
    abline(v=sorted_guesses)
    plot(output_vector / sum(output_vector), log="y",
         ylim=c(1E-6, 1), main="Barcodes", ylab="Frequency", xlab="Barcode")
  }

  ## convert_cutoffs takes the values in sorted_guesses and identifies their
  ## locations in the plot of barcode frequencies vs barcode, giving
  ## cutoff_positions.
  cutoff_positions <- map_dbl(sorted_guesses, convert_cutoffs,
                              output_vector=output_vector, bind_df=bindsortedcopy)

  ## Now we redefine our output vector such that everything after
  ## the start of noise is set to 0. Note that this is done on
  ## bindsortedcopy2, and not bindsorted or bindsortedcopy
  noise_length <- dim(bindsortedcopy2)[1] - initial_noise
  bindsortedcopy2[1:noise_length, 2] <- 0
  output_vectorwithoutnoise <- as.numeric(bindsortedcopy2[, 2])

  ## The resiliency function is run again, this time on the new noise-less
  ## output vector; the result is final_nb_vector, which is functionally the
  ## same as nb_vector except computed after the noise correction.
  final_nb_vector <- c()
  iterations <- sum(bindsortedcopy2[2] != 0)
  for (i in 1:iterations) {
    partial_nb <- sorted_nb(bindsortedcopy2, input_vector, output_vector)
    final_nb_vector <- c(final_nb_vector, partial_nb)
    bindsortedcopy2 <- bindsortedcopy2[-nrow(bindsortedcopy2), ]
  }

  ## Nr is defined by the final value. It is further constrained to ensure
  ## that it is greater than Nb (recall that the original Nb estimate is
  ## nb_vector[1]).
  final_value <- max(final_nb_vector)
  if (final_value < nb_vector[1]) {
    final_value <- nb_vector[1]
  }

  ## Adjustment for when Nb < the identified number of barcodes
  ## above noise. Here, the number of barcodes is a better
  ## estimate. So we change Nr to reflect this.
  min_nearest_times <- max(indices[, 3][(indices[, 3] <= initial_noise)])
  ## The original uses superassignment (<<-) for both of these, which writes
  ## them into the global environment; for final_value that would also leave
  ## the local copy (and thus the returned Nr) unchanged, so plain assignment
  ## is used here to match the intent described above.
  computed_bottleneck <- stats::NLSstClosestX(step_df, min_nearest_times) ## this is Ns
  if (final_value < computed_bottleneck) {
    final_value <- computed_bottleneck
  }

  ## Plots the new resiliency function if needed
  if (isTRUE(do_plots)) {
    abline(h=cutoff_positions)
    print(indices)
    print(final_value)
    plot(final_nb_vector, log="y", ylim=c(.01, 2E6),
         xlim=c(1, length(as.numeric(na.omit(final_nb_vector)))),
         main="Resiliency without noise", ylab="Nb", xlab="Iteration")
  }
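  ## Note: calibration_file is likewise not an argument of this function; it is
  ## found in the global environment (set to NULL below), so the calibrated
  ## columns come back as NA in this run.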

  if (!is.null(calibration_file)) {
    lookup_table <- read.csv(calibration_file, row.names=1)
  }
  if (nb_vector[1] < 1) {
    nb_vector[1] <- 1
  }
  if (final_value < 1) {
    final_value <- 1
  }
  if (!is.null(calibration_file)) {
    nbcalibrated <- as.numeric(lookup_table[as.character(
        round(log10(nb_vector[1]), digits=2)), ])
    nrcalibrated <- as.numeric(lookup_table[as.character(
        round(log10(final_value), digits=2)), ])
  }
  if (is.null(calibration_file)) {
    nbcalibrated <- NA
    nrcalibrated <- NA
  }
  number_zeros <- sum(output_vectorwithoutnoise == 0)
  output_vector[order(output_vector)][1:number_zeros] <- 0
  noiseless_table <- data.frame(noiseless_table, output_vector)
  colnames(noiseless_table)[length(colnames(noiseless_table))] <- sample_name

  row <- c(nb_vector[1], final_value, nbcalibrated, nrcalibrated, computed_bottleneck)
  names(row) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")

  retlist <- list(
      "noiseless_table" = noiseless_table,
      "output_vector" = output_vector,
      "input_vectorNoiseCorrected" = input_vector,
      "input_vector" = reference_vector,
      "row" = row)
  return(retlist)
}

calculate_NrNb <- function(reads_csv, cfu_csv, innoculation_cfu, reference_samples,
                           minimum_weight, noise_correction=0.005,
                           output_csv="calibration_curve.csv",
                           calibration_file=NULL, do_plots=FALSE,
                           noiseless_csv="FrequenciesWithoutNoise.csv",
                           chosen_seed=1) {
  set.seed(chosen_seed)
  ## These four lines set up the necessary metadata, and split up the
  ## table of reads into inputs and outputs
  reads_df <- read.csv(reads_csv, row.names=1)
  cfu_df <- NULL
  if (!is.null(cfu_csv)) {
    cfu_df <- read.csv(cfu_csv)
  }

  reference_vector <- rowMeans(cbind(reads_df[, reference_samples]))
  sample_names <- colnames(reads_df)[-reference_samples]
  ## Makes a table for estimating bottleneck from fraction of barcodes that are identified
  noiseless_table <- data.frame(row.names=rownames(reads_df))
  reference_rounded <- unname(round(round(reference_vector) *
                                    innoculation_cfu / sum(round(reference_vector))))
  steps <- seq(from=1, to=length(reference_rounded) * 10, by=10)

  rounded_steps <- get_bottom_vector(reference_rounded, steps)
  step_df <- stats::sortedXyData(steps, rounded_steps)
  colnames(step_df) <- c("x", "y")

  ## estimate_table is the final table; it will have different dimensions
  ## depending on whether you specify a calibration curve. It is written into
  ## your directory, and the important metadata is returned from the function
  ## so you can run the resiliency function standalone if needed (remembering
  ## to set do_plots=TRUE).
  estimate_table <- data.frame()
  for (s in 1:length(sample_names)) {
    sample_name <- sample_names[s]
    message("Working on sample: ", sample_name, ".")
    sample_estimate <- calculate_resiliency(
        reference_vector, reads_df, cfu_df,
        noiseless_table,
        sample_name, step_df, do_plots=do_plots,
        noise_correction=noise_correction)
    print(sample_estimate[["row"]])
    noiseless_table <- sample_estimate[["noiseless_table"]]
    output_vector <- sample_estimate[["output_vector"]]
    input_vectorNoiseCorrected <- sample_estimate[["input_vectorNoiseCorrected"]]
    input_vector <- sample_estimate[["input_vector"]]
    table_row <- sample_estimate[["row"]]
    estimate_table <- rbind(estimate_table, table_row)
    colnames(estimate_table) <- names(table_row)
  }
  rownames(estimate_table) <- sample_names
  print(estimate_table)
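  ## Note (observation while auditing): the branch below is not exercised in
  ## this run because calibration_file is NULL, and its 8 column names do not
  ## obviously match the 5-element rows built above; something to revisit if a
  ## calibration file is ever supplied.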
  if (!is.null(calibration_file)) {
    colnames(estimate_table) <- c("Nb", "Nr", "NbLower_Cal",
                                  "NbMedian_Cal", "NbUpper_Cal",
                                  "NrLower_Cal", "NrMedian_Cal",
                                  "NrUpper_Cal")
  }
  if (is.null(calibration_file)) {
    colnames(estimate_table) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")
  }

  write.csv(estimate_table, output_csv)
  write.csv(noiseless_table, noiseless_csv)

  retlist <- list(
      "reads_df" = reads_df,
      "estimate_table" = estimate_table,
      "filtered_table" = noiseless_table,
      "reference_vector" = reference_vector,
      "sample_names" = sample_names,
      "minimum_weight" = minimum_weight,
      "step_df" = step_df)
  return(retlist)
}
## The CFU file; its first column must contain the same sample names used as
## column names in the reads table.
cfu_csv <- "rtisan_inputs/20220228_Complete-CFU.csv"
## This is the table of tags to # observations.
reads_csv <- "rtisan_inputs/20220228_STAMP_CAUTI_calcurve_OrderedFrequencies.csv"
## The bacterial concentration at inoculation for the calibration curve.
innoculation_cfu <- 1e7
## Nour reorganized the read table to move these three samples to the front
reference_samples <- c(1, 2, 3)
## I am not certain what the following arguments control.
minimum_weight <- 0.03
noise_correction <- 0.005
output_csv <- "final_calibration_curve.csv"
calibration_file <- NULL
do_plots <- FALSE
chosen_seed <- 1
result <- calculate_NrNb(reads_csv=reads_csv, cfu_csv=cfu_csv,
                         output_csv=output_csv,
                         innoculation_cfu=innoculation_cfu,
                         reference_samples=reference_samples,
                         minimum_weight=minimum_weight,
                         noise_correction=noise_correction,
                         calibration_file=calibration_file,
                         do_plots=do_plots,
                         chosen_seed=chosen_seed)
result[["estimate_table"]]
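
Since the retlist carries the reference vector, step table, and sample names, a single sample can be re-run through the internal function by hand for a closer look. This is only a sketch: it leans on the minimum_weight and calibration_file globals set above, re-reads the CFU csv, and picks the first sample arbitrarily.

single <- calculate_resiliency(reference_vector=result[["reference_vector"]],
                               reads_df=result[["reads_df"]],
                               cfu_df=read.csv(cfu_csv),
                               noiseless_table=data.frame(row.names=rownames(result[["reads_df"]])),
                               sample_name=result[["sample_names"]][1],
                               step_df=result[["step_df"]],
                               do_plots=TRUE,
                               noise_correction=noise_correction)
single[["row"]]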

Previous result

       Nb            Nr            Nb_cal  Nr_cal  Ns
A1.txt 1.291928e+02  2.914526e+02  NA      NA      5.00000
A2.txt 4.857502e+02  4.857502e+02  NA      NA      44.33333
A3.txt 5.572276e+03  5.572276e+03  NA      NA      2421.00000
A4.txt 4.198916e+04  4.198916e+04  NA      NA      1506.00000
A5.txt 1.740399e+05  1.740399e+05  NA      NA      83201.00000
A6.txt 5.747539e+05  5.747539e+05  NA      NA      83201.00000
A7.txt 1.000000e+00  1.029965e+07  NA      NA      83201.00000
B2.txt 6.282483e+02  6.282483e+02  NA      NA      26.71429
B3.txt 4.233901e+03  4.233901e+03  NA      NA      401.00000
B4.txt 3.652900e+04  8.320100e+04  NA      NA      83201.00000
B5.txt 2.038735e+05  2.038735e+05  NA      NA      83201.00000
B6.txt 1.663816e+06  1.663816e+06  NA      NA      83201.00000
B7.txt 1.429455e+06  1.429455e+06  NA      NA      83201.00000
C1.txt 8.648846e+01  8.648846e+01  NA      NA      7.00000
C2.txt 1.129677e+03  1.129677e+03  NA      NA      125.00000
C3.txt 1.162878e+04  1.801100e+04  NA      NA      18011.00000
C4.txt 1.225763e+05  1.225763e+05  NA      NA      2701.00000
C5.txt 1.321329e+06  1.393901e+06  NA      NA      83201.00000
C6.txt 1.000000e+00  4.167221e+06  NA      NA      83201.00000
C7.txt 1.000000e+00  2.056423e+06  NA      NA      83201.00000

pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))


