library("hpgltools") library("reticulate") tt <- devtools::load_all("~/hpgltools") knitr::opts_knit$set(width=120, progress=TRUE, verbose=TRUE, echo=TRUE) knitr::opts_chunk$set(error=TRUE, dpi=96) old_options <- options(digits=9, stringsAsFactors=FALSE, knitr.duplicate.label="allow") ggplot2::theme_set(ggplot2::theme_bw(base_size=10)) rundate <- format(Sys.Date(), format="%Y%m%d") previous_file <- "" ver <- format(Sys.Date(), "%Y%m%d") ##tmp <- sm(loadme(filename=paste0(gsub(pattern="\\.Rmd", replace="", x=previous_file), "-v", ver, ".rda.xz"))) rmd_file <- "test_rtisanv9.Rmd"
The following is an email from Nour regarding her use, with her calibration curve data, of the STAMP implementation from a recent paper. The implementation from that paper is on github:
https://github.com/hullahalli/stampr_rtisan
I forked a copy of it in the hope of understanding what they did and seeing where Nour's problems originate. With that in mind, here is her email:
"Here is the data I'm using to run the STAMPR script below. The script runs fine with the ExPEC data in the stampr_rtisan repo, but once I run the getNrNb function on our Pseudomonas calibration curve data, the Nb values do not match what we previously had with the STAMP (version 1) script. I made a few edits in the code to print out the Ns, Nr, and Nb values individually."
In her email, she attached a copy of the complete cfu csv file, the calibration curve reordered file, and an R script which I believe came out of their repository (except with some changes at the top to use the two other files). I am downloading those three files into the directory 'rtisan_inputs':
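The block below is my reconstruction of that step. The attachment URLs are hypothetical placeholders (the files arrived as email attachments, not from any public location); only the two csv filenames are certain, and "nour_rtisan.R" is a stand-in name for the attached script.

```r
## Reconstruction of the download step.  The source URLs are
## hypothetical placeholders, and "nour_rtisan.R" is a stand-in for
## whatever the attached R script is actually named.
input_dir <- "rtisan_inputs"
if (!dir.exists(input_dir)) {
  dir.create(input_dir)
}
attachments <- c("20220228_Complete-CFU.csv",
                 "20220228_STAMP_CAUTI_calcurve_OrderedFrequencies.csv",
                 "nour_rtisan.R")
for (attachment in attachments) {
  download.file(url=paste0("https://example.org/attachments/", attachment),
                destfile=file.path(input_dir, attachment))
}
```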
My understanding is that this should calculate the same Nb values as the reference STAMP implementation (and therefore the same values as my own implementation), followed by a series of estimates which should be more accurate thanks to the filtering applied by the authors.
With all that in mind, I am going to insert the script in the following R block and play with it until I think I understand what is going on. Oh, one more caveat: the csv files are in a weirdo encoding, so I ran dos2unix on them.
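For reproducibility's sake, here is an R stand-in for that dos2unix step, assuming the only problem is CRLF line endings (if the character encoding itself is odd, an iconv() pass would be needed as well):

```r
## Normalize line endings on the input csv files.  readLines() accepts
## LF, CRLF, or CR terminators, and writing back through a binary
## connection guarantees plain LF on every platform.
unixify <- function(filename) {
  content <- readLines(filename, warn=FALSE)
  con <- file(filename, open="wb")
  writeLines(content, con, sep="\n")
  close(con)
}
for (csv in list.files("rtisan_inputs", pattern="\\.csv$", full.names=TRUE)) {
  unixify(csv)
}
```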
My current assumption is that I can find the getNrNb function she mentions in the stampr_rtisan directory. I am going to strip out the function and just run each line, with comments/modifications as I deem them interesting/necessary.
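A quick check of that assumption, sketched below: grep the cloned repository for the function definition. This assumes the repo was cloned into the working directory (e.g. via `git clone https://github.com/hullahalli/stampr_rtisan`).

```r
## Look for the getNrNb definition among the repository's R files.
candidates <- list.files("stampr_rtisan", pattern="\\.R$",
                         recursive=TRUE, full.names=TRUE)
for (file in candidates) {
  hits <- grep("getNrNb", readLines(file, warn=FALSE))
  if (length(hits) > 0) {
    message(file, ": line(s) ", toString(hits))
  }
}
```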
The following block is just my reference copy of the script: the helper functions, the internal resiliency function, and the final calculate_NrNb entry point.
```r
## START
library(tidyverse)

get_bottom_vector <- function(vector, steps) {
  new_vector <- rep(0, length(vector))
  for (i in 1:length(new_vector)) {
    step <- steps[i]
    distribution <- as.numeric(extraDistr::rmvhyper(1, vector, step))
    new_vector[i] <- sum(distribution != 0)
  }
  return(new_vector)
}

## This function takes the values in sorted_guesses and identifies
## their location in the plot of barcode frequencies vs. barcode,
## which is defined by cutoff_positions.
convert_cutoffs <- function(p, output_vector, bind_df) {
  cutoff_position <- length(output_vector) - p
  cutoff_value <- bind_df[cutoff_position + 1, 2]
  if (cutoff_value == 0) {
    cutoff_value <- 1
  }
  ret <- cutoff_value / sum(output_vector)
  return(ret)
}

## The core Nb estimator: F is the mean standardized variance of the
## barcode frequency changes, and Nb is 1 / (F - 1/S_input - 1/S_output).
sorted_nb <- function(bindsorted, input_vector, output_vector) {
  input <- bindsorted[, 1]
  out <- bindsorted[, 2]
  inputprop <- na.omit(input / sum(input))
  outprop <- na.omit(out / sum(out))
  num <- (outprop - inputprop) ^ 2
  den <- inputprop * (1 - inputprop)
  sigma <- num / den
  sigma <- sigma[sigma != Inf]
  F <- mean(na.omit(sigma))
  ## Note that this Nb assumes the total number of reads, as bindsorted
  ## gets trimmed, is always equal to the total number of reads in the
  ## initial sample; otherwise you get negative Nbs and the nb_vector
  ## variable gets thrown off.
  nb <- 1 / (F - 1 / sum(input_vector) - 1 / sum(output_vector))
  return(nb)
}

get_interval_nb <- function(y, nb_vector=NULL) {
  if (is.null(nb_vector)) {
    return(NULL)
  }
  max(nb_vector[1:y])
}

## Outer layer of the minima scanner. The input to this, p, is defined
## later: it is a specific set of numbers that specify all the
## positions at which the minimum finder will start. nb_minimum_search
## is applied over each of these positions.
nb_minimum_search <- function(p, nb_vector) {
  start <- p
  ## This finds minima. It takes p, the initial guess, and looks around
  ## with a specified sd (the newlocation variable). It asks whether
  ## this new value (newstart) is less than start; if so, start is set
  ## to newstart. Repeating this function iteratively moves start
  ## whenever the newstart guess is smaller.
  findmin <- function() {
    where <- which(nb_vector == start)
    newlocation <- abs(rnorm(1, mean=where, sd=length(na.omit(nb_vector)) / 10))
    if (newlocation > length(na.omit(nb_vector))) {
      newlocation <- where
    }
    if (newlocation < 1) {
      newlocation <- where
    }
    newstart <- nb_vector[round(newlocation)]
    if (newstart < start) {
      start <<- newstart
    }
    return(start)
  }
  ## startvector is the output of findmin run 1000 times; start itself
  ## changes to be the last guess. The position of that guess in
  ## nb_vector is defined as decision. Put another way, decision is the
  ## x coordinate on the resiliency graph and start is the y coordinate.
  startvector <- replicate(1000, findmin())
  decision <- which(nb_vector == start)
  return(decision)
}
## End of nb_minimum_search.

vector_proportion <- function(t, output_vector=NULL) {
  if (is.null(output_vector)) {
    return(NULL)
  }
  output_vectorsorted <- sort(output_vector)
  topnumbers <- tail(output_vectorsorted, t)
  proportion <- sum(topnumbers) / sum(output_vector)
  return(proportion)
}

## Internal resiliency function; this is the bulk of the script. It is
## applied over the vector of sample names, so it runs once for every
## non-input sample in your dataset. cfu_df holds the mapping of CFU
## values to the filename which was the source of the table of tags.
## minimum_weight and calibration_file were used as globals in the
## original; they are passed explicitly here.
calculate_resiliency <- function(reference_vector, reads_df, cfu_df,
                                 noiseless_table, sample_name, step_df,
                                 minimum_weight=0.03, calibration_file=NULL,
                                 do_plots=FALSE, noise_correction=0.005) {
  ## Specify the input vector (the average of your inputs), the output
  ## (the column corresponding to this particular sample name), and the
  ## CFU (a single value corresponding to the name of the sample).
  input_vector <- reference_vector
  wanted_column <- colnames(reads_df) == sample_name
  output_vector <- reads_df[, wanted_column]
  cfu <- 1E8
  if (!is.null(cfu_df)) {
    wanted_sample <- cfu_df[, 1] == sample_name
    cfu <- cfu_df[wanted_sample, 2]
  }
  ## Noise adjustment.
  if (noise_correction != 0) {
    resampled_input <- extraDistr::rmvhyper(
      1, round(input_vector), round(noise_correction * sum(output_vector)))
    output_vector <- as.numeric(output_vector - resampled_input)
    output_vector[output_vector < 0] <- 0
  }
  ## times specifies how many iterations the resiliency calculation is
  ## run. Too many or too few isn't ideal, but we can constrain it by
  ## CFU: if you have 2 CFU, it will only run 2 times, because it is
  ## pointless to look farther than that. If you have a lot of CFU,
  ## this variable is set to the number of barcodes with > 1 read; 1 is
  ## arbitrary, but we can be quite confident that anything with 1 read
  ## or fewer is noise.
  times <- min(round(cfu + 1), sum(output_vector > 1))
  ## Combine the input and output vectors into one data frame, then
  ## order everything by the output vector. The barcodes with the most
  ## reads in the output end up at the bottom of the data frame.
  bind <- as.data.frame(cbind(input_vector, output_vector))
  bindsorted <- bind[order(bind[, 2]), ]
  ## This logical puts in a 0 for all barcodes in the output where we
  ## initially define noise.
  ## if (times < length(output_vector)) {
  ##   bindsorted[, 2][1:(length(output_vector) - times)] <- 0
  ## }
  ## As the script runs, bindsorted itself is changed, so keep a couple
  ## of copies of it for later use.
  bindsortedcopy <- bindsorted
  bindsortedcopy2 <- bindsortedcopy
  ## Compute Nb, drop the lowest-count barcode, and repeat for the
  ## number of iterations specified by times; nb_vector accumulates the
  ## successive estimates.
  nb_vector <- c()
  for (t in 1:times) {
    partial_nb <- sorted_nb(bindsorted, input_vector, output_vector)
    nb_vector <- c(nb_vector, partial_nb)
    bindsorted <- bindsorted[1:(nrow(bindsorted) - 1), ]
  }
  ## Here we define p, which first requires defining q; q is set to be
  ## 1/15 of the total number of barcodes. So if nb_vector is 1500
  ## values long, q will be 1500/15 = 100 numbers long (1, 15, 30, 45,
  ## ..., 1500). Correspondingly, p is the value of nb_vector at each
  ## of these positions.
  q <- round(seq(1, length(na.omit(nb_vector)),
                 length.out=length(na.omit(nb_vector)) / 15))
  p <- nb_vector[q]
  ## guesses holds all the final decisions reached from starting at
  ## each value of p. The unique guesses are taken and sorted.
  guesses <- map_dbl(p, nb_minimum_search, nb_vector=nb_vector)
  sorted_guesses <- sort(unique(c(guesses)))
  sorted_guesses <- c(sorted_guesses, length(na.omit(nb_vector)))
  ## There is sometimes some weirdness at the end of the run which
  ## produces decisions really close together. Take anything within 5
  ## values of the end of the run, set it to the end, and unique the
  ## sorted list again.
  sorted_guesses[sorted_guesses > max(sorted_guesses) - 5] <- max(sorted_guesses)
  sorted_guesses <- unique(sorted_guesses)
  ## Identify the greatest log change.
  xdif <- c(log(nb_vector[1]), log(nb_vector))
  xdif <- xdif[1:(length(xdif) - 1)]
  xsub <- log(nb_vector) - xdif
  greatest_delta <- which(xsub == max(xsub)) - 1
  sorted_guesses <- sort(unique(c(sorted_guesses, greatest_delta)))
  sorted_guesses <- sorted_guesses[sorted_guesses != 0]
  ## Sometimes guesses land within one unit of each other, which is
  ## problematic because it looks like a big weight jump (only one
  ## barcode is in that group). Make sure each guess is at least 2
  ## barcodes apart.
  staggered_guesses <- c(-10000, sorted_guesses)
  difference <- c(sorted_guesses, 10000) - staggered_guesses
  if (min(difference) == 1) {
    removed_guess <- max(which(difference == 1)) - 1
    sorted_guesses <- sorted_guesses[-removed_guess]
  }
  ## Once we have the guesses, make the table describing the weights
  ## assigned to each guess: for each value in sorted_guesses, identify
  ## the fraction of reads accounted for by all barcodes with more
  ## reads than that position. For example, if one of the guesses was
  ## 15, and position 15 was a barcode with 100 reads, this determines
  ## the fraction of all reads accounted for by barcodes with at least
  ## 100 reads.
  fraction_accounted <- map_dbl(sorted_guesses, vector_proportion,
                                output_vector=output_vector)
  ## Adjust fraction_accounted so that it holds the fraction of reads
  ## accounted for between guesses, not just from each guess upwards.
  staggered <- c(0, fraction_accounted)[1:length(fraction_accounted)]
  subtracted <- fraction_accounted - staggered
  ## Obtain the maximum Nb up to each location defined in sorted_guesses.
  nb_intervals <- map_dbl(sorted_guesses, get_interval_nb, nb_vector=nb_vector)
  ## Finally, combine these into the indices table; the headings
  ## explain what each value means. We have now subset our data into
  ## discrete segments separated by local minima.
  indices <- data.frame(nb_intervals, subtracted, sorted_guesses)
  colnames(indices) <- c("Nr", "Accounts for", "Number of barcodes")
  ## These logicals define the start of noise (initial_noise) as the
  ## location of greatest log change in the "Accounts for" column, with
  ## a bit more housekeeping.
  weights <- log(indices[, 2])
  values <- indices[, 1]
  subtracted_weights <- c(weights[2:length(values)], 0)
  weights_delta <- (weights - subtracted_weights)[1:(length(weights) - 1)]
  if (length(weights_delta) == 0) {
    weights_delta <- values
  }
  initial_noise <- indices[, 3][which(weights_delta == max(weights_delta))]
  initial_noisecopy <- initial_noise
  ## If there are no populations below the minimum weight threshold,
  ## set noise to be the end of the last detected population.
  if (is.na(indices[, 3][min(which(indices[, 2] < minimum_weight))])) {
    initial_noise <- max(indices[, 3])
  }
  ## If the summed weights of the populations after the start of noise
  ## exceed the minimum weight threshold, move the start to where the
  ## rest of the reads fall under the minimum weight.
  cutoff_position <- min(which(cumsum(indices[, 2]) > (1 - minimum_weight)))
  if (sum(indices[, 2][which(indices[, 3] > initial_noise)]) > minimum_weight) {
    initial_noise <- indices[, 3][cutoff_position]
  }
  ## Plot nb_vector and the barcodes if requested.
  if (isTRUE(do_plots)) {
    par(mfrow=c(3, 1))
    plot(1:times, nb_vector, log="y", ylim=c(0.01, 2E6),
         xlim=c(1, length(as.numeric(na.omit(nb_vector)))),
         main="Resiliency", ylab="Nb", xlab="Iteration")
    abline(v=sorted_guesses)
    plot(output_vector / sum(output_vector), log="y", ylim=c(1E-6, 1),
         main="Barcodes", ylab="Frequency", xlab="Barcode")
  }
  ## Take the values in sorted_guesses and identify their locations in
  ## the plot of barcode frequencies vs. barcode.
  cutoff_positions <- map_dbl(sorted_guesses, convert_cutoffs,
                              output_vector=output_vector,
                              bind_df=bindsortedcopy)
  ## Redefine the output vector such that everything after the start of
  ## noise is set to 0. Note that this is done on bindsortedcopy2, not
  ## bindsorted or bindsortedcopy.
  noise_length <- dim(bindsortedcopy2)[1] - initial_noise
  bindsortedcopy2[1:noise_length, 2] <- 0
  output_vectorwithoutnoise <- as.numeric(bindsortedcopy2[, 2])
  ## The resiliency calculation is run again, this time with the new
  ## noise-less output vector; final_nb_vector is functionally the same
  ## as nb_vector, except computed after the noise correction.
  final_nb_vector <- c()
  iterations <- sum(bindsortedcopy2[, 2] != 0)
  for (i in 1:iterations) {
    partial_nb <- sorted_nb(bindsortedcopy2, input_vector, output_vector)
    final_nb_vector <- c(final_nb_vector, partial_nb)
    bindsortedcopy2 <- bindsortedcopy2[1:(nrow(bindsortedcopy2) - 1), ]
  }
  ## Nr is defined by the final value. It is further constrained to
  ## ensure that it is greater than Nb (recall that the original Nb
  ## estimate is nb_vector[1]).
  final_value <- max(final_nb_vector)
  if (final_value < nb_vector[1]) {
    final_value <- nb_vector[1]
  }
  ## Adjustment for when Nb < the identified number of barcodes above
  ## noise. In that case, the number of barcodes is a better estimate,
  ## so we change Nr to reflect this. computed_bottleneck is Ns.
  min_nearest_times <- max(indices[, 3][indices[, 3] <= initial_noise])
  computed_bottleneck <- stats::NLSstClosestX(step_df, min_nearest_times)
  if (final_value < computed_bottleneck) {
    final_value <- computed_bottleneck
  }
  ## Plot the new resiliency curve if needed.
  if (isTRUE(do_plots)) {
    abline(h=cutoff_positions)
    print(indices)
    print(final_value)
    plot(final_nb_vector, log="y", ylim=c(0.01, 2E6),
         xlim=c(1, length(as.numeric(na.omit(final_nb_vector)))),
         main="Resiliency without noise", ylab="Nb", xlab="Iteration")
  }
  if (nb_vector[1] < 1) {
    nb_vector[1] <- 1
  }
  if (final_value < 1) {
    final_value <- 1
  }
  if (!is.null(calibration_file)) {
    lookup_table <- read.csv(calibration_file, row.names=1)
    nbcalibrated <- as.numeric(lookup_table[as.character(
      round(log10(nb_vector[1]), digits=2)), ])
    nrcalibrated <- as.numeric(lookup_table[as.character(
      round(log10(final_value), digits=2)), ])
  } else {
    nbcalibrated <- NA
    nrcalibrated <- NA
  }
  number_zeros <- sum(output_vectorwithoutnoise == 0)
  if (number_zeros > 0) {
    output_vector[order(output_vector)][1:number_zeros] <- 0
  }
  noiseless_table <- data.frame(noiseless_table, output_vector)
  colnames(noiseless_table)[length(colnames(noiseless_table))] <- sample_name
  row <- c(nb_vector[1], final_value, nbcalibrated, nrcalibrated,
           computed_bottleneck)
  names(row) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")
  retlist <- list(
    "noiseless_table" = noiseless_table,
    "output_vector" = output_vector,
    "input_vectorNoiseCorrected" = input_vector,
    "input_vector" = reference_vector,
    "row" = row)
  return(retlist)
}

calculate_NrNb <- function(reads_csv, cfu_csv, innoculation_cfu,
                           reference_samples, minimum_weight,
                           noise_correction=0.005,
                           output_csv="calibration_curve.csv",
                           calibration_file=NULL, do_plots=FALSE,
                           noiseless_csv="FrequenciesWithoutNoise.csv",
                           chosen_seed=1) {
  set.seed(chosen_seed)
  ## Set up the necessary metadata and split the table of reads into
  ## inputs and outputs.
  reads_df <- read.csv(reads_csv, row.names=1)
  cfu_df <- NULL
  if (!is.null(cfu_csv)) {
    cfu_df <- read.csv(cfu_csv)
  }
  reference_vector <- rowMeans(cbind(reads_df[, reference_samples]))
  sample_names <- colnames(reads_df)[-reference_samples]
  ## Make a table for estimating the bottleneck from the fraction of
  ## barcodes that are identified.
  noiseless_table <- data.frame(row.names=rownames(reads_df))
  reference_rounded <- unname(round(round(reference_vector) * innoculation_cfu /
                                      sum(round(reference_vector))))
  steps <- seq(from=1, to=length(reference_rounded) * 10, by=10)
  rounded_steps <- get_bottom_vector(reference_rounded, steps)
  step_df <- stats::sortedXyData(steps, rounded_steps)
  colnames(step_df) <- c("x", "y")
  ## estimate_table is the final table; it will have different
  ## dimensions depending on whether you specify a calibration curve.
  ## It is written into your directory, and important metadata is
  ## returned from the function so that the resiliency function can be
  ## run standalone if needed (remembering to set do_plots=TRUE).
  estimate_table <- data.frame()
  for (s in 1:length(sample_names)) {
    sample_name <- sample_names[s]
    message("Working on sample: ", sample_name, ".")
    sample_estimate <- calculate_resiliency(
      reference_vector, reads_df, cfu_df, noiseless_table, sample_name,
      step_df, minimum_weight=minimum_weight,
      calibration_file=calibration_file, do_plots=do_plots,
      noise_correction=noise_correction)
    print(sample_estimate[["row"]])
    noiseless_table <- sample_estimate[["noiseless_table"]]
    output_vector <- sample_estimate[["output_vector"]]
    input_vectorNoiseCorrected <- sample_estimate[["input_vectorNoiseCorrected"]]
    input_vector <- sample_estimate[["input_vector"]]
    table_row <- sample_estimate[["row"]]
    estimate_table <- rbind(estimate_table, table_row)
    colnames(estimate_table) <- names(table_row)
  }
  rownames(estimate_table) <- sample_names
  print(estimate_table)
  if (!is.null(calibration_file)) {
    colnames(estimate_table) <- c("Nb", "Nr", "NbLower_Cal", "NbMedian_Cal",
                                  "NbUpper_Cal", "NrLower_Cal",
                                  "NrMedian_Cal", "NrUpper_Cal")
  } else {
    colnames(estimate_table) <- c("Nb", "Nr", "Nb_cal", "Nr_cal", "Ns")
  }
  write.csv(estimate_table, output_csv)
  write.csv(noiseless_table, noiseless_csv)
  retlist <- list(
    "reads_df" = reads_df,
    "estimate_table" = estimate_table,
    "filtered_table" = noiseless_table,
    "reference_vector" = reference_vector,
    "sample_names" = sample_names,
    "minimum_weight" = minimum_weight,
    "step_df" = step_df)
  return(retlist)
}
```
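Before feeding in Nour's data, a tiny self-contained sanity check of the core estimator seems worthwhile (this is my addition, not part of the script): simulate a bottleneck of known size from a uniform input and see whether sorted_nb recovers it.

```r
## Toy check of the Nb estimator.  Simulate 1000 barcodes at equal
## frequency, pass them through a bottleneck of 500 founders with
## rmvhyper, "regrow" to 1e6 reads by multinomial resampling, and ask
## sorted_nb for its estimate.
set.seed(1)
n_barcodes <- 1000
input_vector <- rep(1000, n_barcodes)
founders <- as.numeric(extraDistr::rmvhyper(1, input_vector, 500))
output_vector <- as.numeric(rmultinom(1, 1e6, founders / sum(founders)))
bind <- as.data.frame(cbind(input_vector, output_vector))
bindsorted <- bind[order(bind[, 2]), ]
sorted_nb(bindsorted, input_vector, output_vector)
## The estimate should land on the order of the 500 founders.
```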
```r
## The CFU table; by the time it is used, its first column has been
## changed to match the column names in the reads table.
cfu_csv <- "rtisan_inputs/20220228_Complete-CFU.csv"
## This is the table of tags to number of observations.
reads_csv <- "rtisan_inputs/20220228_STAMP_CAUTI_calcurve_OrderedFrequencies.csv"
## The bacterial concentration when inoculated for the calibration curve.
innoculation_cfu <- 1e7
## Nour reorganized the read table to move these three samples to the front.
reference_samples <- c(1, 2, 3)
## The next arguments I do not know.
minimum_weight <- 0.03
noise_correction <- 0.005
output_csv <- "final_calibration_curve.csv"
calibration_file <- NULL
do_plots <- FALSE
chosen_seed <- 1
result <- calculate_NrNb(reads_csv=reads_csv, cfu_csv=cfu_csv,
                         output_csv=output_csv,
                         innoculation_cfu=innoculation_cfu,
                         reference_samples=reference_samples,
                         minimum_weight=minimum_weight,
                         noise_correction=noise_correction)
result[["estimate_table"]]
```
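One immediate suspect for the mismatch with the STAMP v1 numbers is the stochastic noise subtraction: it resamples the input with rmvhyper before Nb is ever computed, and STAMP v1 does nothing comparable. A quick way to test that hypothesis, sketched below under the assumption that everything else is held fixed, is to rerun with the correction disabled and compare the Nb columns:

```r
## Hypothesis check: does the rmvhyper-based noise subtraction explain
## the disagreement with the STAMP v1 Nb values?
result_nonoise <- calculate_NrNb(reads_csv=reads_csv, cfu_csv=cfu_csv,
                                 output_csv="final_calibration_curve_nonoise.csv",
                                 innoculation_cfu=innoculation_cfu,
                                 reference_samples=reference_samples,
                                 minimum_weight=minimum_weight,
                                 noise_correction=0)
cbind("with_correction" = result[["estimate_table"]][, "Nb"],
      "without_correction" = result_nonoise[["estimate_table"]][, "Nb"])
```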
```
                 Nb           Nr Nb_cal Nr_cal          Ns
A1.txt 1.291928e+02 2.914526e+02     NA     NA     5.00000
A2.txt 4.857502e+02 4.857502e+02     NA     NA    44.33333
A3.txt 5.572276e+03 5.572276e+03     NA     NA  2421.00000
A4.txt 4.198916e+04 4.198916e+04     NA     NA  1506.00000
A5.txt 1.740399e+05 1.740399e+05     NA     NA 83201.00000
A6.txt 5.747539e+05 5.747539e+05     NA     NA 83201.00000
A7.txt 1.000000e+00 1.029965e+07     NA     NA 83201.00000
B2.txt 6.282483e+02 6.282483e+02     NA     NA    26.71429
B3.txt 4.233901e+03 4.233901e+03     NA     NA   401.00000
B4.txt 3.652900e+04 8.320100e+04     NA     NA 83201.00000
B5.txt 2.038735e+05 2.038735e+05     NA     NA 83201.00000
B6.txt 1.663816e+06 1.663816e+06     NA     NA 83201.00000
B7.txt 1.429455e+06 1.429455e+06     NA     NA 83201.00000
C1.txt 8.648846e+01 8.648846e+01     NA     NA     7.00000
C2.txt 1.129677e+03 1.129677e+03     NA     NA   125.00000
C3.txt 1.162878e+04 1.801100e+04     NA     NA 18011.00000
C4.txt 1.225763e+05 1.225763e+05     NA     NA  2701.00000
C5.txt 1.321329e+06 1.393901e+06     NA     NA 83201.00000
C6.txt 1.000000e+00 4.167221e+06     NA     NA 83201.00000
C7.txt 1.000000e+00 2.056423e+06     NA     NA 83201.00000
```
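Since this is a calibration curve, the Nb estimates ought to increase roughly monotonically across each dilution series; a quick plot (my own check, not part of the script) makes the problem samples easier to spot, and the rows where Nb collapsed to 1 (A7, C6, C7) jump out immediately:

```r
## Eyeball the calibration curve: log10(Nb) and log10(Nr) per sample.
est <- result[["estimate_table"]]
plot(log10(est[, "Nb"]), type="b", xaxt="n", ylab="log10 estimate",
     xlab="Sample", main="Calibration curve estimates")
lines(log10(est[, "Nr"]), type="b", col="red")
axis(1, at=seq_len(nrow(est)), labels=rownames(est), las=2, cex.axis=0.6)
legend("topleft", legend=c("Nb", "Nr"), col=c("black", "red"), lty=1)
```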
```r
pander::pander(sessionInfo())
message(paste0("This is hpgltools commit: ", get_git_commit()))
this_save <- paste0(gsub(pattern="\\.Rmd", replace="", x=rmd_file), "-v", ver, ".rda.xz")
message(paste0("Saving to ", this_save))
tmp <- sm(saveme(filename=this_save))
```