R/ModifyData.R

Defines functions qfa.edgecultures normalisePlates qfa.normalize

Documented in normalisePlates qfa.normalize

## normalisation functions -----

#' Function to normalize QFA data
#'
#' This function normalizes the input data to be in the range of o to 1.
#' (Input data.frame preferentially generated by colonyzer.read
#' or GC2QFA). The target values need to be stored in a column named Growth.
#' Different methods for normalization are provided. "All", the default methods,
#' normalizes all data within the data.frame. The method "Plate" normalizes per plate
#' indicated by a distinct "Barcode" within the data.frame. Method "Sample"
#' normalizes over each position indicated by Row and COlumn and distinct Barcode.
#' Method "ZeroShift" subtracts from each position its own minimal value to move the
#' starting values of the growth curve to zero. Method "Adjust" is useful if at one timepoint
#' an arbitrarily high value was recorded. This can happen if the light was considerably
#' higher at one timepoint due to experimental error. If this only happens infrequently,
#' the user can specify the AdjInd as the position of measurement to be replaced with
#' the value of AdjTo. Usually AdjTo should be AdjInd+/-1. Use cautiosly.
#' The last method "Reference" normalizes the given data but does not take the min and max values
#' of itself but the values MaxVal and MinVal provided by the user. This may be useful if comparing
#' datasets generated with slightly varying imaging settings.
#'
#'
#' @param path dta data.frame. This should contain data generated by either GC2QFA or colonyzer.read and a column
#' named Growth in which the growth data is stored. This column's data will be normalized.
#' @param  NormMode String. This indicates the method used for the normalisation. For explanations, see the text above.
#' Possible options: All, Plate, Sample, ZeroShift, Adjust, Reference.
#' @param AdjInd Numerical. Only necessary to specify for the Adjust method. Indicates the index of the
#' growth value to be replaced with the value at index AdjTo.
#' @param AdjTo Numerical. Only necesarry to specify for the Adjust method. Indicates the index to
#' take the value from to replace at index AdjInd.
#' @param MaxVal Numerical. Only necessary to specify for the Reference method. This
#' indicates the potential max value that could have been recorded.
#' @param MinVal Numerical. Only necessary to specify for the Reference method. This
#' indicates the potential min value that could have been recorded.
#'
qfa.normalize <- function(dta, NormMode = "All", AdjTo = NA, AdjInd = NA, MaxVal = NA, MinVal = NA) {
    # this function normalizes the growth data with a given method input variables: dta: growth dataframe (either from GC2QFA or colonyzer.read) NormMode: Either ='All'
    # for all values in dta, ='Plate' for normalizing over the whole plate, ='Sample' for normalizing per positiono/sample, or ='Adjust' for replacing index AdjInd
    # values with AdjTo values per sample/position, or ='ZeroShift' to subtract from each sample its own min(growth) value or ='Reference' for which the user spefies
    # MaxVal and MinVal that the specific setup the user has can output. This is useful if multiple runs on differing system need to be compared. Measure the same max
    # and min references (eg. a given strain, let it grow until saturation) and specify these here. Measured Growth values are normalized to lie between MinVal and
    # MaxVal to downshift all to zero. only suggested if no differences in initial value is expected AdjTo: Index of value to take to replace value in AdjInd (1=the
    # first in timeseries, 2 =second etc) AdjInd: The index of value to replace in timeseries per sample MinVal and MaxVal: For NormMode='Reference', userspecified max
    # and min possible values

    control <- dta
    control$Normalization = NA
    barcodes <- unique(control$Barcode)
    nbc <- length(barcodes)

    if (!"Growth" %in% names(control)) {
      stop("Please specify a column named Growth in the input data.frame which is used as target value.")
    }

    if (NormMode == "All") {
        # normlize all samples not regarding different plates
        control$Growth = ((control$Growth - min(control$Growth))/(max(control$Growth) - min(control$Growth)))
        control$Normalization = NormMode

    } else if (NormMode == "Plate") {
        # normalize all measurements between 0 and 1 on one plate
        for (bcode in barcodes) {
            dbc <- control[control$Barcode == bcode, ]
            dbc$Growth = ((dbc$Growth - min(dbc$Growth))/(max(dbc$Growth) - min(dbc$Growth)))
            dbc$Normalization = NormMode
            control[control$Barcode == bcode, ] = dbc
        }
    } else if (NormMode == "Sample") {
        # normalize each sample/well between 0 and 1
        for (bcode in barcodes) {
            # Get list of unique colony positions
            dbc <- control[control$Barcode == bcode, ]
            positions <- lapply(1:length(dbc[, 1]), index2pos, dbc)
            positions <- unique(positions)
            for (i in 1:length(positions)) {
                position <- positions[[i]]
                row <- position[1]
                col <- position[2]
                # print(paste(bcdata$Barcode[1],'Row:',row,'Col:',col))
                do <- dbc[(dbc$Row == row) & (dbc$Column == col), ]
                dbc$Growth[(dbc$Row == row) & (dbc$Column == col)] = ((do$Growth - min(do$Growth))/(max(do$Growth) - min(do$Growth)))
            }
            control[control$Barcode == bcode, ] = dbc
        }


    } else if (NormMode == "Adjust") {
        # copying value of AdjInd to AdjTo
        if (is.na(AdjTo) | is.na(AdjInd)) {
            stop("Please specify AdjTo and AdjInd")
        }

        for (bcode in barcodes) {
            # Get list of unique colony positions
            dbc <- control[control$Barcode == bcode, ]
            positions <- lapply(1:length(dbc[, 1]), index2pos, dbc)
            positions <- unique(positions)

            for (i in 1:length(positions)) {
                position <- positions[[i]]
                row <- position[1]
                col <- position[2]
                # print(paste(bcdata$Barcode[1],'Row:',row,'Col:',col))
                do <- dbc[(dbc$Row == row) & (dbc$Column == col), ]
                do$Growth[AdjInd] = do$Growth[AdjTo]
                control$Growth[(control$Row == row) & (control$Column == col)] = do$Growth
            }
        }

    } else if (NormMode == "ZeroShift") {
        for (bcode in barcodes) {
            # Get list of unique colony positions
            dbc <- control[control$Barcode == bcode, ]
            positions <- lapply(1:length(dbc[, 1]), index2pos, dbc)
            positions <- unique(positions)


            for (i in 1:length(positions)) {
                position <- positions[[i]]
                row <- position[1]
                col <- position[2]
                # print(paste(bcdata$Barcode[1],'Row:',row,'Col:',col))
                do <- dbc[(dbc$Row == row) & (dbc$Column == col), ]
                do$Growth = do$Growth - min(do$Growth)
                control$Growth[(control$Row == row) & (control$Column == col)] = do$Growth
            }
        }
    } else if (NormMode == "Reference") {
        if (is.na(MaxVal) | is.na(MinVal)) {
            stop("Please specify both MaxVal and MinVal")
        }
        control$Growth = ((control$Growth - MinVal)/(MaxVal - MinVal))
        control$Normalization = NormMode
        control$NormMaxValRef = MaxVal
        control$NormMinValRef = MinVal
    } else {
        # wrong insertion by user
        stop("Please specify NormMode correctly (see documentation)")
    }
    control$Normalization = NormMode
    return(control)
}




####Normalisation over plates function----
#'Normalising culture fitness by plate
#'
#'Sometimes estimated culture fitnesses vary systematically depending on the plate on which they are inoculated. Agar in individual
#'plates could come from different batches, and therefore have slightly different levels of nutrients or water. Plates could be inoculated
#'at different times, and stored at slightly different temperatures for example. Depending on inoculation method, inoculation time specified
#'may be less accurate for individual plates. Any of these issues could effect simulated fitness slightly. This function allows us to normalise
#'culture fitnessses across plates to eliminate such effects. It should only really be used for small differences. In order to preserve real
#'signal, the experimental sources of larger differences should be corrected before analysis instead of by normalising them away.
#'This normalisation function only works if there are more than one plate in the input data.frame as (of course) otherwise there is no
#'normalisation between plates possible.
#'
#'Starting with a data frame describing the output from the qfa.fit function (with optional added columns from the makeFitness function)
#'normalisePlates finds all unique groups in that data frame, calculates a median value from the indicated column for all plates in a given group
#'and then normalises the fitnesses of each culture on each plate so that the median fitness on each plate is equal to the median fitness for all
#'plates in the group. The function returns a vector which can be added to the original data frame or used to over-write the original, raw data.
#'
#'Typically a "group" will be a description of treatment (e.g. temperature) or of growth medium (e.g. drug added to solid agar) or differenet strains.
#'
#'@param d data.frame. Output from qfa.fit2 for normalization
#'@param column String. name of column to normalise (typically the fitness measure or model fit value of interest). See qfa.fit2 and
#'makeFitness2 help files for descriptions of available culture fitness measures
#'@param groupcol String. Name of column labelling group membership of each plate/barcode. Typically this will be the Treatment
#'or Medium column (or some combination of both if there are more than one treatment and more than one medium within the dataframe d)
#'@param fmin Epsilon (very small number greater than zero). Do not scale by dividing fitnesses by a plate-wide median if median is less than fmin
#'
#'@return Vector. The output vector contains the normalised values of the input column. This can be added to the input data.frame or replace the
#'original values.
#'
#' @keywords qfa.fit2
#' @examples
#' data(qfa.testdata)
#' #Strip non-experimental edge cultures
#' qfa.testdata = qfa.testdata[(qfa.testdata$Row!=1) & (qfa.testdata$Col!=1) & (qfa.testdata$Row!=8) & (qfa.testdata$Col!=12),]
#' # Define which measure of cell density to use
#' qfa.testdata$Growth = qfa.testdata$Intensity
#' GmpFit = qfa.fit2(qfa.testdata, inocguess=NULL, detectThresh=0, globalOpt=F, AUCLim=NA, TimeFormat="h", Model="Gmp")
#' # Construct fitness measures
#' GmpFit = makeFitness2(GmpFit, AUCLim=NA, plotFitness="no", filename = NA)
#' #create a mock data.frame with slightly increased MDR values
#' GmpFit_m = GmpFit
#' GmpFit_m$MDR = GmpFit_m$MDR + rnorm(length(GmpFit_m$MDR), mean = 0.3, sd = 0.1)
#' #change barcode of mock data.frame and combine into one
#' GmpFit_m$Barcode = "Mock"
#' GmpFit_m = rbind(GmpFit, GmpFit_m)
#' #Add the normalised values of MDR, indicate that the different treatments are stored within ORF column
#' GmpFit_m$MDRnorm = normalisePlates(GmpFit_m, "MDR", "ORF")
#' #create plots
#' qfaFitnessPlot(GmpFit_m, plotFitness = "MDR")
normalisePlates = function(d, column, groupcol = "Treatment", fmin = 1e-06) {
  d[, groupcol] = as.character(d[, groupcol])
  d[, column] = as.numeric(d[, column])
  d$Barcode = as.character(d$Barcode)

  if (length(unique(d$Barcode))==1) {
    stop("There is only one plate/barcode in the input data.frame. Normalisation meaningless.")
  }

  for (grouplabel in sort(unique(d[, groupcol]))) {
    print(paste("Normalising plates by group", grouplabel))
    med = median(d[d[, groupcol] == grouplabel, column])
    for (b in unique(d$Barcode[d[, groupcol] == grouplabel])) {
      datlst = as.numeric(d[(d$Barcode == b) & (d[, groupcol] == grouplabel), column])
      p.med = median(datlst)
      if (p.med >= fmin)
        datlst = datlst * med/p.med  # Avoid division by zero median fitness...
      d[(d$Barcode == b) & (d[, groupcol] == grouplabel), column] = datlst
    }
  }
  return(as.numeric(d[, column]))
}



## edge culture cutting ----

qfa.edgecultures = function(d) {
  if (! "Column" %in% names(d)) d$Column = d$Col
  bcd = unique(d$Barcode)

  dmod = NA
  for (i in 1:length(bcd)) {
    maxR = max(d$Row[d$Barcode == bcd[i]], na.rm=T)
    minR = min(d$Row[d$Barcode == bcd[i]], na.rm=T)
    maxC = max(d$Column[d$Barcode == bcd[i]], na.rm=T)
    minC = min(d$Column[d$Barcode == bcd[i]], na.rm=T)
    if (i == 1) {
      dmod = d[((d$Row!=minR)&(d$Column!=minC)&(d$Row!=maxR)&(d$Column!=maxC)) & d$Barcode==bcd[i],]
    }else{
      dmod2 = d[((d$Row!=minR)&(d$Column!=minC)&(d$Row!=maxR)&(d$Column!=maxC)) & d$Barcode==bcd[i],]
      dmod = rbind(dmod, dmod2)
    }
  }
  return(dmod)
}
JulBaer/baQFA documentation built on Feb. 19, 2023, 10:32 p.m.