R/extract_lxb_data.R

Defines functions join pbox bootstrapping extractExperimentalSetup readBeads processReadout writeMIDAS writeMIDASfile analyseBeadDistributions sortBeads

Documented in analyseBeadDistributions bootstrapping extractExperimentalSetup processReadout readBeads sortBeads writeMIDAS writeMIDASfile

#' @import lxb

# join function to create a string from a list of string
join = function(string_list, sep=" ") {
    first=TRUE
    for (element in string_list) {
        if (element != "") {
            if(first) {
                full_string = element;
                first = FALSE;
            } else {
                full_string = paste(full_string, element, sep=sep)
            }
        }
    }
    return(full_string)
}
# Personnalized boxplot
pbox = function(data, main="", sub="", xlab, ylab) {
    boxplot(data, main=main, sub=sub, xlab=xlab, ylab=ylab, range=0)
}

#' Bootstrap function
#'
#' Converging boostrap, with a minimum number of run to ensure some initial variation
#' @param func Function to use to aggregate the data
#' @param error Error rate of the confidence interval to report
#' @param min_run Minimum number of bootstrap steps to take
#' @param max_run Maximum number of bootstrap steps to take
#' @param ESP Convergence limit
#' @return a vector or length 3 with the mean, and the 'error' confidence interval of this mean. 5\% by default (0.025 percentile on each side)
#' @export
bootstrapping <- function(values, func=median, error=0.05, min_run=100, max_run=1000*1000, EPS=0.001) {
    bootstrap = c()
    for (i in 1:min_run) {
        med = func(sample(values, replace=T))
        bootstrap = c(bootstrap, med)
    }

    old_mean = mean(bootstrap)
    new_mean = old_mean + EPS
    while(abs(old_mean - new_mean) >= EPS && length(bootstrap) < max_run)
    {
        #for (i in 1:min_run) {
            #med = func(sample(values, replace=T))
            #bootstrap = c(bootstrap, med)
        #}
        bootstrap = c(bootstrap, sapply(1:min_run, function(X) { func(sample(values, replace=T)) }))
        
        old_mean = new_mean
        new_mean = mean(bootstrap)
    }
    bootstrap = sort(bootstrap)

    return(c(mean(bootstrap), bootstrap[floor(error/2 * length(bootstrap))], bootstrap[ceiling((1-error/2) * length(bootstrap))]))
}

#' Get the association between wells and treatments
#' @param plate An array describing the organisation of the plate. The empty string "" indicates that no beads were present and that informations should not be extracted from the well.
#' @export
extractExperimentalSetup <- function(plate) {
    treatments = list()
    for (row in rownames(plate)) {
        for (col in colnames(plate)) {
            well = paste0(row, col)
            treatments[[well]] = plate[row, col]
        }
    }
    wells_per_treatment = list()
    blanks = c(); controls = c(); externals=c()
    for (well in names(treatments)) {
        if (grepl("[bB]lank", treatments[[well]])) { # Blank wells
            blanks = c(blanks, well)
        } else if (grepl( "control", tolower(treatments[[well]]) )) {
            controls = c(controls, well)
        } else if (length(treatments[[well]]) == 0 || grepl("NULL|empty", treatments[[well]]) || treatments[[well]]=="") {
            externals = c(externals, well)
        } else {
            wells_per_treatment[[ treatments[[well]] ]] = c(wells_per_treatment[[ treatments[[well]] ]], well)
        }
    }
    wells_per_treatment_with_blank = wells_per_treatment;
    wells_per_treatment_with_blank[["blank"]] = blanks;
# Extract stimulators and perturbators
    perturbators = unique(unlist(strsplit(unique(unlist(treatments)), "\\+")))
    perturbators = perturbators[perturbators!="control" & perturbators!="blank"]
    stimulators = perturbators[!grepl("i$", perturbators)]
    inhibitors = perturbators[grepl("i$", perturbators)]

    return(list(treatments=treatments, wptb=wells_per_treatment_with_blank, wpt=wells_per_treatment, stimulators=stimulators, inhibitors=inhibitors, controls=controls, blanks=blanks, externals=externals))
}

#' Process the bead dataset
#'
#' @param lxb_dataset The lxb dataset
#' @param region Beads IDs used
#' @param analyte Analyte corresponding to the analysis.
#' @param controls Wells containing control conditions.
#' @param blanks Wells containing no cell.
#' @param wells_per_treatment_with_blank List of wells where each treatment is applied. It is a list where each entry is named after a treatment and contains a vector of well names.
#' @param externals Wells to exclude from the processing
#' @param PLOT Whether data should be plotted
#' @export
readBeads <- function(lxb_dataset, region, analyte, controls, blanks, wells_per_treatment_with_blank, plot_dir="", externals=c(""), PLOT=FALSE, verbose=TRUE, min_bead_number=15) {
    wells_per_treatment = wells_per_treatment_with_blank
    wells_per_treatment[["blank"]] = NULL # Suppress this entry
    # Collection of samples caracteristics
    cell_data = c()
    cell_variation = c()
    # Local collection of the histogram data
    normality = c()
    dist_beads = c()
    invalids = c()
    #wells_variation = data.frame(row.names=names(lxb_dataset))
    wells_variation = c()
    for (bead in region) {
        antibody = analyte[region == bead]
        dist_beads[[antibody]] = list()
        bead_carac = c(); # Value of the antibody in each well
        bead_blank = c(); # Value of the blank for this antibody
        bead_control = c(); # Value of the control for this antibody
        median_cv = c(); # Collect the sample CV from the bootstrap
        if(PLOT) { pdf(paste(plot_dir, gsub("/", "-", gsub(" ", "_", antibody)), "_", basename(plot_dir), ".pdf", sep="")) }
        for (well in names(lxb_dataset)) {
            if (!(well %in% externals)) {
                # Extract the information (bootstrapped median + error)
                processing = processReadout(lxb_dataset, bead, well, PLOT)
                if (length(processing$values) <  min_bead_number) { # If there are no beads, do not use the data
                    #warning(paste0("Well ", well, " has 0 beads of type ", analyte[which(region==bead)]))
                    invalids = rbind(invalids, c(well, analyte[which(region==bead)]))
                }
                # Gather the information with a convenient indexing
                dist_beads[[antibody]][[well]] = processing$values
                normality = c(normality, processing$shapiro)
                # Bead distribution, per cell line and per antibody
                bead_carac[well] = processing$bootstrap[1]
                median_cv[well] = processing$cv
            }
        }
        if (PLOT) { dev.off() }

        variations = computeVariation(wells_per_treatment_with_blank, bead_carac, dist_beads[[antibody]], median_cv, blanks, controls)
        wells_variation = cbind(wells_variation, median_cv)
        cell_variation = cbind(cell_variation, variations$cv)
        cell_data = cbind(cell_data, bead_carac)
        if (verbose) {
            message(paste(antibody, "processed..."))
        }
    }
    colnames(wells_variation) = analyte
    colnames(cell_variation) = analyte
    colnames(cell_data) = analyte
    if (length(invalids) > 0) { warning(paste( "Some wells do not have enough valid beads for some readouts:", paste0(unique(invalids[,1]), collapse=", ") )) }
    return(list(datas=cell_data, cv=cell_variation, controls=controls, blanks=blanks, wpt=wells_per_treatment, invalid_wells=invalids, normality=normality, wells_variation=wells_variation))
}

#' Extract data for a bead number in a well from an lxb dataset
#' 
#' Select valid data for the specified bead of the specified well and perform a bootstrap to extract a robust mean an the variation
#' @param dataset An full lxb dataset
#' @param bead The bead number
#' @param well The well identifier
#' @export 
processReadout <- function(dataset, bead, well, PLOT=FALSE) {
    data = dataset[[well]]

    # Select valid measurements for the selected bead and get rid of outlayers for the plot
    # CL = Classification channel and RP = Readout value
    valids = which(data[,"Bead Valid"]==1 & data[,"Bead ID"]==bead & data[, "RP1S Valid"]==1 & data[, "RP1L Valid"]==1 & data[, "CL1 Valid"]==1 & data[, "CL2 Valid"]==1 )
    values = data[valids, "RP1 Value"]
    unfiltered = values
    values = values[values > mean(values)/10] # Remove beads without signal (see the analysis of SKNAS_plex_panel_300818 for details on the choice of threshold)
    if (length(values) < 3) { # 3 values are required for the shapiro test
        return(list(values=values, shapiro=c(), cv=0, bootstrap=c(NA, NA, NA)))
    }
    # Bootstrap the median on all the data
    bst = bootstrapping(values, error=0.05)
    #run2 = c(run2, bootstrapping(data[valids, "RP1 Value"])[1])
    median_cv = (bst[3] - bst[2]) / (bst[1] * 2 * 1.96) # 2 * 1.96 to have one sd under normality assumption
    # Plot the histogram for each bead in the well with a normal density on top of it
    if(PLOT) {# False for quick computation
        plot_values = values[values < median(values) + 3*sd(values)] # Remove high values outliers so the plot stays readable
        unfiltered = unfiltered[unfiltered < median(values) + 3*sd(values)]
        par(col="black")
        hist(unfiltered, 1 + round(length(plot_values)/5), prob=TRUE, col=rgb(0.5, 0.5, 0.5, 0.5), main=paste(analyte[which(region==bead)], "for well", well))
        hist(plot_values, 1 + round(length(plot_values)/5), col=rgb(0.2, 0.7, 0.2, 0.5), prob=TRUE, add=TRUE)

        med = median(plot_values);
        dev = sd(plot_values)
        range = med + 4*dev;
        nb_points = 10 * 1000;
        par(col="red")
        lines(seq(0, range, range/nb_points), dnorm(seq(0, range, range/nb_points), mean=med, sd=dev))
        # Bootstrap results
        lines(rep(bst[1], 101), 0:100/1000, col="blue", lty=2, lwd=1.5)
        lines(rep(bst[2], 101), 0:100/1000, col="blue", lty=2, lwd=0.7)
        lines(rep(bst[3], 101), 0:100/1000, col="blue", lty=2, lwd=0.7)
        mtext(paste(length(values), "beads selected (out of ", length(unfiltered), ")"), 3)
    }
    # Normality test, collection per cell line and per antibody
    normal = tryCatch(shapiro.test(values)$statistic, error=function(X) {return(1)})

    return(list(values=values, shapiro=normal, cv=median_cv, bootstrap=bst))
}

#' Compute the coefficient of variation for one readout
#'
#' Compute the coefficient of variation for one readout taking into account the variation of the for each well and the variation between samples
#' @param wells_per_treatment List of vectors, each entry of the list is named after a treatment and contains a list of wells coordinates
#' @param bead_carac Values for all the wells for one bead
#' @param wilcox_sample List of the distribution of values of the bead in each well
#' @export
computeVariation <- function (wells_per_treatment, bead_carac, wilcox_sample, median_cv, blanks, controls, default_cv=0.3, min_cv=0.1) {
    # Get CVs from replicated conditions that are not too close to blank (i.e where CV != noise) to have a global noise level for the readout
    # Without blank, we don't know the background noise level so we collect all
    replicates_cv = c()
    for (wells in wells_per_treatment) {
        collect = c()
        for (well in wells) {
            if (is.na(bead_carac[well])) {
                warning(paste("No beads in well ", well))
            } else if (length(blanks) == 0 || bead_carac[well] > 2 * mean(bead_carac[blanks], na.rm=TRUE)) {
                collect = c(collect, well)
            }
        }
        if (length(collect) > 1) {
            replicates_cv = c( replicates_cv, sd(bead_carac[wells]) / mean(bead_carac[wells]) )
        }
    }
    # Compute the global CV, which is the average of the local ones
    #global_cv = exp(mean(log(replicates_cv[!is.na(replicates_cv)]))) # Geometric mean
    global_cv = mean(replicates_cv, na.rm=TRUE)
    if (is.nan(global_cv) | is.na(global_cv)) { global_cv = default_cv }
    else if (global_cv < min_cv) { global_cv = min_cv }

    # Compute variation for each readout, plus the controls and the blank
    bead_variation = c()
    pvalues = list()
    all_wells = wells_per_treatment
    all_wells[["blank"]] = blanks
    all_wells[["control"]] = controls
    for (treatment in names(all_wells)) {
        wells = all_wells[[treatment]]

        # Wilcox test on replicates
        pvalues[[treatment]] = c()
        if (length(wells) > 1) {
            for ( i in 1:(length(wells)-1) ) {
                for (j in (i+1):length(wells)) {
                    if (length(wilcox_sample[[wells[i]]])!=0 && length(wilcox_sample[[wells[j]]])!=0) {
                        pvalues[[treatment]][paste0(i, "_", j)] = suppressWarnings(wilcox.test(wilcox_sample[[wells[i]]], wilcox_sample[[wells[j]]] + sample(c(-1, 1))/10000, alternative="two.sided")$p.value) # Wilcox test with tiny noise to avoid ties
                    #} else {
                        #pvalues[[treatment]][paste0(i, "_", j)] = 1
                    }
                    #wilcox_antibody[[analyte[which(region==bead)] ]] = c(wilcox_antibody[[ analyte[which(region==bead)] ]], pvalue )
                    #wilcox_cell_line[[lname]] = c(wilcox_cell_line[[lname]], pvalue)
                }
            }
        }

        # Compare the replicate CV with the boostrap CVs and collect the biggest
        square_sum = c()
        for (well in wells) {
            if (!is.na(median_cv[well]) && global_cv < median_cv[well]) {
                square_sum = c(square_sum, median_cv[well]^2 * bead_carac[well]^2)
            } else {
                square_sum = c(square_sum, global_cv^2 * bead_carac[well]^2)
            }
        }
        # Take the standard error of the mean and convert it back to a cv (a.k.a cv of the mean)
        for (well in wells) {
            bead_variation[well] = sqrt(sum(square_sum)/length(wells)) / mean(bead_carac[wells])
        }
    }
    return(list(wilcox=pvalues, cv=bead_variation))
}

#' Write lxb data in MIDAS format
#'
#' Write data extracted and processed using readBeads in MIDAS format
#' @param beads Beads data
#' @export
writeMIDAS <- function(beads, dname, inhibitors, stimulators, midas_dir="midas_data", others=c()) {
    writeMIDASfile(beads$datas, beads$cv, dname, beads$blanks, beads$controls, beads$wpt, inhibitors, stimulators, midas_dir, others)
}

#' Write the data in a file in MIDAS format
#'
#' @param datas The values measured. Should a matrix with the wells as rownames and the analyte as column names
#' @param variations The coefficient of variation from the replicates
#' @param dname The name of the data
#' @export
writeMIDASfile <- function(datas, variations, dname, blanks, controls, wells_per_treatment, inhibitors, stimulators, midas_dir="midas_data", others=c()) {
    #TODO: Other informations
    perturbators = c(stimulators, inhibitors)
    if (nchar(midas_dir) == 0) { midas_dir="midas_data" }
    if (!grepl("/$", midas_dir)) { midas_dir=paste0(midas_dir, "/") }
    suppressWarnings(dir.create(midas_dir))
    handle = file(paste0(midas_dir, "blunt_", dname, "_MIDAS.csv", sep=""), open="w")
    vhandle = file(paste0(midas_dir, "blunt_", dname, "_MIDAS.var", sep=""), open="w")
    # First line with headers, remove previous file content
    new_line = "ID:type,ID:well"
    for (treatment in perturbators) {
        new_line = paste(new_line, ",TR:", treatment, sep="")
    }
    new_line = paste(new_line, ",DA:ALL", sep="")
    for (antibody in colnames(datas)) {
        new_line = paste(new_line, ",DV:", antibody, sep="")
    }
    writeLines(new_line, handle)
    writeLines(new_line, vhandle)
    # Blanks
    for (well in blanks) {
        new_line = paste("blank", well, sep=",")
        for (treatment in perturbators) { # TR fields
            new_line = paste(new_line, ",0", sep="")
        }
        new_line = paste(new_line, ",0", sep="") # DA field
        var_line = new_line
        for (antibody in colnames(datas)) { # DV fields
            new_line = paste(new_line, round(datas[well, antibody]), sep=",")
            var_line = paste(var_line, signif(variations[well, antibody], 3), sep=",")
        }
        write(new_line, handle, sep="\n", append=TRUE)
        write(var_line, vhandle, sep="\n", append=TRUE)
    }
    # Controls
    for (well in controls) {
        new_line = paste("c", well, sep=",")
        for (treatment in perturbators) { # TR fields
            new_line = paste(new_line, ",0", sep="")
        }
        new_line = paste(new_line, ",0", sep="") # DA field
        var_line = new_line
        for (antibody in colnames(datas)) { # DV field
            new_line = paste(new_line, round(datas[well, antibody]), sep=",")
            var_line = paste(var_line, signif(variations[well, antibody], 3), sep=",")
        }
        write(new_line, handle, sep="\n", append=TRUE)
        write(var_line, vhandle, sep="\n", append=TRUE)
    }
    # Samples
    for (treatment in names(wells_per_treatment)) {
        if (!(wells_per_treatment[[treatment]] %in% controls || wells_per_treatment[[treatment]] %in% blanks)) {
            # Identify which stimulus and inhibition are used
            stim = stimulators[stimulators %in% unlist(strsplit(treatment, "\\+"))]
            inhib = inhibitors[inhibitors %in% unlist(strsplit(treatment, "\\+"))]
            # Put the perturbation in the MIDAS format
            init_line = "t,WELL"
            for (perturbation in perturbators) {
                if (perturbation %in% c(inhib, stim)) {
                    init_line = paste(init_line, "1", sep=",")
                } else {
                    init_line = paste(init_line, "0", sep=",")
                }
            }
            init_line = paste(init_line, "0", sep=",") # DA field

            for (well in wells_per_treatment[[treatment]]) {
                new_line = gsub("WELL", well, init_line)
                var_line = new_line;
                for (antibody in colnames(datas)) { # DV fields
                    new_line = paste(new_line, round(datas[well, antibody]), sep=",")
                    var_line = paste(var_line, signif(variations[well, antibody], 3), sep=",")
                }         
                write(new_line, handle, sep="\n", append=TRUE)
                write(var_line, vhandle, sep="\n", append=TRUE)
            }
        }
    }

    close(handle)
    close(vhandle)
}

#' Plot the distribution of beads in each well
#'
#' @param region Number of the beads to analyse
#' @param analyte Name of the analytes corresponding to the bead number of region
#' @param tpw List of the treatments applied to each well (names(tpw) is the name of the well)
#' @param bn_limit Bead number lower limit to plot the histograms
#' @param good_count Maximum ordinate of the bead count density
#' @export
analyseBeadDistributions <- function(lxb_dataset, region, analyte="", tpw=list(), bn_limit=20, good_count=300) {
    if (all(analyte == "")) { analyte == region }
    if (length(analyte) != length(region)) { warning("Length of 'analyte' and 'region' differ, setting 'analyte' to 'region'"); analyte=region }
    cum_beads = list()
    for (well in names(lxb_dataset)) {
        wdata = lxb_dataset[[well]]
        if (is.null(wdata)) {
            warning(paste0("Well ", well, " does not contain any bead"))
        } else {
            hist_data = lapply( region, function(X) { dd=wdata[wdata[,"Bead ID"]==X,"RP1 Value"]; if (length(dd)==0) {return(0)}; return(dd) })
            names(hist_data) = analyte
            plot(c(-1, 18), rep(0, 2), type="l", col="grey", xlab="log2 Value", main=paste0("Value for well ", well, ifelse(well%in%names(tpw), paste0(" (", tpw[[well]], ")"), "")), xlim=c(0, 17), ylim=c(0, 1.2), ylab="Density")
            tmp=sapply(analyte, function(X) {
                            if (length(hist_data[[X]]) > bn_limit) {
                                lines(density(log2(hist_data[[X]])), col=which(analyte==X))
                            }
                        })
            nbeads = sapply(hist_data, function(X) { ll=length(X); ifelse(is.null(ll), 0, ll)  })
            names(nbeads) = analyte
            cum_beads[[well]] = nbeads
            legend(13, 1.2, paste0(analyte, " (#", nbeads, ")"), col=1:length(analyte), lwd=1)
        }
    }
    cum_beads = as.data.frame(cum_beads)
    cum_beads[cum_beads>300] = 300
    plot(c(0, good_count + 20), rep(0, 2), type="l", col="grey", xlab="Bead count", ylab="Density", main="Bead number distribution", xlim=c(0, good_count + 10), ylim=c(0, 30/good_count))
    tmp=sapply(analyte, function(X) { lines(density(as.numeric(cum_beads[X,]), n=2048), col=which(analyte==X)) })
    legend(0.8*good_count, 30/good_count, analyte, col=1:length(analyte), lwd=1)
}

#' Extract values from the lxb dataset
#'
#' Extract values from the lxb dataset in a nicely formated list
#' @param dataset An full lxb dataset
#' @param region Beads IDs used
#' @param analyte Analyte corresponding to the analysis.
#' @param wells_per_treatment_with_blank List of wells where each treatment is applied. It is a list where each entry is named after a treatment and contains a vector of well names.
#' @export
sortBeads <- function(lxb_dataset, wells_per_treatment_with_blank, region, analyte) {
    output = list()
    for (well in unlist(wells_per_treatment_with_blank)) {
        data = lxb_dataset[[well]]
        output[[well]] = list()
        for (bnb in 1:length(region)) {
            valids = which(data[,"Bead Valid"]==1 & data[,"Bead ID"]==region[bnb] & data[, "RP1S Valid"]==1 & data[, "RP1L Valid"]==1 & data[, "CL1 Valid"]==1 & data[, "CL2 Valid"]==1 )
            output[[well]][[analyte[bnb]]] = data[valids,"RP1 Value"]
        }
    }
    return(output)
}
MathurinD/extractlxb documentation built on Feb. 25, 2020, 8:41 p.m.