R/HTDoseResponseCurve.R

Defines functions timecourse_AUC_ratio timecourse_relative_grid convert_to_foldchange fit_statistics_matrixes fit_statistics summary.HT_fit fit_DRC_synergy fit_DRC convert_dataset_for_synergy_DRC metric_to_grid combine_data_and_map create_synergy_dataset create_dataset normalize_by_vehicle prepare_for_normalization get_hours get_concentrations get_treatments get_sample_types create_empty_plate_map create_empty_plate AUC.HT_fit AUC AUC_across_timepoints subset_treatments HT_fit plate_dimensions_from_wells

Documented in combine_data_and_map convert_to_foldchange create_dataset create_empty_plate create_empty_plate_map create_synergy_dataset fit_DRC fit_statistics fit_statistics_matrixes get_concentrations get_hours get_sample_types get_treatments metric_to_grid summary.HT_fit timecourse_AUC_ratio timecourse_relative_grid

plate_dimensions_from_wells = function(number_of_wells){
    if( number_of_wells != 6 & number_of_wells != 24 & 
        number_of_wells != 96 & number_of_wells != 384){
        stop("number_of_wells must be 6, 24, 96, or 384")   
    }
    if( number_of_wells == 384 ){
        list( rows=16, cols=24 )
    }else if( number_of_wells == 96 ){
        list( rows=8, cols=12 )
    }else if( number_of_wells == 24 ){
        list( rows=4, cols=6 )
    }else if( number_of_wells == 12 ){
        list( rows=3, cols=4 )
    }else if( number_of_wells == 6 ){
        list( rows=2, cols=3 )
    }
}

#-------------------------------------------------------------------------------

# Function constructor for FIT object
#
HT_fit = function(...){
    FIT=list()
    FIT[[ "unique_conditions" ]] = NA
    FIT[[ "is_fitted" ]] = FALSE
    FIT[[ "sample_types" ]] = NA
    FIT[[ "treatments" ]] = NA
    FIT[[ "fit_stats" ]] = NA
    FIT[[ "input" ]] = NA
    FIT[[ "model" ]] = NA
    FIT[[ "ANOVA_F_test" ]] = NA
    FIT[[ "ANOVA_P_value" ]] = NA
    attr(FIT, "class") = "HT_fit"
    class(FIT) = "HT_fit"
    return( FIT )
}

# If two drugs or lines are on different plates, they should be fit using 
# the vehicle concentrations from their own plate as the 0 concentration.    
# for each unique sample_type + treatment, find the appropriate vehicle
#
subset_treatments = function( D, sample_types, treatments, hours, 
                              concentration_columns, treatment_columns ){
    idx = c()
    conditions_to_fit = c()
    concentrations = c()
    
    for(i in 1:length(sample_types)){
        for(j in 1:length(treatments)){
            #idx_D index of this {treatment, sample, hour} 
            if( treatment_columns[j]=="treatment" ){
                idx_D =  which(D$treatment == treatments[j] & 
                           D$sample_type == sample_types[i] & 
                           D$hours %in% hours )
            }else{
                idx_D =  which(D$treatment_2 == treatments[j] & 
                           D$sample_type == sample_types[i] & 
                           D$hours %in% hours )                
            }
            if( length(idx_D)==0 ){
                stop( paste("Requested combination of treatment=",
                             treatments[j], "sample_type =", sample_types[j], 
                              "hour =", hours, "not found in D" ))
            }
            cur_plate_ids = unique( D$plate_id[idx_D] )
            vehicle_names = unique( D$negative_control[ idx_D ] )
            if( treatment_columns[j]=="treatment" ){
                idx_V = which(D$treatment %in% vehicle_names & 
                          D$sample_type == sample_types[i] & 
                          D$hours %in% hours & 
                          D$plate_id %in% cur_plate_ids &
                          D$is_negative_control )
            }else{
                idx_V = which(D$treatment_2 %in% vehicle_names & 
                          D$sample_type == sample_types[i] & 
                          D$hours %in% hours & 
                          D$plate_id %in% cur_plate_ids &
                          D$is_negative_control_2 )
            }
            idx = c(idx, idx_D, idx_V)
            new_condition =rep( paste(sample_types[i], treatments[j],sep="_|_"), 
                                 length(idx_D)+length(idx_V))
            conditions_to_fit = c(conditions_to_fit, new_condition)
            if( concentration_columns[j]=="concentration" ){
                concentrations = c( concentrations, D$concentration[idx_D], 
                                    D$concentration[idx_V] )
            }else{
                concentrations = c( concentrations, D$concentration_2[idx_D],
                                    D$concentration_2[idx_V] )
            }
        }
    }
    res=data.frame( value =             D$value_normalized[ idx ], 
                    concentration =     concentrations,
                    sample_type =       D$sample_type[idx], 
                    conditions_to_fit = conditions_to_fit, 
                    stringsAsFactors=FALSE )
    res
}


# Calculate area under all values in D for a single value of sample_type,
# treatment, and concentration using pracma:trapz. Agnostic to normalization.
#' @import pracma
#' @import plyr
#' @importFrom stats median
AUC_across_timepoints = function( D, sample_type, treatment, concentration, 
                                  summary_method="mean"){
    ds_type = is_dataset_valid(D)
    if( summary_method != "mean" & summary_method != "median"){
        stop("summary_method parameter must be either mean or median")
    }
    DA = D[D$treatment==treatment & 
               D$concentration==concentration & 
               D$sample_type==sample_type,]
    Dsum = plyr::ddply( DA, c("hours"), 
                        function(x){ data.frame( 
                            mu=mean(x$value, na.rm=TRUE), 
                            med=stats::median(x$value, na.rm=TRUE) ) }
    )
    Dsum = Dsum[order(Dsum$hours),]
    if( summary_method=="mean"){
        values=Dsum$mu
    }
    else{
        values = Dsum$med
    }
    pracma::trapz( Dsum$hours, values  )
}

# Calculate AUC using trapezoids method; works either for curves fit with drc()
# or for point estimates when curve fitting fails.
#
AUC = function(FIT, granularity=0.01){
    if(is.null(attr(FIT, "class"))){ stop("Must be called on a class") }
    else{ UseMethod("AUC") }
}

#' @importFrom stats predict coefficients
AUC.HT_fit = function(FIT, granularity=0.01, summary_method="mean"){
    obs_min = rep(NA, length(FIT$unique_conditions))
    obs_max = rep(NA, length(FIT$unique_conditions))
    coef_b = rep(NA, length(FIT$unique_conditions))
    coef_c = rep(NA, length(FIT$unique_conditions))
    coef_d = rep(NA, length(FIT$unique_conditions))
    coef_e = rep(NA, length(FIT$unique_conditions))
    SF50 = rep(NA, length(FIT$unique_conditions))
    auc = rep(NA, length(FIT$unique_conditions))
    
    # Summarize
    Mstat = plyr::ddply( FIT$input, c("conditions_to_fit", "concentration"), 
                         function(x){ data.frame( 
                             mu=mean(x$value, na.rm=TRUE), 
                             med=stats::median(x$value, na.rm=TRUE) ) } )
    if(summary_method=="mean"){
        Mstat$value = Mstat$mu
    }else{
        Mstat$value = Mstat$med
    }
    
    # Record min/max observed summary data
    for( i in 1:length(FIT$unique_conditions) ){
        observed = Mstat$value[ Mstat$conditions_to_fit==
                                        FIT$unique_conditions[i] ]
        obs_min[i] = min(observed, na.rm=-TRUE)
        obs_max[i] = max(observed, na.rm=-TRUE)
    }
    
    if( !FIT$is_fitted ){    
        # If a curve was not fitted, use the summarized observed values 
        # to calculate AUC but do not try to infer other values and do not 
        # estimate EC50
        for(i in 1:length(FIT$unique_conditions)){
            cur_condition=FIT$unique_conditions[i]
            Mcur=Mstat[ Mstat$conditions_to_fit==cur_condition,]
            Mcur = Mcur[order(Mcur$concentration),]
            auc[i] = pracma::trapz(10^(Mcur$concentration), Mcur$value)
        }
    }else{
        max_concentration = max(FIT$input$concentration, na.rm=TRUE) 
        # extremely high conc, in excess of 20uM, will slow us to a crawl
        # Use reduced granualarity of fit for highest values to compensate.
        if( max_concentration>2e4 ){
            conc_to_fit= seq(from=0, to=2e4, by=granularity )
            conc_to_fit=c(conc_to_fit,seq(from=2e4, to=max_concentration, by=1))
        }
        else{
            conc_to_fit= seq(from=0, to=max_concentration, by=granularity )
        }    
        conc_test = rep(conc_to_fit, length(FIT$unique_conditions) )
        cond_test = c()
        for(i in 1:length(FIT$unique_conditions)){
            cond_test = c( cond_test, 
                           rep(FIT$unique_conditions[i],length(conc_to_fit)))
        }
        model_input_test = data.frame( concentration=conc_test, 
                                       conditions_to_fit=factor(cond_test),
                                       stringsAsFactors=FALSE ) 
        preds = try( stats::predict(FIT$model, model_input_test), silent=TRUE )
        
        if( !"try-error" %in% class( preds ) ){
            model_input_test$ys=preds
            for( i in 1:length(FIT$unique_conditions) ){
                cur_cond=FIT$unique_conditions[i]
                
                idxs=which( model_input_test$ys<0.5 & 
                            model_input_test$conditions_to_fit==cur_cond)
                if( length(idxs)>0 ){
                    idx_50 = min( idxs )
                    SF50[i]=model_input_test$concentration[ idx_50 ]
                }
                idx_first = min( which(cond_test==cur_cond ) )
                idx_last =  max( which(cond_test==cur_cond ) )
                ys = preds[ idx_first : idx_last ]
                ys[ys<0]=0 # No negative values allowed for AUC
                nn=names(stats::coefficients(FIT$model))
                idx_b = which(nn=="b:(Intercept)")
                idx_c = which(nn=="c:(Intercept)")
                idx_d = which(nn=="d:(Intercept)")
                idx_e = which(nn=="e:(Intercept)")
                if(length(idx_b)==0 ){
                    idx_b = which(nn==paste("b",cur_cond,sep=":"))
                }
                if(length(idx_c)==0 ){
                    idx_c = which(nn==paste("c",cur_cond,sep=":"))
                }
                if(length(idx_d)==0 ){
                    idx_d = which(nn==paste("d",cur_cond,sep=":"))
                }
                if(length(idx_e)==0 ){
                    idx_e = which(nn==paste("e",cur_cond,sep=":"))
                }
                
                if(length(idx_b)>0){
                    coef_b[i] = as.numeric(stats::coefficients(FIT$model)[idx_b])
                }
                if(length(idx_c)>0){
                    coef_c[i] = as.numeric(stats::coefficients(FIT$model)[idx_c])
                }else{
                    coef_c[i] = 0
                }
                if(length(idx_d)>0){
                    coef_d[i] = as.numeric(stats::coefficients(FIT$model)[idx_d])
                }else{
                    coef_d[i] = 1
                }
                if(length(idx_e)>0){
                    coef_e[i] = as.numeric(stats::coefficients(FIT$model)[idx_e])
                }
                auc[i] = pracma::trapz(conc_to_fit, ys) / max_concentration
                observed = FIT$input$value[ FIT$input$conditions_to_fit==
                                                cur_cond ]
            }
        }
    }
    
    FIT$fit_stats = data.frame( AUC=auc, 
                                coef_slope = round( coef_b, 3 ), 
                                coef_asymp_low = round( coef_c, 3 ), 
                                coef_asymp_high = round( coef_d, 3 ),
                                coef_EC50 = round( coef_e, 3),
                                obs_min = round( obs_min, 3), 
                                obs_max = round( obs_max, 3 ), 
                                SF50 = round( SF50, 3 ),
                                row.names = FIT$unique_conditions )
    FIT
}



#-------------------------------------------------------------------------------

#' Create a plate list with empty elements
#' 
#' 
#' @param number_of_wells Number of wells in the plate, must be one of 
#' {6, 12, 24, 96, 384}.
#' @param hour Number of hours since the experiment began. Defaults to 0.
#' @param plate_id Text string that identifies this plate map. Defaults to 
#' "plate_1".
#' @return A data frame 
#' @examples 
#' create_empty_plate( number_of_wells=6 )
#' create_empty_plate( number_of_wells=96, hour=0, plate_id="plate_12")
#' create_empty_plate( number_of_wells=384, hour=12, plate_id="plate_9")
#' @seealso \code{\link{create_empty_plate_map}} 
#' @export
create_empty_plate = function( number_of_wells, hour=0, plate_id="plate_1" ){
    PLATE_ROWS = plate_dimensions_from_wells(number_of_wells)$rows
    PLATE_COLS = plate_dimensions_from_wells(number_of_wells)$cols
    if( !is.character( plate_id ) ){
        stop( "parameter plate_id must be a character string")   
    }
    if( !is.numeric( hour ) ){
        stop( "parameter hour must be a number")
    }
    plate = data.frame(matrix(NA, nrow=PLATE_ROWS, ncol=PLATE_COLS))
    plate = cbind(plate, hours=rep(hour, PLATE_ROWS), 
                         plate_id=rep(plate_id, PLATE_ROWS), 
                         stringsAsFactors=FALSE)
    names(plate)[1:PLATE_COLS] = 1:PLATE_COLS
    dimnames(plate)[[1]] = abc[1:PLATE_ROWS]
    plate
}


#' Create an empty plate map 
#' 
#' The plate map object is a list of identically sized matrixes named 
#' treatment, concentration, sample_type, density, and passage. By default, only 
#' a single concentration and treatment is expected each individual well. If 
#' this is a synergy experiment and each well can contain two or more different 
#' treatments at varying concentrations, pass the maximum number of conditions 
#' per well to conditions_per_well. This will create additional matrixes, 
#' e.g. treatment_2 and concentration_2.
#' 
#' @param number_of_wells Number of wells in the plate, must be one of 
#' {6, 12, 24, 96, 384}
#' @param max_treatments_per_well The maximum number of treatment 
#' conditions per each well, either 1 or 2. Defaults to 1. Pass 2 only if 
#' performing a synergy experiment. 
#' @return a plate_map list with matrixes for treatment, concentration, 
#' sample_types, density, and passage
#' @seealso \code{\link{create_empty_plate}} 
#' @examples 
#' create_empty_plate_map( number_of_wells=96 )
#' @export
create_empty_plate_map = function( number_of_wells, max_treatments_per_well=1 ){
    if( !is.numeric(max_treatments_per_well)){
        stop("parameter max_treatments_per_well must be 1 or 2")
    }
    if( max_treatments_per_well!=1 & max_treatments_per_well!=2){
        stop("parameter max_treatments_per_well must be 1 or 2")
    }
    PLATE_ROWS = plate_dimensions_from_wells(number_of_wells)$rows
    PLATE_COLS = plate_dimensions_from_wells(number_of_wells)$cols 
    if( max_treatments_per_well==1 ){
        list(
            max_treatments_per_well = max_treatments_per_well,
            treatment = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            concentration = matrix(NA, ncol=PLATE_COLS, nrow=PLATE_ROWS),
            sample_type = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            density = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            passage = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS)
        )
    }
    else{
        list(
            max_treatments_per_well = max_treatments_per_well,
            treatment = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            concentration = matrix(NA, ncol=PLATE_COLS, nrow=PLATE_ROWS),
            treatment_2 = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            concentration_2 = matrix(NA, ncol=PLATE_COLS, nrow=PLATE_ROWS),
            sample_type = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            density = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS),
            passage = matrix("", ncol=PLATE_COLS, nrow=PLATE_ROWS)
        )
    }
}


#' Sorted unique list of sample types
#' 
#' Convenience method to extract sample types from a data set.
#' 
#' @param D A data frame matching the form of the output of 
#'  \code{combine_data_and_map}.
#' @return sorted vector of sample types
#' @examples 
#' pm = create_empty_plate_map( 6 )
#' pm$sample_type[1,1:3] = "line1"
#' pm$sample_type[2,1:3] = "line2"
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(100, 90, 20, 100, 89, 87), 
#'   plate_id = "plate_1",
#'   negative_control = "DMSO")
#' get_sample_types( ds )
#' @seealso  \code{\link{get_treatments}}
#' @seealso  \code{\link{get_concentrations}}
#' @seealso  \code{\link{get_hours}}
#' @seealso  \code{\link{combine_data_and_map}}
#' @export
get_sample_types = function( D ){
    ds_type = is_dataset_valid(D)
    setdiff( sort(unique(as.vector(t(D$sample_type)))), "" )
}

#' Sorted unique list of treatments
#'
#' Convenience method to extract treatments
#' @param D A a data frame matching the form of one generated by 
#' \code{combine_data_and_map}.
#' @return A sorted vector of treatments.
#' @examples 
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(100, 90, 20, 100, 89, 87), 
#'   plate_id = "plate_1",
#'   negative_control = "DMSO")
#' get_treatments( ds )
#' @seealso  \code{\link{get_sample_types}}
#' @seealso  \code{\link{get_concentrations}}
#' @seealso  \code{\link{get_hours}}
#' @seealso  \code{\link{combine_data_and_map}}
#' @export
get_treatments = function( D ){
    ds_type = is_dataset_valid(D)
    if( ds_type==2 ){
        unique( c( setdiff( sort(unique(as.vector(t(D$treatment)))), ""),
                   setdiff( sort(unique(as.vector(t(D$treatment_2)))), "")) )
    }else{
        setdiff( sort(unique(as.vector(t(D$treatment)))), "")
    }
}

#' Sorted unique list of treatment concentrations
#' 
#' Convenience method to extract treatment concentrations
#' 
#' @param D A data frame matching the format of one generated by 
#' \code{combine_data_and_map}
#' @return sorted vector of concentrations, including 0 if present
#' @examples 
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(100, 90, 20, 100, 89, 87), 
#'   plate_id = "plate_1",
#'   negative_control = "DMSO")
#' get_concentrations( ds )
#' @seealso  \code{\link{get_sample_types}}
#' @seealso  \code{\link{get_treatments}}
#' @seealso  \code{\link{get_hours}}
#' @seealso  \code{\link{combine_data_and_map}}
#' @export
get_concentrations = function( D ){
    ds_type = is_dataset_valid(D)
    setdiff( sort(unique(as.vector(t(D$concentration)))), "")
}

#' Sorted unique list of hours since time zero
#' 
#' Convenience method to extract a sorted list of the hours 
#' 
#' @param D A dataframe matching the format of one generated by 
#' \code{combine_data_and_map}
#' @return A sorted vector of hours.
#' @examples 
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(100, 90, 20, 100, 89, 87), 
#'   plate_id = "plate_1",
#'   negative_control = "DMSO")
#' get_hours( ds )
#' @seealso  \code{\link{get_sample_types}}
#' @seealso  \code{\link{combine_data_and_map}}
#' @seealso  \code{\link{get_treatments}}
#' @seealso  \code{\link{get_hours}}
#' @export
get_hours = function( D ){
    ds_type = is_dataset_valid(D)
    sort(unique(D$hours))
}

# helper function to normalize data in D relative to vehicle
prepare_for_normalization = function( D, negative_control ){
    
    # neg_ctl and neg_ctl_2 are strings that contain the name of the treatment
    # that provided a negative control. This could be DMSO, Vehicle, or the 
    # name of the drug treatment if we identify negative controls by 
    # concentration rather than by treatment name.
    # is_neg_ctl and is_neg_ctl_2 are TRUE if that well is negative control 
    # for treatment 1 or treatment 2 respectively. A well can have zero, 
    # one, or both of these variables set to TRUE.
    # for synergy experiments, need to normalize only against wells where
    # both is_neg_ctl == is_neg_ctl_2 == TRUE 
    neg_ctl = rep( NA, dim(D)[1] )
    neg_ctl_2 = rep( NA, dim(D)[1] )
    is_neg_ctl = rep( FALSE, dim(D)[1] )
    is_neg_ctl_2 = rep( FALSE, dim(D)[1] )
    
    is_synergy = "treatment_2" %in% names(D)
    
    if( is.numeric(negative_control) ){
        neg_ctl = D$treatment
        treatments = unique(D$treatment)
        # Check treatments have a well with concentration==negative_control
        for(i in 1:length(treatments)){
            idx = which( D$concentration[ D$treatment==treatments[i] ]==
                             negative_control )
            if( length( idx ) == 0 ){
                stop(paste("negative_control parameter passed with value",
                           negative_control,"so we assume each",
                           "treatment has one or more wells with concentration",
                           "=",negative_control, "which is not the case for", 
                           treatments[i] ) )
            }
            is_neg_ctl[ D$concentration==negative_control & 
                        D$treatment==treatments[i] ] = TRUE
            neg_ctl[ D$treatment==treatments[i] ] = treatments[i]
        }
        if( is_synergy ){
            neg_ctl_2 = D$treatment_2
            treatments = unique(D$treatment_2)
            # Check treatments have a well with concentration==negative_control
            for(i in 1:length(treatments)){
                idx = which( D$concentration_2[ D$treatment_2==treatments[i] ]==
                                 negative_control )
                if( length( idx ) == 0 ){
                    stop(paste("negative_control parameter passed with value",
                               negative_control,". Each treatment should have",
                               "one or more wells with concentration",
                               "=",negative_control, "which is not the case for", 
                               treatments[i] ) )
                }
                is_neg_ctl_2[ D$concentration_2==negative_control & 
                                D$treatment_2==treatments[i] ] = TRUE
                neg_ctl_2[ D$treatment_2==treatments[i] ] = treatments[i]
            } 
        }
    }else if( is.character( negative_control ) ){
        if( length( intersect( negative_control, 
                               c(D$treatment, D$treatment_2)) ) != 1 ){
            stop(paste("negative_control", negative_control,
                       "is not among the treatments on this plate"))
        }
        neg_ctl = rep( negative_control, dim(D)[1])
        is_neg_ctl = !is.na(D$treatment) & D$treatment==negative_control
        sum_of_neg_ctl = sum(is_neg_ctl)
        if( is_synergy ){
            neg_ctl_2 = rep( negative_control, dim(D)[1])
            is_neg_ctl_2 = !is.na(D$treatment_2)&D$treatment_2==negative_control
            sum_of_neg_ctl = sum_of_neg_ctl + sum(is_neg_ctl_2)
        }
        if( sum_of_neg_ctl==0 ) {
            warning(paste("negative control",negative_control,"was specified", 
                          "but no wells were marked as untreated when",
                          "including both treatment and treatment_2 columns"))   
        }
    }else if( is.data.frame( negative_control ) ){
        VD = negative_control
        if( sum(names(VD)=="drug" ) == 0 | 
            sum(names(VD)=="vehicle") == 0 ){
            stop(paste("if passed as a data frame, parameter negative_control",
                       "must have columns named 'drug' and 'vehicle'"))
        }
        drugs = unique( VD$drug )
        vehicles = unique( VD$vehicle )
        d2v = hsh_from_vectors( VD$drug, VD$vehicle )
        # vehicles are their own vehicles
        for(i in 1:length(vehicles)){
            hsh_set( d2v, VD$vehicle[i], VD$vehicle[i] ) 
        }
        treatments = c(drugs, vehicles)
        
        for( i in 1:length(treatments)){
            neg_ctl[ which(D$treatment==treatments[i]) ]=
                hsh_get(d2v,treatments[i])
        }
        
        for( i in 1:length(vehicles)){
            is_neg_ctl[ which(D$treatment==vehicles[i]) ] = TRUE
        }
        if( sum(is.na(neg_ctl))>0 ){
            stop(paste( "Not all drugs have been assigned a",
                        "negative control, check negative_control parameter"))
        }
        if( length( setdiff(neg_ctl, D$treatment)>0 ) ){
            stop( paste( "parameter negative_control contain an element in",
                         "the vehicle column that is not found in the",
                         "treatments for this plate") )
        }
        if( is_synergy ){
            for( i in 1:length(treatments)){
                neg_ctl_2[ which(D$treatment_2==treatments[i]) ]=
                    hsh_get(d2v,treatments[i])
            }
            for( i in 1:length(vehicles)){
                is_neg_ctl_2[ which(D$treatment_2==vehicles[i]) ] = TRUE
            }
            if( sum(is.na(neg_ctl_2))>0 ){
                stop(paste( "Not all drugs have been assigned a",
                          "negative control, check negative_control parameter"))
            }
            if( length( setdiff(neg_ctl, D$treatment)>0 ) ){
                stop( paste( "parameter negative_control contain an element in",
                             "the vehicle column that is not found in the",
                             "treatments for this plate") )
            }
        }
    }
    D$is_negative_control = is_neg_ctl
    D$negative_control = neg_ctl
    if( is_synergy ){
        D$is_negative_control_2 = is_neg_ctl_2
        D$negative_control_2 = neg_ctl_2
    }
    D
}


normalize_by_vehicle = function( D, summary_method ){
    if( summary_method != "mean" & summary_method != "median"){
        stop("summary_method parameter must be either mean or median")
    }
    is_synergy = "treatment_2" %in% names(D)
    
    D$value_normalized = D$value
    
    # normalization broken out by each timepoint (D$hours) on each plate 
    plates = sort(unique(D$plate_id))
    for( pp in 1:length(plates)){
        plate_cur = plates[pp]
        hours = unique( D$hours[ D$plate_id == plate_cur ] ) 
        for( tt in 1:length( hours ) ){
            hour_cur = hours[tt]
            idx_hour_plate = which( D$hours==hour_cur & D$plate_id == plate_cur)
            vehicles=unique( D$negative_control[ idx_hour_plate ] )
            if( is_synergy ){
                vehicles = unique( c(vehicles, 
                                     D$negative_control_2[ idx_hour_plate] ) )
            }
            for(vv in 1:length(vehicles)){
                vehicle_cur = vehicles[vv]
                # if this is a synergy experiment, only normalize against 
                # wells where both treatments are vehicle
                if( is_synergy ){
                    D_v = D[D$hours==hour_cur &
                            D$plate_id==plate_cur &
                            D$treatment==vehicle_cur & 
                            D$treatment_2==vehicle_cur & 
                            D$is_negative_control &
                            D$is_negative_control_2,]
                }else{
                    D_v = D[D$hours==hour_cur &
                            D$plate_id==plate_cur &
                            D$treatment==vehicle_cur & 
                            D$is_negative_control,]
                }
                D_v = plyr::ddply( D_v, c("sample_type", "hours"), 
                                   function(x){ data.frame(
                                       mu=mean(x$value, na.rm=TRUE),
                                       med=stats::median(x$value, na.rm=TRUE) )} 
                )
                if( summary_method=="mean" ){
                    D_v$value = D_v$mu
                }else{
                    D_v$value = D_v$med
                }
                for(i in 1:length( D_v$sample_type ) ){
                    sample_type_cur = D_v$sample_type[i]
                    idx = which( D$hours==hour_cur & 
                                     D$plate==plate_cur & 
                                     D$negative_control==vehicle_cur &
                                     D$sample_type==sample_type_cur )
                    D$value_normalized[idx] = D$value[idx] / D_v$value[i]
                } 
            }
        }
    }
    D
}

#' Create a dataset from raw data without a plate map
#' 
#' @param sample_types vector of sample types
#' @param treatments vector of treatments
#' @param concentrations vector of concentrations
#' @param values vector of measured response to treatment
#' @param hours time points for each observation. If a number, the same time 
#' point is assigned to all observations. If a vector, there should be one 
#' number for each observation. Defaults to 0.
#' @param negative_control Controls the normalization. This value may be NA, 
#' a number, a string, or a data frame.  
#' 
#' \itemize{
#'  \item{NA: Use when there are no negative control measurements. The contents 
#'    of the column named 'value_normalized' will be copied from the contents of 
#'    the column named 'value'. }
#'  \item{Number: Use when each treatment has been labeled with a concentration 
#'    (typically 0) that indicates the vehicle control. Each treatment must 
#'    contain one or more observations with this concentration, and these 
#'    observations will be the negative controls.}
#'  \item{string: Use when a single set of observations is a universal control. 
#'    The treatment whose name matches the string is the universal 
#'    negative control all of the data.}
#'  \item{data frame: Use when more than one negative control exists, and you 
#'    have to map different treatments to a particular negative control. The 
#'    data frame must have names 'drug' and 'vehicle', and the data frame will 
#'    map match treatments in the 'drug' column to those in the 
#'    'vehicle' column.}
#' }
#' @param plate_id Text string identifying this experiment, useful if 
#' multiple datasets are later combined. Defaults to "plate_1".
#' @param treatments_2 vector of second treatments, used for synergy datasets. 
#' Defaults to NULL, meaning only a single treatment per well.
#' @param concentrations_2 vector of second concentrations, used for synergy 
#' datasets. Defaults to NULL, meaning only a single treatment per well.
#' @param summary_method Method used to combine replicate measures into a single 
#' value; must be one of "mean", "median". Defaults to "mean".
#' @return A data frame where columns indicate the sample type, treatment, 
#' concentration, observed raw value, normalized value,
#' name of the negative_control treatment, whether a particular row 
#' is a negative control for at least one other row, hours since the start time,
#' and plate of origin 
#' @examples 
#' # six measurements: DMSO, 100, and 200 nM for two drugs. 
#' # plan to normalize each line against DMSO for that line
#' # not specifying hours or a plate ID
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   values = c(98, 90, 20, 99, 89, 87), 
#'   negative_control = "DMSO")
#' 
#' # same as above, now specifying hours and a plate ID
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug2","DMSO","drug1","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(98, 90, 20, 99, 89, 87), 
#'   plate_id = "plate_dq",
#'   negative_control = "DMSO")
#'   
#' # six measurements; drug1 at 0, 100, 200 nM and drug2 at 0, 100, 200 nM. 
#' # plan to normalize against zero concentration for each line
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("drug1","drug1","drug2","drug2","drug2","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(98, 90, 20, 99, 89, 87), 
#'   plate_id = "plate_dq",
#'   negative_control = 0)
#'   
#' # six measurements; drug1 at 0, 100, 200 nM and drug2 at 0, 100, 200 nM. 
#' # plan to normalize drug1 against DMSO and drug2 against ethanol
#' individual_vehicles = data.frame(
#'   drug=c("drug1", "drug2"), 
#'   vehicle=c("DMSO", "ethanol"),
#'   stringsAsFactors=FALSE)
#' ds = create_dataset( 
#'   sample_types= c("line1","line1","line1","line2","line2","line2"),
#'   treatments = c("DMSO","drug1","drug1","ethanol","drug2","drug2"),
#'   concentrations = c(0, 100, 200, 0, 100, 200),
#'   hours = c(48, 48, 48, 48, 48, 48),
#'   values = c(98, 90, 20, 99, 89, 87), 
#'   plate_id = "plate_dq",
#'   negative_control = individual_vehicles)
#' @export
create_dataset = function( sample_types, treatments, concentrations, values, 
                           hours=0, plate_id="plate_1", negative_control = NA,
                           treatments_2 = NULL, concentrations_2 = NULL,
                           summary_method="mean"){
    if( length(plate_id) != 1 ){
        stop("parameter plate_id should be a single string")   
    }
    if( sum( !is.numeric(hours))>0 ){
        stop( "Hours must be number or a vector of numbers")
    }
    if( length(hours)==1 ){
        hours = rep(hours, length(sample_types))   
    }
    if( length(sample_types) != length(treatments) |
        length(treatments) != length(concentrations) | 
        length(concentrations) != length(hours) | 
        length(hours) != length(values) ){
        stop( paste("parameters sample_types, treatments, concentrations,",
                    "values, and hours must have the same length"))
    }
    if( !is.null( treatments_2 ) ){
        if(length(treatments_2) != length( treatments ) ){
            stop(paste("parameters treatments and treatments_2 must have the",
                        "same length") )
        }
    }
    if( !is.null( concentrations_2 ) ){
        if(length(concentrations_2) != length( concentrations ) ){
            stop(paste("parameters concentrations and concentrations_2 must ",
                       "have the same length") )
        }
    }
    if( (!is.null( treatments_2 ) & is.null( concentrations_2)) | 
        (is.null( treatments_2 ) & !is.null( concentrations_2) ) ){
        stop(paste("if either concentrations_2 or treatments_2 is passed, both",
            "parameters must be passed and must have the same length") )
    }
    
    df = data.frame( sample_type=sample_types, 
                     treatment=treatments, 
                     concentration=concentrations, 
                     hours, 
                     value=values, 
                     value_normalized=values,
                     plate_id = rep(plate_id, length(treatments)),
                     negative_control = rep(NA, length(values) ),
                     is_negative_control = rep(NA, length(values) ),
                     stringsAsFactors=FALSE )
    if(!is.null(treatments_2)){
        df = cbind( df, treatment_2=treatments_2, 
                    concentration_2=concentrations_2, stringsAsFactors=FALSE)   
    }
    df = prepare_for_normalization( df, negative_control ) 
    normalize_by_vehicle(df, summary_method=summary_method )
}

#' Create a synergy dataset from raw data without a plate map
#' 
#' A synergy dataset differs from a standard dataset in that each value is the 
#' result of combining two distinct treatments in two distinct concentrations. 
#' If there are measurements where only one treatment was present, the other 
#' treatment should be specified to have a concentration equal to zero. 
#' 
#' @param sample_types vector of sample types
#' @param treatments_1 vector of treatment one
#' @param treatments_2 vector of treatment two
#' @param concentrations_1 vector of concentrations of treatment one
#' @param concentrations_2 vector of concentrations of treatment two
#' @param values vector of measured response to treatments one and two
#' @param hours time points for each observation. If a number, the same time 
#' point is assigned to all observations. If a vector, there should be one 
#' number for each observation. Defaults to 0.
#' @param plate_id experiment identification string, useful if multiple datasets 
#' are later combined. Defaults to "plate_1"
#' @param negative_control A designation for the negative controls in this 
#' dataset, if they exist. Value may be NA, a number, a string, or a data frame.
#' 
#' \itemize{
#'  \item{NA: Use when there are no negative control measurements. The contents 
#'    of the column named 'value_normalized' will be copied from the contents of 
#'    the column named 'value'. }
#'  \item{Number: Use when each treatment has been labeled with a concentration 
#'    (typically 0) that indicates the vehicle control. Each treatment must 
#'    contain one or more observations with this concentration, and these 
#'    observations will be the negative controls.}
#'  \item{string: Use when a single set of observations is a universal control. 
#'    The treatment whose name matches the string is the universal 
#'    negative control all of the data.}
#'  \item{data frame: Use when more than one negative control exists, and you 
#'    have to map different treatments to a particular negative control. The 
#'    data frame must have names 'drug' and 'vehicle', and the data frame will 
#'    map match treatments in the 'drug' column to those in the 
#'    'vehicle' column.}
#' }
#' @param summary_method summarize replicate measures by either mean or median; 
#' must be one of "mean", "median". Defaults to "mean"
#' @return a data frame where columns indicate the sample type, treatment 1, 
#' treatment 2, concentration of treatment 1, concentration of treatment 2, 
#' observed raw value, normalized value, name of the negative_control treatment, 
#' whether a particular row is a negative control for at least one other row, 
#' hours since the start time, and plate of origin.
#' @examples 
#' dose_SCH=c(0.1, 0.5, 1, 2, 4)
#' eff_SCH = c(0.6701, 0.6289, 0.5577, 0.4550, 0.3755)
#' dose_4HPR = c(0.1, 0.5, 1, 2)
#' eff_4HPR = c(0.7666, 0.5833, 0.5706, 0.4934)
#' eff_comb = c(0.6539, 0.4919, 0.3551, 0.2341)
#' syn = data.frame( 
#'     treatment_1 = rep("SCH66336", 13),
#'     conc_1 = c( dose_SCH, rep(0, 4), dose_SCH[1:4]),
#'     treatment_2 = rep("4-HPR", 13),
#'     conc_2 = c( rep(0, 5), dose_4HPR, dose_4HPR ),
#'     values = c(eff_SCH, eff_4HPR, eff_comb ), stringsAsFactors=FALSE )
#' ds_lk = create_synergy_dataset( sample_types = rep("sample_1", 13), 
#'                                 treatments_1 = syn$treatment_1,
#'                                 treatments_2 = syn$treatment_2,
#'                                 concentrations_1 = syn$conc_1,
#'                                 concentrations_2 = syn$conc_2,
#'                                 values = syn$values)
#' @export
create_synergy_dataset = function( sample_types, treatments_1, treatments_2, 
                                   concentrations_1, concentrations_2, values, 
                                   hours=0, plate_id="plate_1", 
                                   negative_control = NA,
                                   summary_method="mean" ){
    if( length(plate_id) != 1 ){
        stop("parameter plate_id should be a single string")   
    }
    if( sum( !is.numeric(hours))>0 ){
        stop( "Hours must be number or a vector of numbers")
    }
    if( length(hours)==1 ){
        hours = rep(hours, length(sample_types))   
    }
    if( length(sample_types) != length(treatments_1) |
        length(treatments_1) != length(treatments_2) | 
        length(treatments_1) != length(concentrations_1) | 
        length(concentrations_1) != length(concentrations_2) | 
        length(concentrations_1) != length(hours) | 
        length(hours) != length(values) ){
        stop( paste("parameters sample_types, treatments, concentrations,",
                    "values, and hours must have the same length"))
    }
    if( is.factor( treatments_1) | is.factor( treatments_2) ){
        stop("treatment_1 and treatments_2 must be strings, not factors")
    }
    if( is.factor( sample_types ) ){
        stop("sample_types must be vectors of strings, not factors")
    }
    df = data.frame( sample_type=sample_types, 
                     treatment=treatments_1, 
                     concentration=concentrations_1, 
                     treatment_2=treatments_2, 
                     concentration_2=concentrations_2, 
                     hours, 
                     value=values, 
                     value_normalized=values,
                     plate_id = rep(plate_id, length(treatments_1)),
                     negative_control = rep(NA, length(values) ),
                     is_negative_control = rep(NA, length(values) ),
                     stringsAsFactors=FALSE )
    df = prepare_for_normalization( df, negative_control ) 
    normalize_by_vehicle(df, summary_method=summary_method )
}

#' Combine raw data and plate map to produce a single tall data frame
#'
#' Combine a single data plate matching the output of 
#' \code{\link{create_empty_plate}} with a plate map matching the output of 
#' \code{\link{create_empty_plate_map}} to create a single tall data frame with 
#' one row per observation. 
#' 
#' The user specifies which wells (if any) bear negative controls (i.e. 
#' vehicles) 
#' 
#' @param raw_plate data frame with format matching that produced by a call to 
#' \code{read_incucyte_exported_to_excel} 
#' @param plate_map list where each item specifies concentrations, treatments, 
#' and cell lines for a plate specified in raw_plates, matching the format of a 
#' file created by \code{read_platemap_from_excel} 
#' @param negative_control A designation for the negative controls in this 
#' dataset, if they exist. Value may be NA, a number, a string, or a data frame.
#' 
#' \itemize{
#'  \item{NA: Use when there are no negative control measurements. The contents 
#'    of the column named 'value_normalized' will be copied from the contents of 
#'    the column named 'value'. }
#'  \item{Number: Use when each treatment has been labeled with a concentration 
#'    (typically 0) that indicates the vehicle control. Each treatment must 
#'    contain one or more observations with this concentration, and these 
#'    observations will be the negative controls.}
#'  \item{string: Use when a single set of observations is a universal control. 
#'    The treatment whose name matches the string is the universal 
#'    negative control all of the data.}
#'  \item{data frame: Use when more than one negative control exists, and you 
#'    have to map different treatments to a particular negative control. The 
#'    data frame must have names 'drug' and 'vehicle', and the data frame will 
#'    map match treatments in the 'drug' column to those in the 
#'    'vehicle' column.}
#' }
#' @param summary_method summarize replicate measures by either mean or median; 
#' must be one of "mean", "median". Defaults to "mean"
#' @param na.rm remove from final dataset rows where sample_type, treatment, 
#' or concentration has the value NA. Default TRUE.
#' @return A data frame where columns indicate the sample type, treatment, 
#' concentration, observed raw value, normalized value, name of the 
#' negative_control treatment, whether a particular row 
#' is a negative control for at least one other row, hours since the start time,
#' and plate of origin. If the data are from a synergy experiment, there will 
#' be additional columns concententration_2 and treatment_2.
#' @examples 
#' # Create a six well plate testing two drugs on two cell lines. Each drug has 
#' # two concentrations plus a DMSO well. Plan to normalize against DMSO.
#' # Obviously real data would include replicates, more concentrations, etc.
#' plate = create_empty_plate( 6, hour=0, plate_id="plate_1")
#' plate[1:2,1:3] = c(99,98,90,87,77,20)
#' plate_map = create_empty_plate_map( number_of_wells = 6 )
#' plate_map$concentration[1:2,1:3] = c( 0, 0, 100, 100, 200, 200 )
#' plate_map$treatment[1:2,1:3] = c("DMSO","DMSO","drug1","drug2","drug1",
#'                                  "drug2")
#' plate_map$sample_type[1:2,1:3] = rep( c("line1", "line2"), 3)
#' combine_data_and_map( plate, plate_map, negative_control="DMSO" )
#' 
#' # Create a six well plate testing two drugs on two cell lines. Each drug has 
#' # three concentrations, one of which is vehicle-only and labeled zero.
#' plate = create_empty_plate( 6, hour=0, plate_id="plate_1")
#' plate[1:2,1:3] = c(99,98,90,87,77,20)
#' plate_map = create_empty_plate_map( number_of_wells = 6 )
#' plate_map$concentration[1:2,1:3] = c( 0, 0, 100, 100, 200, 200 )
#' plate_map$treatment[1:2,1:3] = c("drug1","drug2","drug1","drug2","drug1",
#'                                  "drug2")
#' plate_map$sample_type[1:2,1:3] = rep( c("line1", "line2"), 3)
#' combine_data_and_map( plate, plate_map, negative_control=0 )
#' @export
combine_data_and_map = function( raw_plate, 
                                 plate_map, 
                                 negative_control=0, 
                                 summary_method="mean", 
                                 na.rm=TRUE){
    
    COLS = dim(plate_map$concentration)[2]
    ROWS = dim(plate_map$concentration)[1]
    
    if( dim(raw_plate)[2] != COLS + 2 ){
        stop("raw_plate must have two more columns than plate map")    
    }
    N_data_cols = COLS
    
    if( length( which(names(raw_plate)=="hours") )==0 ){
        stop("raw_plate must have hours column")
    }
    if( length( which(names(raw_plate)=="plate_id") )==0 ){
        stop("raw_plate must have plate_id column")
    }
    
    if( length( which(names(plate_map)=="concentration") )==0 ){
        stop("plate_map must have concentration matrix")
    }
    if( length( which(names(plate_map)=="treatment") )==0 ){
        stop("plate_map must have treatment matrix")
    }
    if( length( which(names(plate_map)=="sample_type") )==0 ){
        stop("plate_map must have sample_type matrix")
    }
    if( dim(plate_map$treatment)[1] != ROWS | 
        dim(plate_map$treatment)[2] != N_data_cols ){
        stop("plate_map treatment matrix dimensions don't match raw_data")
    }
    if( dim(plate_map$sample_type)[1] != ROWS | 
        dim(plate_map$sample_type)[2] != N_data_cols ){
        stop("plate_map sample_type matrix dimensions don't match raw_data")
    }
    # check if this is a synergy dataset
    is_synergy=FALSE
    if( length( which(names(plate_map)=="concentration_2") )==1 ){
        if( length( which(names(plate_map)=="treatment_2") ) != 1 ){
            stop(paste("inferring that this is a synergy plate map from",
             "presence of concentration_2; the plate_map list is missing a",
             "matrix called treatment_2"))
        }else{
            is_synergy=TRUE   
        }
    }
    if( length( which(names(plate_map)=="treatment_2") )==1 ){
        if( length( which(names(plate_map)=="concentration_2") ) != 1 ){
            stop(paste("inferring that this is a synergy plate map from",
                       "presence of concentration_2; the plate_map list is",
                       "missing a matrix called concentration_2"))
        }
    }
    N_obs = sum( !is.na( raw_plate[, 1:N_data_cols] ) )
    sample_type = rep("", N_obs)
    treatment = rep("", N_obs)
    concentration = rep(0, N_obs)
    treatment_2 = rep("", N_obs)
    concentration_2 = rep(0, N_obs)
    value = rep("", N_obs)
    hours = rep(0, N_obs)
    plate_ids = rep("", N_obs)
    idx=1
    rr_p = 0
    # dim(raw_plate)[1] is number of rows per plate * number of timepoints
    for(rr in 1:dim(raw_plate)[1]){
        rr_p = rr_p + 1
        if(rr_p==ROWS+1){
            rr_p = 1 # roll back to first row of plate
        }
        for(cc in 1:N_data_cols){
            if( !is.na(raw_plate[rr,cc]) ){
               sample_type[idx] =   plate_map[["sample_type"]][rr_p, cc]
               treatment[idx] =     plate_map[["treatment"]][rr_p, cc]
               concentration[idx] = plate_map[["concentration"]][rr_p, cc]
               value[idx] =      raw_plate[rr,cc]
               if(is_synergy){
                   treatment_2[idx] =     plate_map[["treatment_2"]][rr_p, cc]
                   concentration_2[idx]=plate_map[["concentration_2"]][rr_p, cc]
                   if( is.na( concentration_2[idx] ) ){
                       concentration_2[idx]=0
                   }
                   if( is.na( treatment_2[idx]) | treatment_2[idx]=="" ){
                       treatment_2[idx] = treatment[idx]
                   }
               }
               hours[idx] =      raw_plate$hours[rr]
               plate_ids[idx] =  raw_plate$plate_id[rr]
               idx = idx+1
            }
        }
    }
    keep = !is.na( concentration ) & !is.na( treatment ) & !is.na( sample_type )
    value[ !keep ] = NA
    concentration[ keep ] = as.numeric( concentration[ keep ] )

    if( is_synergy ){
        keep = keep & !is.na( concentration_2 ) & !is.na( treatment_2 )
        value[ !keep ] = NA
        concentration_2[ keep ] = as.numeric( concentration_2[ keep ] )
        D = data.frame( sample_type, treatment, concentration, 
              treatment_2, concentration_2, 
              value = as.numeric(value), value_normalized = as.numeric( value ),
              negative_control = rep(NA, length(value) ),
              is_negative_control = rep(NA, length(value) ),
              hours = as.numeric(hours), plate_id = plate_ids, 
              stringsAsFactors=FALSE )        
    }else{
        D = data.frame( sample_type, treatment, concentration, 
              value = as.numeric(value), value_normalized = as.numeric( value ),
              negative_control = rep(NA, length(value) ),
              is_negative_control = rep(NA, length(value) ),
              hours = as.numeric(hours), plate_id = plate_ids, 
              stringsAsFactors=FALSE )
    }
    D = prepare_for_normalization( D, negative_control )
    D = normalize_by_vehicle( D, summary_method=summary_method )
    if( na.rm ){
        keep = !is.na( D$sample_type ) & !is.na( D$treatment ) & !is.na( D$concentration )
        D = D[keep,]
    }
    D
}

    
#' reshape a set of values to a matrix with axes defined by X any Y vectors
#' 
#' Useful for extracting two columns out of a data frame that will serve as 
#' column and row names, with a third column that will represent the value at 
#' that (row, column). The pairs identified by {x_axis_labels,y_axis_labels} 
#' should be unique; if not, the last value seen will be used.
#' 
#' @param x_axis_labels vector of values that define the X axis
#' @param y_axis_labels vector of values that define the Y axis
#' @param values value to position at (X,Y) defined by the labels
#' @examples 
#' xlab = c("a","a","a","b","b","b","c","c","c")
#' ylab = c("x","y","z","x","y","z","x","y","z")
#' vals = c(1, 2 , 3 , 4 , 5 , 6 , 7 , 8, 9)
#' metric_to_grid(xlab, ylab, vals)
#' @return Matrix with columns named for x_axis_labels, rows named for 
#' y_axis_labels, and elements corresponding to the values parameter. X and Y 
#' labels will be sorted.
#' @export
metric_to_grid = function( x_axis_labels, y_axis_labels, values ){
    
    if( length(x_axis_labels) != length(y_axis_labels) ){
        stop("axis labels must have the same length")   
    }
    if( length(x_axis_labels) != length(values) ){
        stop("values must have the same length as axis labels")   
    }
    # sort first, then convert to strings. If convert first, labels that are 
    # numbers will be sorted incorrectly.
    xl = sort(unique(x_axis_labels))
    yl = sort(unique(y_axis_labels))
    
    x_axis_labels=as.character(x_axis_labels)
    y_axis_labels=as.character(y_axis_labels)
    xl=as.character(xl)
    yl=as.character(yl)
    xl2idx = hsh_from_vectors( xl, 1:length(xl))
    yl2idx = hsh_from_vectors( yl, 1:length(yl))
    G = matrix( NA, nrow=length(yl), ncol=length(xl), dimnames=
                    list(yl, xl))
    for(i in 1:length(x_axis_labels)){
        G[ hsh_get(yl2idx, y_axis_labels[i] ), 
           hsh_get(xl2idx, x_axis_labels[i] )] = values[i] 
    }
    G
}



#' given a synergy dataset plot multiple dose response curves for 
#' drug A separated by individual concentrations of drug B. Standardizes new 
#' dataset to have drug A as in the columns "treatment" and "concentration". 
#' Creates new sample types, one for each dose of drug B. The resulting 
#' dataset can then be passed through to fit_DRC without modification.
#' 
#' @param D experiment dataset with columns matching the output of 
#' \code{combine_data_and_map}
#' @param sample_type single sample type present in D
#' @param treatment_a treatment present in D that will be plotted at all 
#' concentrations in each dose response curve
#' @param treatment_b treatment present in D that will be used to divide 
#' readings into distinct dose response curves
#' @param concentrations_a concentrations of treatment A present in D. Do not 
#' pass the concentration of the vehicle, if one is present. 
#' @param concentrations_b concentrations of treatment B present in D. Do not 
#' pass the concentration of the vehicle, if one is present. 
#' @export
convert_dataset_for_synergy_DRC = function( D, sample_type, 
                            treatment_a = "", treatment_b = "",
                            concentrations_a = NULL, concentrations_b = NULL){
    
    ds_type = is_dataset_valid(D, sample_types=c(sample_type), 
                               treatments = c(treatment_a, treatment_b))
    if( ds_type != 2 ){
        stop("This function can only be called on synergy datasets")   
    }
    D_s = D[D$sample_type==sample_type, ]

    D_std = standardize_treatment_assigments(D_s, treatment_a, treatment_b)
    cols=c("sample_type", "treatment", "treatment_2", "concentration_2")
    D_std$sample_type = apply( D_std[ , cols ], 1, paste , collapse = "_" )
    #D_std=D_std[ ( D_std$treatment == treatment_a & D_std$treatment_2 == treatment_b & 
    #                   D_std$concentration_2 %in% concentrations_b ) | 
    
    D_std=D_std[ ( D_std$treatment == treatment_a & 
                   D_std$treatment_2 == treatment_b & 
                   D_std$concentration_2 %in% concentrations_b ) | 
#                ( D_std$is_negative_control & D_std$is_negative_control_2 ),]                     
                 ( D_std$is_negative_control_2 ),]    
    D_std[order(D_std$concentration_2),] 
}
    
#' Attempt to fit a dose-response curve from the dataset and calculate AUC.
#'
#' Given a data frame of measurements generated by 
#' \code{\link{combine_data_and_map}} or matching the format generated by that 
#' function, fit a dose-response curve for each unique sample_type/treatment 
#' specified by the sample_types and treatments parameters. A curve will 
#' be fit for each sample in sample_types, at each treatment in treatments.
#' 
#' If the data are from a synergy experiment, you must specify the concentration
#' to use for the curve in the concentration_column parameter as one of 
#' {concentration, concentration_2}.
#' 
#' @section Non-linear function for curve fitting:
#' 
#' Curve fitting is performed by the \code{drm()} function in the \code{drc} 
#' library. To fit the curve, you need to select a non-linear function. To 
#' estimate the slope, upper asymptote, lower asymptote, and EC50, pass 
#' drc::LL.4(). To fix the lower asymptote at 1 and estimate the other 
#' parameters, pass drc::LL.3(). To fix the upper asympotote at 1 and the lower 
#' asymptote at 0, pass drc::LL.2. For a list of available functions, see 
#' \code{drc::getMeanFunctions()}. 
#' 
#' To call this function you must load the drc package in your R session.
#' @param D experiment dataset with columns matching the output of 
#' \code{combine_data_and_map}
#' @param fct Non-linear function to fit, e.g. drc::LL.3(). See summary. 
#' @param sample_types Which sample types (e.g. distinct cell lines) of the 
#' sample types in D will be fit. If NA, fit all sample types. Default is NA.
#' @param treatments Which treatments (e.g. drugs) of the treatments in D will 
#' be fit. If NA, fit all treatments. Default is NA.
#' @param hour The hour in experiment dataset D at which to fit. If NA, combine
#' all timepoints.
#' @param concentration_columns The name of the columns in D to use for the 
#' concentration values in the curve. Defaults to NA, which will use the 
#' "concentration" column for all treatments. This is the correct choice for 
#' non-synergy experiments. For synergy experiments, where concentration may 
#' be in either column "concentration" or "concentration_2", pass a vector with 
#' a value for each treatment that is either "concentration" or 
#' "concentration_2".
#' @param treatment_columns The name of the columns in D to use for the 
#' treatment values in the curve. Defaults to NA, which will use the "treatment" 
#' column for all treatments. This is the correct choice for non-synergy 
#' experiments. For synergy experiments, where treatments may be in either 
#' column "treatment" or "treatment_2", pass a vector with value for each 
#' treatment that is either "treatment" or "treatment_2".
#' @examples 
#' sample_types = rep( c(rep("line1",3), rep("line2",3)), 5)
#' treatments = c(rep("DMSO",6), rep("drug",24))
#' concentrations = c( rep(0,6),rep(200,6), rep(500,6),rep(1000,6),rep(5000,6))
#' values=c(100,99,100,90,91,92,99,97,99,89,87,88,86,89,88,56,59,58,66,65,67,
#'          25,23,24,42,43,46,4,5,9)
#' hours = rep(48, length(values))
#' plate_id = "plate_1"
#' ds = create_dataset( sample_types=sample_types, treatments=treatments, 
#'                      concentrations=concentrations,hours=hours,values=values, 
#'                      plate_id=plate_id, negative_control = "DMSO")
#' library(drc)
#' # Fit model using three-parameter log-logistic function
#' fit_DRC(ds, sample_types=c("line1", "line2"), treatments=c("drug"), 
#'         hour = 48, fct=drc::LL.3() )
#'  
#' # Fit model using four-parameter log-logistic function
#' fit_DRC(ds, sample_types=c("line1", "line2"), treatments=c("drug"), 
#'         hour = 48, fct=drc::LL.4() )
#' @return A HT_fit object
#' @import drc
#' @importFrom stats anova
#' @export
fit_DRC = function(D, fct, sample_types=NA, treatments=NA, hour=NA,
                   concentration_columns=NA, treatment_columns=NA){
    
    ds_type = is_dataset_valid(D, sample_types=sample_types, 
                               treatments = treatments, hour=hour)
    if( is.na(sample_types[1] ) ){
        sample_types = sort(unique(D$sample_type))    
    }
    
    # set treatments to all values if user passed NA
    if( is.na(treatments[1]) ){
        treatments = sort(unique(D$treatment[!D$is_negative_control]))
    }
    
    # set treatment_columns to treatment if user passed NA
    if( is.na( treatment_columns ) ){
        treatment_columns = rep("treatment", length(treatments) )
    }else{
        if( length(treatment_columns) != length(treatments) ){
            stop(paste("parameter treatment_columns must be either NA or",
                     "be a vector of the same length as parameter treatments"))
        }
        for( i in 1:length(treatment_columns) ){
            if( treatment_columns[i] != "treatment" & 
                treatment_columns[i] != "treatment_2")
                stop(paste("Values for parameter treatment_columns must be", 
                           "one of {treatment, treatment_2}") )
        }
    }
    
    # check treatments; treatment_columns will have same length by now
    for(i in 1:length(treatments)){
        if( treatment_columns[i]=="treatment_2" ){
            if( sum(D$treatment_2==treatments[i], na.rm=TRUE)==0 ) 
                stop(paste("value in treatments parameter", treatments[i],
                           "not present in D in column treatments_2"))
        }else{
            if( sum(D$treatment==treatments[i], na.rm=TRUE)==0 ) 
                stop(paste("value in treatments parameter", treatments[i],
                           "not present in D in column treatments"))   
        }
    }
    
    if( is.na( concentration_columns ) ){
        concentration_columns = rep("concentration", length(treatments) )
    }else{
        if( length(concentration_columns) != length(treatments) ){
            stop(paste("parameter concentration_columns must be either NA or",
                      "be a vector of the same length as parameter treatments"))
        }
        for( i in 1:length(concentration_columns) ){
            if( concentration_columns[i] != "concentration" & 
                concentration_columns[i] != "concentration_2")
                stop(paste("Values for parameter concentration_columns must be",
                            "one of {concentration,concentration_2}") )
        }
    }
    
    if( is.na(hour[1]) ){
        hour = unique(D$hours)
    }

    if(length(unique(hour))>1 ){
        warning("Fitting observations from more than one timepoint")
    }

    FIT = HT_fit()
    FIT$input = subset_treatments( D, sample_types, treatments, hour, 
                                   concentration_columns, treatment_columns )
    FIT$unique_conditions = sort( unique( FIT$input$conditions_to_fit ) ) 
    FIT$sample_types = sample_types
    FIT$treatments = treatments
    drm_settings = drc::drmc(errorm = FALSE)
    model_input = FIT$input
    if(length(unique(model_input$concentration)) < 2 ){
        stop("Model fit requires at least two distinct concentrations")
    }
    model_input$conditions_to_fit = factor(model_input$conditions_to_fit)
    conditions_to_fit = model_input$conditions_to_fit
    if( length( unique( model_input$conditions_to_fit ) ) == 1 ){
        FIT$model = try( drc::drm( value~concentration, 
                                   data=model_input, 
                                   fct = fct) )
    }
    else{
        FIT$model = try( 
            drc::drm( value~concentration, conditions_to_fit, data=model_input, 
                      fct = fct), 
            silent=TRUE )
    }
    if( !"try-error" %in% class( FIT$model ) ){
        FIT$is_fitted=TRUE
        # Difference between fits F statistic and P value
        M = try( drc::drm(value~concentration, data=model_input, fct = fct), 
                 silent=TRUE )
        if( !"try-error" %in% class( M ) ){
            M_anova = stats::anova( FIT$model, M, details=FALSE )
            FIT$ANOVA_F_test = M_anova$`F value`[2]
            FIT$ANOVA_P_value = M_anova$`p value`[2]
            FIT$ANOVA=M_anova
        }
    }
    FIT=AUC( FIT )
    drm_settings = drc::drmc( errorm = TRUE )
    FIT
}


#' Attempt to fit multiple dose-response curves for a range of doses for 
#' treatment_a, broken out by individual doses of treatment_b.
#'
#' Given a data frame of measurements generated by 
#' \code{\link{combine_data_and_map}} or matching the format generated by that 
#' function, fit a dose-response curve for each unique sample_type/treatment 
#' specified by the sample_types and treatments parameters. A curve will 
#' be fit for each sample in sample_types, at each treatment in treatments.
#' 
#' If the data are from a synergy experiment, you must specify the concentration
#' to use for the curve in the concentration_column parameter as one of 
#' {concentration, concentration_2}.
#' 
#' @section Non-linear function for curve fitting:
#' 
#' Curve fitting is performed by the \code{drm()} function in the \code{drc} 
#' library. To fit the curve, you need to select a non-linear function. To 
#' estimate the slope, upper asymptote, lower asymptote, and EC50, pass 
#' drc::LL.4(). To fix the lower asymptote at 1 and estimate the other 
#' parameters, pass drc::LL.3(). To fix the upper asympotote at 1 and the lower 
#' asymptote at 0, pass drc::LL.2. For a list of available functions, see 
#' \code{drc::getMeanFunctions()}. 
#' 
#' To call this function you must load the drc package in your R session.
#' @param ds experiment dataset with columns matching the output of 
#' \code{combine_data_and_map}
#' @param fct Non-linear function to fit, e.g. drc::LL.3(). See summary. 
#' @param sample_type Which sample type (e.g. distinct cell line) of the 
#' sample types in D will be fit. Unlike fit_DRC, a single sample type must 
#' be passed.
#' @param hour The hour in experiment dataset D at which to fit. 
#' @param treatment_for_DRC treatment whose values will be used to plot 
#' individual dose-response curves
#' @param treatment_subgroups treatment that will be used to subgroup 
#' measurements of treatment_for_DRC
#' @param concentrations_for_DRC concentrations used for treatment_for_DRC. 
#' Defaults to NA, use all valid concentrations 
#' @param concentrations_subgroups concentrations used for treatment_subgroups. 
#' Defaults to NA, use all valid concentrations 
#' 
#' @return A list with elements fit, a HT_fit object, and legend_details, a 
#' data frame that indicates the correspondence between the plot colors that 
#' will be chosen by default if plot(fit) is called and the concentration of 
#' treatment_b.
#' @import drc
#' @export
fit_DRC_synergy = function(ds, fct, sample_type, hour, 
                           treatment_for_DRC=NA, treatment_subgroups, 
                           concentrations_for_DRC=NA, 
                           concentrations_subgroups=NA){
    if( length(sample_type) != 1 ){
        stop("a single sample_type must be passed")   
    }
    ds_syn=convert_dataset_for_synergy_DRC( ds,
                                            sample_type = sample_type, 
                                            treatment_a = treatment_for_DRC, 
                                            treatment_b = treatment_subgroups,
                                            concentrations_a = 
                                                concentrations_for_DRC,
                                            concentrations_b = 
                                                concentrations_subgroups )
    samples=unique(ds_syn$sample_type[!ds_syn$is_negative_control])
    fit_syn = fit_DRC(ds_syn, 
                      sample_types = samples,
                      treatments = treatment_for_DRC,
                      fct=fct, hour = hour)
    
    fpc = HT_fit_plot_colors(fit_syn)
    concentration=rep(0, length(fpc$condition))
    for(i in 1:length(concentration)){
        concentration[i] = strsplit( 
            strsplit( fpc$condition[i], split = "_|_", fixed = TRUE)[[1]][1], 
            split="_", fixed=TRUE)[[1]][4]  
    }
    fpc = data.frame( col=fpc$col, 
                      concentration=as.numeric(concentration), 
                      stringsAsFactors=FALSE)
    list(fit=fit_syn, legend_details=fpc)
}

#' Summarize the results of a dose response fit
#' 
#' @param object fit object
#' @param ... other parameters, currently ignored
#' @examples 
#' sample_types = rep( c(rep("line1",3), rep("line2",3)), 5)
#' treatments = c(rep("DMSO",6), rep("drug",24))
#' concentrations = c( rep(0,6),rep(200,6), rep(500,6),rep(1000,6),rep(5000,6))
#' values=c(100,99,100,90,91,92,99,97,99,89,87,88,86,89,88,56,59,58,66,65,67,
#'          25,23,24,42,43,46,4,5,9)
#' hours = rep(48, length(values))
#' plate_id = "plate_1"
#' ds = create_dataset( sample_types=sample_types, treatments=treatments, 
#'                      concentrations=concentrations, hours=hours, 
#'                      values=values, plate_id=plate_id, 
#'                      negative_control = "DMSO")
#' library(drc)
#' # Fit model using three-parameter log-logistic function
#' fit_1 = fit_DRC(ds, sample_types=c("line1", "line2"), treatments=c("drug"), 
#'                 hour = 48, fct=drc::LL.3() )
#' summary(fit_1)
#' @return none
#' @export
summary.HT_fit = function(object, ...){ 
    cat("Sample types: ")
    st = sort(unique(object$sample_types))
    tt = sort(unique(object$treatments))
    cat( paste( paste(st, collapse=' '), "\n", sep=""))
    cat("Treatments: ")
    cat( paste( paste(tt, collapse=' '), "\n", sep=""))
    cat("Unique Conditions: ")
    cat( paste( paste(object$unique_conditions, collapse=' '), "\n", sep=""))
    
    if( object$is_fitted ){
        cat("\nFit converged.\n")
        print( round(object$fit_stats,3) )
        cat( paste("F-statistic: ",round(object$ANOVA_F_test,2), 
                     " p-value: ", signif(object$ANOVA_P_value,3),"\n", 
                   collapse=""))
    }else{
        cat("\nFit did not converge\n")
    }
}


#' Fit all observations and return summary information from fit_stats data frame
#' 
#' @section Non-linear function for curve fitting:
#' 
#' Curve fitting is performed by the \code{drm()} function in the \code{drc} 
#' library. To fit the curve, you need to select a non-linear function. To 
#' estimate the slope, upper asymptote, lower asymptote, and EC50, pass 
#' drc::LL.4(). To fix the lower asymptote at 1 and estimate the other 
#' parameters, pass drc::LL.3(). To fix the upper asympotote at 1 and the lower 
#' asymptote at 0, pass drc::LL.2. For a list of available functions, see 
#' \code{drc::getMeanFunctions()}. 
#' 
#' @param D dataset
#' @param fct Non-linear function to fit, e.g. drc::LL.3(). See summary.
#' @return data frame with fit summary data (is_fitted, hour, treatment, 
#' sample_type, fit_stats, ANOVA_F_test, ANOVA_P_value)
#' @examples 
#' sample_types = rep( c(rep("line1",3), rep("line2",3)), 5)
#' treatments = c(rep("DMSO",6), rep("drug",24))
#' concentrations = c( rep(0,6),rep(200,6), rep(500,6),rep(1000,6),rep(5000,6))
#' values=c(100,99,100,90,91,92,99,97,99,89,87,88,86,89,88,56,59,58,66,65,67,
#'          25,23,24,42,43,46,4,5,9)
#' hours = rep(48, length(values))
#' plate_id = "plate_1"
#' ds = create_dataset( sample_types=sample_types, treatments=treatments, 
#'                      concentrations=concentrations, hours=hours, 
#'                      values=values, plate_id=plate_id, 
#'                      negative_control = "DMSO")
#' library(drc)
#' fit_statistics( ds, fct=LL.3() )
#' @seealso \code{\link{fit_DRC}} 
#' @export
fit_statistics = function(D, fct){
    ds_type=is_dataset_valid(D)
    sample_types = get_sample_types(D)
    treatments = sort(unique(D$treatment[!D$is_negative_control]))
    res = c()
    for( tt in 1:length(treatments ) ){
        cur_treat = treatments[tt]
        hours = get_hours(D[D$treatment==cur_treat,])
        for(hh in 1:length(hours) ){
            cur_hour = hours[hh]
            # protect against situation where a given treatment-hour combination
            # is not present, e.g. a time point where sample "foo" had a missed
            # scan but sample "bar" was present. Otherwise fit_DRC throws error
            st_valid_comparison = c()
            for(ss in 1:length(sample_types)){
                if( sum( D$sample_type==sample_types[ss] &
                         D$hours==hours[hh] &
                         D$treatment==treatments[tt]) >0 ){
                    st_valid_comparison = c( st_valid_comparison, sample_types[ss])
                }
            }
            if( length(st_valid_comparison)>0 ){
                FIT=fit_DRC(D, st_valid_comparison, cur_treat, cur_hour, fct=fct)
                rn=rownames(FIT$fit_stats)
                treatment = get.split.col(rn, "_|_", last=TRUE)
                sample_type = get.split.col(rn, "_|_", first=TRUE)
                summ = cbind( is_fitted = rep(FIT$is_fitted, length(treatment)),
                          hour = rep(cur_hour, length(treatment)),
                          treatment, 
                          sample_type, 
                          FIT$fit_stats, 
                        ANOVA_F_test = rep(FIT$ANOVA_F_test, length(treatment)),
                        ANOVA_P_value=rep(FIT$ANOVA_P_value, length(treatment)),
                          stringsAsFactors=FALSE)
                if(is.null(dim(res)) ){
                    res = summ
                }else{
                    res = rbind(res, summ)
                }
            }
        }
    }
    res
}


#' Convert fit statistics data frame to matrixes for further analysis
#' 
#' This function is convenient for producing heat maps or other plots of 
#' fit summary data across a time series.
#' 
#' @param fits data frame produced by call to \code{\link{fit_statistics}}
#' @return list of lists, indexed first by sample type and then by 
#' ANOVA_P_value, AUC, and EC50. These elements are identically sized matrixes 
#' with the dimensions (treatment x hour) 
#' @examples 
#' sample_types = rep( c(rep("line1",3), rep("line2",3)), 5)
#' treatments = c(rep("DMSO",6), rep("drug",24))
#' concentrations = c( rep(0,6),rep(200,6), rep(500,6),rep(1000,6),rep(5000,6))
#' values=c(100,99,100,90,91,92,99,97,99,89,87,88,86,89,88,56,59,58,66,65,67,
#'          25,23,24,42,43,46,4,5,9)
#' hours = rep(48, length(values))
#' plate_id = "plate_1"
#' ds = create_dataset( sample_types=sample_types, treatments=treatments, 
#'                      concentrations=concentrations, hours=hours, 
#'                      values=values, plate_id=plate_id, 
#'                      negative_control = "DMSO")
#' library(drc)
#' fits = fit_statistics( ds, fct=LL.3() )
#' fit_statistics_matrixes( fits )
#' @export
fit_statistics_matrixes = function( fits ){
    expected_names = c("hour","treatment","sample_type","AUC","coef_slope",
               "coef_asymp_low","coef_asymp_high","coef_EC50","obs_min",
               "obs_max","ANOVA_F_test","ANOVA_P_value")
    if( !is.data.frame( fits ) ){
        stop("parameter fits must be a data frame")
    }
    for(i in 1:length(expected_names) ){
        if( !expected_names[i] %in% names(fits) ){
            stop(paste("fits data frame must have column",expected_names[i]))
        }
    }
    sample_types = sort(unique(fits$sample_type))
    MS <- list()
    for(i in 1:length(sample_types)){
        cur_s = sample_types[i]
        idx_cur_s = which( fits$sample_type==cur_s )
        M = list(
             ANOVA_P_value =  metric_to_grid( fits$hour[idx_cur_s], 
                                              fits$treatment[idx_cur_s],
                                              fits$ANOVA_P_value[idx_cur_s] ),
             AUC =  metric_to_grid( fits$hour[idx_cur_s], 
                                              fits$treatment[idx_cur_s],
                                              fits$AUC[idx_cur_s] ),
             EC50 =  metric_to_grid( fits$hour[idx_cur_s], 
                                    fits$treatment[idx_cur_s],
                                    fits$coef_EC50[idx_cur_s] )
        )
        MS[[ length(MS)+1 ]] = M
    }
    names(MS) = sample_types
    MS
}

#' Convert values in M1, M2 to fold-change M1 vs M2
#' @param M1 matrix for numerator of fold-change
#' @param M2 matrix for denominator of fold-change
#' @return matrix with fold-change for M1/M2. Values that are NA in either M1 
#' or M2 are NA in M
#' @examples 
#' M1 = matrix( c(1, 1, 4, 1), nrow=2, ncol=2 )
#' M2 = matrix( 1:4, nrow=2, ncol=2 )
#' convert_to_foldchange(M1, M2)
#' @export
convert_to_foldchange = function(M1, M2){
    if( !is.matrix( M1 ) | !is.matrix(M2) ){
        stop("M1 and M2 must be matrixes")   
    }
    if( dim(M1)[1] != dim(M2)[1] | dim(M1)[2] != dim(M2)[2] ){
        stop("M1 and M2 must have the same dimensions")   
    }
    M = log2( M2/M1 )
    is_na = is.na(M) | is.nan(M)
    is_lt0 = !is_na & M<0 
    M[is_lt0] = M[is_lt0]*-1
    M=2^M
    M[is_lt0] = M[is_lt0] * -1
    M
}


#' Calculate fold-change in Dose Response Curve AUC and EC50 at all time points 
#' for two samples 
#' 
#' This is a convenience function that reproduces analysis that can also be 
#' performed using the \code{\link{fit_statistics}} function. The rationale 
#' for using this method is it contains some logic for screening out results 
#' where:
#'  
#' \enumerate{
#'  \item{the P value for ANOVA testing whether observations are  
#' significantly different from each other is below a given threshold}
#'  \item{at least one sample type was observed with a normalized value 
#' below a given threshold (e.g. achieved a SF50)}
#'  \item{the log10(difference in AUC) exceeds a threshold}
#' }
#' 
#' @param D experiment dataset with columns matching the output of 
#' \code{combine_data_and_map}
#' @param sample_types vector of two sample types to compare
#' @param fct Non-linear function to fit, defaults to four variable LL.4 model 
#' that estimates slope, upper asymptote, lower asymptote, and EC50. To fix 
#' lower asymptote at 1, pass drc::LL.3(). To fix lower asympotote ant 1 and 
#' upper asymptote at 0, pass drc::LL.2(). For a list of available functions, 
#' see drc::getMeanFunctions(). To pass a function you must load the drc 
#' package in your R session.
#' @param max_Pval mark comparisons where fit P_value exceeds this as NA
#' @param min_obs mark comparisons where neither sample type has an observation 
#' below this value to be NA
#' @param min_log10_EC50_diff mark comparisons where the log10( difference in 
#' AUC ) at that timepoint is less than this value to be NA
#' @return list of matrixes for fold-change in AUC, fold-change in EC50, 
#' P value for ANOVA, sample AUC, and sample EC50. Matrix X-axis is hours and 
#' Y-axis is treatments.
#' @examples 
#' # set up two timepoints; this function is more useful when there are many 
#' # drugs and multiple timepoints
#' sample_types = rep( c(rep("line1",3), rep("line2",3)), 5)
#' treatments = c(rep("DMSO",6), rep("drug",24))
#' concentrations = c( rep(0,6),rep(200,6), rep(500,6),rep(1000,6),rep(5000,6))
#' values=c(100,99,100,90,91,92,99,97,99,77,76,79,92,93,91,66,65,67,77,76,74,
#'          36,35,35,57,56,55,22,25,24,100,99,100,90,91,92,99,97,99,89,87,88,
#'          86,89,88,56,59,58,66,65,67,25,23,24,42,43,46,4,5,9)
#' hours = c( rep(24, length(values)/2), rep(48, length(values)/2))
#' treatments = rep(treatments, 2)
#' concentrations = rep(concentrations, 2)
#' sample_types = rep(sample_types, 2)
#' plate_id = "plate_1"
#' ds = create_dataset( sample_types=sample_types, treatments=treatments, 
#'                      concentrations=concentrations, hours=hours, 
#'                      values=values, plate_id=plate_id, 
#'                      negative_control = "DMSO")
#' # calculate grid
#' timecourse_relative_grid(ds, c("line1", "line2"), drc::LL.3() )
#' @export
timecourse_relative_grid = function( D, sample_types, fct, 
                             max_Pval=1, 
                             min_obs=Inf,
                             min_log10_EC50_diff=0 ){
    
    ds_type=is_dataset_valid(D)
    
    if(length(sample_types)!=2){
        stop( "Must pass exactly two sample_types" )
    }
    if( length(D$sample_type==sample_types[1])==0 ){
        stop(paste("sample_type[1]",sample_types[1],"not found in D"))
    }
    if( length(D$sample_type==sample_types[2])==0 ){
        stop(paste("sample_type[1]",sample_types[2],"not found in D"))
    }
    fits = fit_statistics(D[D$sample_type %in% sample_types,], fct = fct )
    odd = which( fits$sample_type==sample_types[1] )
    even = which( fits$sample_type==sample_types[2] )
    lbl_hours = fits$hour[odd]
    lbl_treat = fits$treatment[odd]
    fits_P = metric_to_grid( lbl_hours, lbl_treat, fits$ANOVA_P_value[odd] )
    
    
    fits_minobs_o = metric_to_grid( lbl_hours, lbl_treat, fits$obs_min[odd] )
    fits_minobs_e = metric_to_grid( lbl_hours, lbl_treat, fits$obs_min[even] )
    fits_EC50_o = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[odd] )
    fits_EC50_e = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[even] )
    
    HAS_OBS_LT50 = (!is.na( fits_minobs_o ) & fits_minobs_o < min_obs) | 
                   (!is.na( fits_minobs_e ) & fits_minobs_e < min_obs) 
    
    HAS_EC50_o = ( !is.na( fits_EC50_o ) & !is.infinite( fits_EC50_o )  )
    HAS_EC50_e = ( !is.na( fits_EC50_e ) & !is.infinite( fits_EC50_e )  )
    HAS_EC50 = HAS_EC50_e | HAS_EC50_o
    HAS_EC50_DIFF = ( is.infinite( fits_EC50_o ) & HAS_EC50_e ) | 
        ( is.infinite( fits_EC50_e ) & HAS_EC50_o ) |
        ( HAS_EC50_o & HAS_EC50_e & abs(log10(fits_EC50_o)-log10(fits_EC50_e)) 
          > min_log10_EC50_diff )
    HAS_PVAL = !is.na( fits_P ) & fits_P <= max_Pval
    
    fits_AUC_even = metric_to_grid( lbl_hours, lbl_treat, fits$AUC[even] )
    fits_AUC_odd = metric_to_grid( lbl_hours, lbl_treat, fits$AUC[odd] )
    fits_AUC_even[  !HAS_PVAL | !HAS_OBS_LT50 | !HAS_EC50_DIFF ] = NA
    fits_AUC_odd[  !HAS_PVAL | !HAS_OBS_LT50 | !HAS_EC50_DIFF ] = NA
    fits_AUC_ratio=convert_to_foldchange(fits_AUC_even, fits_AUC_odd)
    
    fits_EC50_odd = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[odd] )
    fits_EC50_even = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[even] )
    fits_EC50_even[  !HAS_PVAL | !HAS_OBS_LT50 | !HAS_EC50_DIFF ] = NA
    fits_EC50_odd[  !HAS_PVAL | !HAS_OBS_LT50 | !HAS_EC50_DIFF ] = NA
    fits_EC50_ratio=convert_to_foldchange( fits_EC50_even, fits_EC50_odd )
    list( AUC_foldchange=fits_AUC_ratio, 
          EC50_foldchange=fits_EC50_ratio,
          P_value = fits_P,
          AUC_1 = metric_to_grid( lbl_hours, lbl_treat, fits$AUC[odd]),
          AUC_2 = metric_to_grid( lbl_hours, lbl_treat, fits$AUC[even]),
          EC50_1 = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[odd]),
          EC50_2 = metric_to_grid( lbl_hours, lbl_treat, fits$coef_EC50[even]))
}


#' Calculate AUC for sample/treatment at a concentration across multiple 
#' timepoints
#'
#' This function calculates the AUC at a single concentration 
#' across all timepoints. This is very distinct from a typical dose response 
#' curve, which can be used to calculate the AUC at a single timepoint across 
#' multiple concentrations. For that purpose, use \code{\link{fit_DRC}}. 
#' 
#' AUC for a single sample type/treatment condition across the 
#' whole timecourse is calculated using raw data normalized by the vehicle 
#' control specified by the user when the dataset was created. AUC is 
#' calculated by connecting the normalized data points and then using the 
#' trapezoids method for AUC implemented in pracma::trapz.
#'
#' For each treatment, in each concentration,
#' \itemize{
#'  \item{ AUC_v= AUC( raw vehicle matched plate_id )} 
#'  \item{ AUC_sample1 = AUC( raw sample1 after treatment ) / AUC_v }
#'  \item{ AUC_sample2 = AUC( raw sample2 after treatment ) / AUC_v }   
#'  \item{ AUC ratio = AUC_sample1 / AUC_sample2}
#' }
#' Return value is a list. The matrix in the list contains one column for each 
#' AUC value followed by one column for each AUC ratios of sample_types[1] / 
#' sample_types[2]. The concentrations vector in the list indicates the 
#' treatment concentration in each column. The sample_types vector in the list 
#' indicates the sample type if the column is a single sample AUC, or the word 
#' "ratio" if the value in the column is an AUC ratio.
#' 
#' @param D experiment dataset with columns matching the output of 
#' \code{read_incucyte_exported_to_excel}
#' @param treatments which treatments to calculate
#' @param sample_types samples on which to calculate 
#' @param concentrations which concentrations to draw
#' @param summary_method mean or median
#' @return list( AUC matrix, concentrations, sample_types )
#' @examples 
#' # Create a dataset with two drugs normalized against DMSO, tested at a single
#' # concentration. Timepoints are 0, 12, 24, 36, 48 hours. drug1 affected 
#' # line1 cells at this concentration, while drug2 affected line2 cells.
#' ds = create_dataset( 
#'      sample_types= rep( c("line1", "line2"), 15),
#'      treatments = c( rep("DMSO", 10), rep("drug1", 10), rep("drug2", 10)),
#'      concentrations = c( rep(0, 10), rep(200, 20) ),
#'      hours = rep( c(0,0,12,12,24,24,36,36,48,48), 3),
#'      values = c(10,10,15,15,30,25,60,50,80,65,
#'                 10,10,12,15,22,28,26,48,30,60,
#'                 10,10,15,12,30,11,60,13,80,9), 
#'      plate_id = "plate_1",
#'      negative_control = "DMSO")
#' tc=timecourse_AUC_ratio(ds, 
#'                       treatments=c("drug1", "drug2"), 
#'                       sample_types=c("line1", "line2"), 
#'                       concentrations=c(0, 200), 
#'                       summary_method="mean")
#' # create a data frame from individual list components
#' data.frame( sample_type = tc$sample_types,
#'             concentration = tc$concentrations,
#'             t(tc$AUC), 
#'             row.names=1:length(tc$sample_types),
#'             stringsAsFactors=FALSE)
#' @export
timecourse_AUC_ratio = function( D, treatments, sample_types, concentrations,
                           summary_method){
    
    ds_type = is_dataset_valid(D)
    
    if( summary_method != "mean" & summary_method != "median"){
        stop("summary_method parameter must be either mean or median")
    }
    if( length(sample_types) != 2 ){
        stop("Must pass exactly two sample types in parameter sample_types")
    }
    if( sum(D$sample_type==sample_types[1], na.rm=TRUE) == 0  ){
        stop(paste("sample_types[1], '",sample_types[1], 
                   "', not found in D",sep="" ) )
    }
    if( sum(D$sample_type==sample_types[2], na.rm=TRUE) == 0  ){
        stop(paste("sample_types[2], '",sample_types[2], 
                   "', not found in D",sep="" ) )
    }
    for(i in 1:length(treatments)){
        if(sum(D$sample_type==sample_types[1] & D$treatment==treatments[i], 
               na.rm=TRUE)==0){
            stop(paste("treatment",treatments[i],"not found for sample type",
                       sample_types[1]))
        }
        if(sum(D$sample_type==sample_types[2] & D$treatment==treatments[i], 
               na.rm=TRUE)==0){
            stop(paste("treatment",treatments[i],"not found for sample type",
                       sample_types[2]))
        }
    }
    for(i in 1:length(concentrations)){
        if(sum(D$sample_type==sample_types[1] & 
               D$concentration==concentrations[i], na.rm=TRUE)==0){
            stop(paste("concentration",concentrations[i],
                       "not found for sample type", sample_types[1]))
        }
        if(sum(D$sample_type==sample_types[2] & 
               D$concentration==concentrations[i], na.rm=TRUE)==0){
            stop(paste("concentration",concentrations[i],
                       "not found for sample type",sample_types[2]))
        }
    }
    Mauc = matrix(0, nrow=length(treatments), 
                  ncol=length(concentrations)*length(sample_types))
    col_concentrations = rep(0, dim(Mauc)[2] +  length(concentrations) )
    col_sample_types = rep(0, dim(Mauc)[2] + length(concentrations) )
    col_identities = rep(0, dim(Mauc)[2] + length(concentrations) )
    for(idx_t in 1:length(treatments) ){
        cur_t = treatments[idx_t]
        plate_id = unique(D$plate_id[D$treatment==cur_t])
        # AUC for vector in each line at the appropriate plate
        AUC_v = c()
        for( idx_s in 1:length(sample_types)){
            cur_s = sample_types[idx_s]
            neg_ctl = unique( D$negative_control[ 
               D$plate==plate_id & D$sample_type==cur_s & D$treatment == cur_t])
            neg_ctl_conc = unique(D$concentration[ D$plate==plate_id &
                                            D$sample_type==cur_s &
                                            D$treatment == neg_ctl &
                                            D$is_negative_control ])
            AUC_v = c(AUC_v,
                      AUC_across_timepoints(D, cur_s, neg_ctl, 
                                            neg_ctl_conc, summary_method) )
        }
        ctr=1
        for(idx_c in 1:length(concentrations)){
            for(idx_s in 1:length(sample_types)){
                cur_s = sample_types[idx_s]
                cur_c = concentrations[idx_c]
                col_concentrations[ctr] = cur_c
                col_identities[ctr] = paste("AUC_", cur_s, collapse="", sep="")
                col_sample_types[ctr] = cur_s
                if( sum( D$sample_type==cur_s & D$concentration==cur_c & 
                         D$treatment==cur_t, na.rm=TRUE)==0 ){
                    ratio=NA
                }
                else{
                    ratio = AUC_across_timepoints( D, cur_s, cur_t, cur_c,
                                               summary_method) / AUC_v[idx_s]
                }
                Mauc[idx_t, ctr] = ratio
                ctr = ctr+1
            }
        }
    }
    Mauc_ratio = matrix(0, nrow=length(treatments), 
                        ncol = length(concentrations) )
    for( idx_c in 1:length(concentrations)){
        cur_c = concentrations[idx_c]
        idx_conc_s1 = which(col_concentrations==cur_c &
                            col_sample_types == sample_types[1])
        idx_conc_s2 = which(col_concentrations==cur_c &
                            col_sample_types == sample_types[2])
        Mauc_ratio[,idx_c] = Mauc[,idx_conc_s1] / Mauc[,idx_conc_s2 ]
        col_concentrations[ctr] = cur_c
        col_sample_types[ctr] = "ratio"
        col_identities[ctr] = paste("AUCratio_", sample_types[1],":",
                                    sample_types[2], collapse="", sep="")
        ctr = ctr + 1
    }
    Mauc = cbind(Mauc, Mauc_ratio)
    dimnames(Mauc)[[2]] = col_identities
    dimnames(Mauc)[[1]] = treatments
    
    list( AUC = Mauc, 
          concentrations=col_concentrations,
          sample_types=col_sample_types)
}
DavidQuigley/HTDoseResponseCurve documentation built on Jan. 23, 2021, 5:10 a.m.