R/parsers.R

Defines functions writeData readbmg readBMG2Plate readBMGPlate readSynergyPlate readBioLectorProPlate readBioLectorPlate readSimplePlate readPlateData amountColors parseAmounts replaceAmounts readPlateMap readPlateMap.old mergePlates readPlates readExperiment factorize tosquare

Documented in amountColors parseAmounts readBMG2Plate readBMGPlate readExperiment readPlateData readPlateMap readPlateMap.old readSimplePlate readSynergyPlate replaceAmounts writeData

### PLATE DESIGN & DATA

### UTILS

## get the square-most matrix for a given number
tosquare <- function(x) {
    fct <-factorize(x)
    mid <- length(fct)/2
    if ( length(fct)%%2==0 )
        dim <- c(nrow=fct[mid], ncol=fct[mid+1])
    else
        dim <- c(nrow=fct[mid-.5],ncol=fct[mid+.5])
    dim
}

## https://stackoverflow.com/questions/6424856/r-function-for-returning-all-factors
factorize <- function(x) {
    x <- as.integer(x)
    div <- seq_len(abs(x))
    factors <- div[x %% div == 0L]
    return(factors)
}

### HIGH-LEVEL WRAPPER
#' High-level interface of \code{\link{platexpress}}
#'
#' This high-level wrapper allows to load a platereader experiment
#' and its layout file, do blank corrections, and replicate groupings.
#'
#' Calls, in this order the  \code{\link{platexpress}} functions
#'  \code{\link{readPlateData}},  \code{\link{readPlateMap}};
#'  optionally: \code{\link{skipWells}}, and \code{\link{correctBlanks}},
#' and \code{\link{getGroups}}. Please see the help files of these
#' functions for details on parameters.
#' @inheritParams readPlateData
#' @inheritParams readPlateMap
#' @inheritParams skipWells
#' @inheritParams skipWells
#' @inheritParams correctBlanks
#' @param dsep \code{sep} parameter for data file, see
#' \code{\link{readPlateData}}
#' @param layout the text file containing the plate layout information, see
#' argument \code{file} in \code{\link{readPlateMap}}
#' @param skip.wells skip these wells from all analyses, see argument
#' \code{skip} in \code{\link{skipWells}}
#' @param blank call \code{\link{correctBlanks}} to blank data
#' @param blank.data subset of data to blank, see argument \code{yids} in
#' function \code{\link{correctBlanks}}
#' @param base optional minimal value; all values will be raised by
#' the same amount using the function \code{\link{adjustBase}} via
#' function \code{\link{correctBlanks}} 
#' @param group1 plate layout column to be used for coarse grouping, via
#' function \code{\link{getGroups}}, eg. "strains"
#' @param group2 plate layout column to be used for fine grouping, via
#' function \code{\link{getGroups}}, eg. "inducer amount")
#' @param group2.color numeric plate layout column to be used for group2
#' coloring, via function \code{\link{groupColors}}, eg. "amount"
#' @param ... further parameters to plate-reader specific parsing functions
#' @export
readExperiment <- function(files, type, dsep,
                           time.range=c("full"), time.conversion=1/3600,
                           layout, sep=";",fsep=";",asep=":",blank.id="blank",
                           fields, afields,
                           skip.wells, blank=FALSE, blank.data, base=0,
                           group1, group2, group2.color, ...) {


    cat(paste0("PARSING DATA FILE(s): ", paste(files,collapse=";"), "\n",
               "type\t", type, "\n",
               "range\t", time.range, "\n",
               "time\t", time.conversion, "\n"))
    
    ## parse raw data
    if ( missing(dsep) ) 
        raw <- readPlateData(files, type=type,
                             time.range=time.range,
                             time.conversion=time.conversion,
                             verb=TRUE, ...)
    else
        raw <- readPlateData(files, type=type, sep=dsep,
                             time.range=time.range,
                             time.conversion=time.conversion,
                             verb=TRUE, ...)
###return(raw)
    
    cat(paste("PARSING LAYOUT:", layout, "\n"))

    ## read layout file
    map <- readPlateMap(layout, 
                        fields=fields, blank.id=blank.id,
                        sep=sep, fsep=fsep, asep=asep,
                        afields=afields)

    cat(paste("PRE-PROCESSING DATA\n"))
    ## skip wells
    ## TODO: implement in skipWells and do after map is attached
    if ( !missing(skip.wells) ) {
        map <- skipWells(map, skip.wells)
        raw <- skipWells(raw, skip.wells)
    }
    
    
    ## correct blanks
    if ( blank | !missing(blank.data) ) {
        if ( missing(blank.data) )
            blank.data <- raw$dataIDs
        dat <- correctBlanks(raw, map, yids = blank.data, base = base)
    } else dat <- raw
    rm(raw)
    
    cat(paste("GROUPING DATA\n"))

    ## get groupings
    dat$groups <- NULL
    if ( !missing(group1) )
        dat$groups$group1 <- getGroups(map, by = group1, verb=FALSE)
    if ( !missing(group2) ) {
        dat$groups$group2 <- getGroups(map, by = group2, verb=FALSE)
        if ( !missing(group2.color) )
            dat$groups$group2.color <- groupColors(map, dat$groups$group2,
                                                   color = group2.color)
    }
    
    ## attach all layout
    dat$layout <- map
    
    dat
    
}


### TODO FUNCTIONS

## TODO: wrapper to read both layout and data
readPlates <- function(layout, data.path, type, settings) {
    ## read plate layout, potentially multiple plates in one file
    ## automatically read in each plate data
    ## merge data from multiple plates?
}
## TODO: merge several plates and layouts
## just generate lists of plate objects, and make viewGroups/viewPlate
## etc. take such lists as defaults, and convert singles to lists themselves
mergePlates <- function(data=list(), layouts=list()) {
    #just generates lists of plates, and one big layout table
}

### LAYOUT PARSERS

## Read Plate Layout Map
## parses a plate design file in CSV. Rows and columns
## should be named as in the datafile. Each field can
## have multiple descriptors, separated by a specified separator (e.g.
## "newline"); blanks are specified by a keyword (default: "blank"),
## and if separate blanks are used for different conditions, the
## blank field must have the same format as measurement fields, except
## one parameter replaced by the keyword. Names for the values
## in the fields can be passed via argument "fields" as a vector of
## strings.
## TODO: repair this in example:
#' Read Plate Layout Map
#'
#' Parses a plate design file in CSV format. Rows and
#' columns should be named as in the corresponding plate reader data files.
#' @param file text file containing the plate layout.
#' @param sep column separator, as in \code{\link[utils:read.table]{read.table}}
#' @param fsep within-field separator, separating the well-specific descriptors within
#' well fields
#' @param blank.id keyword that indicates blank wells. Blank wells can be
#'                 combined with other well descriptors for separate blanking
#' @param fields names for the field descriptors
#' @param asep a separator for substance:amount pairs, eg. to indicate amount
#' of an inducer or a nutrient, can only be used together with
#' argument \code{afields}, eg. use \code{asep=":", afields="inducer"}
#' @param afields field names which hold substance:amount pair information,
#' see argument \code{asep}
#' @param formatted indicates whether the file is already in the required
#'                  format; all other paramaters but 'sep' will be ignored
#' @param nrows number of rows to expect, defaults to 8 for rows A to H in
#' a typical 96 well plate; TODO: get rid of this and instead check for
#' rownames?
#' @param header logical argument indicating the presence/absence of a header
#' row in the layout file
#' @return a table of well content descriptors, where the first column 'wells'
#'         maps the plate map to the data files.
#' @seealso \code{\link{readPlateData}}
#' @examples
#' plate.file <- system.file("extdata", "AP12_layout.csv", package = "platexpress")
#' plate <- readPlateMap(file=plate.file, blank.id="blank",fsep="\n", fields=c("strain","samples"))
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @export
readPlateMap.old <- function(file, sep="\t",
                             fsep="\n", fields, asep, afields,
                             blank.id="blank",
                             nrows=8, formatted=FALSE, header=TRUE) {


    ## already in well format?
    if ( formatted ) {
        if ( length(grep("\\.xlsx?$", file)) ) {
            tmp <- data.frame(readxl::read_excel(file, col_names=header),
                              stringsAsFactors=FALSE)
        } else {
            dat <- read.table(file, sep=sep, header=header,
                              stringsAsFactors=FALSE)
        }
        ## split inducer:amount columns
        if ( !missing(asep) & !missing(afields) ) {
            dat <- replaceAmounts(dat, col=afields, sep=asep, replace=TRUE)
            dat <- amountColors(dat)
        }
        return(dat)
    }

    ## plate map by columns and rows
    ## TODO: get rid of nrows and check fields instead?
    if ( length(grep("\\.xlsx?$", file)) ) {
        tmp <- data.frame(readxl::read_excel(file, col_names=header),
                          stringsAsFactors=FALSE)
        rownames(tmp) <- tmp[,1]
        dat <- tmp[1:nrows,-1]
    } else {
        dat <- read.table(file, sep=sep,
                          header=header, row.names=1, nrows=nrows,
                          stringsAsFactors=FALSE)
    }
    if ( !header )
      colnames(dat) <- 1:ncol(dat)

    ## generate well names from row and column names
    plate <- paste(rep(rownames(dat),ncol(dat)),
                   rep(sub("X","",colnames(dat)),each=nrow(dat)),sep="")

    ## parse field values
    vals <- strsplit(unlist(dat),fsep)
    nvals <- max(unlist(lapply(vals,length)))

    values <- matrix("",nrow=length(vals),ncol=nvals)
    for ( i in 1:nvals )
      values[,i] <- as.character(unlist(lapply(vals, function(x) trimws(x[i]))))

    if ( missing(fields) )
      colnames(values) <- paste("X",1:nvals,sep="")
    else colnames(values) <- fields

    plate <- cbind(well=plate,values)

    blanks <- apply(plate,1, function(x) any(x==blank.id))
    plate[which(plate==blank.id)] <- NA

    ## rm blank column
    plate <- plate[,colnames(plate)!=blank.id]
    ## and add new blank
    plate <- cbind(data.frame(plate),blank=blanks)

    ## split inducer:amount columns
    if ( !missing(asep) & !missing(afields) ) {
        plate <- replaceAmounts(plate, col=afields, sep=asep, replace=TRUE)
        plate <- amountColors(plate)
    }

    ## use wells as rownames!
    rownames(plate) <- plate$well

    #class(plate) <- "platemap"
    return(plate)
}

## version 2 of readplatemap
## 1) parse xls/odt files
## 2) use tidyr/dplyr to parse fields - DONE
## Read Plate Layout Map
## parses a plate design file in CSV. Rows and columns
## should be named as in the datafile. Each field can
## have multiple descriptors, separated by a specified separator (e.g.
## "newline"); blanks are specified by a keyword (default: "blank"),
## and if separate blanks are used for different conditions, the
## blank field must have the same format as measurement fields, except
## one parameter replaced by the keyword. Names for the values
## in the fields can be passed via argument "fields" as a vector of
## strings.
## TODO: repair this in example:
#' Read Plate Layout Map
#'
#' Parses a plate design file in CSV format. Rows and
#' columns should be named as in the corresponding plate reader data files.
#' TODO: causes empty groups in getGroups; because separate causes empty
#' strings in first field instead of NA
#' @param file text file containing the plate layout.
#' @param sep column separator, as in \code{\link[utils:read.table]{read.table}}
#' @param fsep within-field separator, separating the well-specific descriptors within
#' well fields
#' @param blank.id keyword that indicates blank wells. Blank wells can be
#'                 combined with other well descriptors for separate blanking
#' @param fields names for the field descriptors
#' @param asep a separator for substance:amount pairs, eg. to indicate amount
#' of an inducer or a nutrient, can only be used together with
#' argument \code{afields}, eg. use \code{asep=":", afields="inducer"}
#' @param afields field names which hold substance:amount pair information,
#' see argument \code{asep}
#' @param formatted indicates whether the file is already in the required
#'                  format; all other paramaters but 'sep' will be ignored
#' @param nrows number of rows to expect, defaults to 8 for rows A to H in
#' a typical 96 well plate; TODO: get rid of this and instead check for
#' rownames?
#' @param header logical argument indicating the presence/absence of a header
#' row in the layout file
#' @return a table of well content descriptors, where the first column 'wells'
#'         maps the plate map to the data files.
#' @seealso \code{\link{readPlateData}}
#' @examples
#' plate.file <- system.file("extdata", "AP12_layout.csv", package = "platexpress")
#' plate <- readPlateMap(file=plate.file, blank.id="blank",fsep="\n", fields=c("strain","samples"))
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @export
readPlateMap <- function(file, sep="\t",
                         fsep="\n", fields, asep, afields,
                         blank.id="blank",
                         nrows=8, formatted=FALSE, header=TRUE) {

    ## already in well format?
    if ( formatted ) {
        if ( length(grep("\\.xlsx?$", file)) ) {
            tmp <- data.frame(readxl::read_excel(file, col_names=header),
                              stringsAsFactors=FALSE)
        } else {
            dat <- read.table(file, sep=sep, header=header,
                              stringsAsFactors=FALSE)
        }
        ## split inducer:amount columns
        if ( !missing(asep) & !missing(afields) ) {
          #if ( length(afields)>1 )
          #  warning("multiple substance:amount pairs not fully implemented;",
          #          "please add colors manually")
          for ( i in 1:length(afields) ) {
            afield <- afields[i]
            amnt.name <- ifelse(i>1,paste0(afield,".amount"),"amount")
            vdat <- tidyr::separate(vdat, col = afield,
                                    into = c(afield, amnt.name),
                                    sep = asep)
            vdat[[amnt.name]] <- as.numeric(vdat[[amnt.name]])
            vdat <- amountColors(vdat,
                                 substance=afield,
                                 amount=amnt.name,
                                 color=sub("amount","color",amnt.name))
          }
        }
    }

    ## plate map by columns and rows
    ## TODO: get rid of nrows and check fields instead?
    if ( length(grep("\\.xlsx?$", file)) ) {
        tmp <- data.frame(readxl::read_excel(file, col_names=header),
                          stringsAsFactors=FALSE)
        rownames(tmp) <- tmp[,1]
        dat <- tmp[1:nrows,-1]
    } else {
        dat <- read.table(file, sep=sep, header=header,
                          row.names=1, nrows=nrows,
                          stringsAsFactors=FALSE)
    }
    if ( !header )
        colnames(dat) <- 1:ncol(dat)

    ## generate well names from row and column names
    wells <- paste(rep(rownames(dat),ncol(dat)),
                   rep(sub("X","",colnames(dat)),each=nrow(dat)),sep="")

    ## convert to 1D
    vdat <- data.frame(well=wells,field=unlist(dat))
    ## get blanks, to add later
    blank.idx <- grep(blank.id, vdat$field)
    ## split fields - using tidyr separate
    vdat <- tidyr::separate(vdat, col = "field", into = fields, sep = fsep)
    if ( !missing(asep) & !missing(afields) ) {
      #if ( length(afields)>1 )
      #  warning("multiple substance:amount pairs not fully implemented;",
      #          "please add colors manually")
      for ( i in 1:length(afields) ) {
        afield <- afields[i]
        amnt.name <- ifelse(i>1,paste0(afield,".amount"),"amount")
        vdat <- tidyr::separate(vdat, col = afield,
                                into = c(afield, amnt.name),
                                sep = asep)
        vdat[[amnt.name]] <- as.numeric(vdat[[amnt.name]])
        vdat <- amountColors(vdat,
                             substance=afield,
                             amount=amnt.name,
                             color=sub("amount","color",amnt.name))
        ## TODO: split different substances in one afield
        ## into different columns!
        ## and replace column name
      }
    }
    ## replace all empty strings by NA
    vdat[vdat==""] <- NA
    ## add blanks
    vdat$blank <- FALSE
    vdat$blank[blank.idx] <- TRUE

    ## don't - use well column throughout!
    ##rownames(vdat) <- wells
    #class(vdat) <- "platemap"
    vdat
}

#' Numeric Amounts
#'
#' replaces amount strings of the form \code{substance:amount} in a column of
#' the plate layout map by two columns, separating substance and the amount
#' into a string and a numeric column.
#' @param map the plate map, see  \code{\link{readPlateMap}}
#' @param col the column ID or index to be split
#' @param sep the separator of the substance:amount pair strings
#' @param replace if TRUE the original columns in \code{col} will be removed
#' @seealso \code{\link{parseAmounts}}, \code{\link{readPlateMap}}
#' @export
replaceAmounts <- function(map, col, sep=":", replace=TRUE) {
    for ( cl in col )
        map <- cbind.data.frame(map, parseAmounts(map[,cl], sep=sep))
    if ( replace ) {
        if ( typeof(col)=="character" )
            col <- match(col,colnames(map))
        map <- map[,-col]
    }
    map
}

#' Parse Amount Strings
#'
#' splits a vector of strings of substance:amount pairs into
#' a matrix with 2 columns, of substances and amounts
#' @param str a vector of strings, providing substance:amount pairs
#' @param sep the separator of the substance:amount pair strings
#' @seealso \code{\link{replaceAmounts}}
#' @export
parseAmounts <- function(str, sep=":") {
    amount <- as.numeric(sub(paste0(".*",sep),"",str))
    inducer <- sub(paste0(sep,".*"),"",str)
    cbind.data.frame(substance=inducer,amount=amount)
}

#' Add Amount Colors
#'
#' add a color palette to the plate layout map, with colors along the range
#' of added amounts of a given substance. Substance and amount columns are eg.
#' auto-generated by \code{\link{readPlateMap}} with options \code{asep}
#' and \code{afields}.
#' @param map the plate map, see  \code{\link{readPlateMap}}
#' @param substance name of the substance column in \code{map}
#' @param amount name of the amount column in \code{map}
#' @param color name of the color column to be added to \code{map}
#' @param colf color palette function, default is colorRamp's
#' \code{\link[colorRamps:matlab.like2]{matlab.like2}}
#' @seealso \code{\link{readPlateMap}}, \code{\link{replaceAmounts}}
#' @export
amountColors <- function(map, substance="substance",amount="amount", color="color", colf=colorRamps::matlab.like2) {
    substances <- unique(map[,substance])
    colors <- rep(NA, nrow(map))
    for ( subst in substances ) {
        idx <- which(map[,substance]==subst)
        amnt <- as.numeric(map[idx,amount])
        amnt <- round(100*amnt/max(amnt,na.rm=TRUE))+1
        colors[idx] <- colf(101)[amnt]
    }
    if ( color %in% colnames(map) )
      map[,color] <- as.character(colors)
    else {
        map <- cbind.data.frame(map,
                                as.character(colors),
                                stringsAsFactors=FALSE)
        colnames(map)[ncol(map)] <- color
    }
    map
}

### PLATE DATA PARSERS

## TODO: repair this in example
#' Read Plate Data
#'
#' Parses platereader data files in CSV format, as exported by
#' the plate reader software. Header IDs in the data file should match with
#' IDs in the plate map, see \code{\link{readPlateMap}}. Pre-defined parsing
#' functions exist for a couple of plate-readers. If your format is
#' not implement, you can manually create simple data tables and use
#' the function \code{\link{readSimplePlate}}
#' @param files list of one or more data files
#' @param type pre-defined formats, as exported from platereaders; currently
#' for BMG Optima and Mars v3.01 R2, ('BMG'), BMG Clariostar and Mars
#' vXXX ('BMG2') and Biotek Synergy Mx ('Synergy'), BioLector ('BioLector'), BioLector Pro ('BioLectorPro');  TODO: define export
#' protocols!
#' @param data.ids an optional sub-selection of data types in the input file,
#' as a list of strings
#' @param interpolate if true a master time, the average time between distinct
#' measurements of one timepoint, is calculated and all values are interpolated
#' to this mastertime. This is currently obligatory for further processing.
#' See function \code{\link{interpolatePlateTimes}} for details.
#' @param time.range "common" requires that all data must have
#' the same number of time-points, "full" interpolates
#' the maximal available time range, using the mean
#' time step of the mean of data-specific time vectors
#' @param time.conversion conversion factor for the plate time, e.g., 1/3600
#' to convert from hours to seconds
#' @param verb print messages if true
#' @param ... further parameters to plate-reader specific parsing functions
#' @note The original data is all interpolated to a common/average 'master' time
#' @return a list of distinct measurement time-courses from one plate
#' @seealso \code{\link{readSimplePlate}}, \code{\link{readBMGPlate}}, \code{\link{readSynergyPlate}}, \code{\link{readPlateMap}}, \code{\link{viewPlate}}
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @examples
#' data.file <- system.file("extdata", "AP12.csv", package = "platexpress")
#' raw <- readPlateData(files=data.file, type="Synergy",
#'                      data.ids=c("600","YFP_50:500,535"),
#'                      dec=",", time.format="%H:%M:%S")
#' @export
readPlateData <- function(files, type, data.ids, 
                          interpolate=TRUE, time.range=c("common","full"),
                          time.conversion, verb=TRUE, ...) {

    if ( type=="BMG" )
      data <- readBMGPlate(files=files, data.ids=data.ids,
                           verb=verb, ...)
    else if ( type=="BMG2" )
      data <- readBMG2Plate(files=files, data.ids=data.ids,
                            verb=verb, ...)
    else if ( type=="Synergy" )
      data <- readSynergyPlate(files=files, data.ids=data.ids,
                               verb=verb, ...)
    else if ( type=="BioLector" ) {
      data <- readBioLectorPlate(files=files, data.ids=data.ids,
                                 verb=verb, ...)
      interpolate <- FALSE
    } else if ( type=="BioLectorPro" ) {
      data <- readBioLectorProPlate(files=files, data.ids=data.ids,
                                    verb=verb, ...)
      interpolate <- FALSE
    } else if ( type=="simple" ) {
        ## single data item in simple spreadsheet
        data <- readSimplePlate(files=files, data.ids=data.ids,
                                verb=verb, ...)
        interpolate <- FALSE
    }
###return(data)

    ## NOW PREPARE DATA
    ## SET GLOBAL TIME & TEMPERATURE by INTERPOLATION:
    ## interpolate data: this adds a master time and temperature
    ## and interpolates all data to this time; if this step
    ## is omitted, there will be no global master time!
    if ( interpolate )
      data <- interpolatePlateTimes(data, time.range=time.range, verb=verb)

    if ( !missing(time.conversion) )
        data$Time <- data$Time * time.conversion

    if ( verb )
        cat(paste0("Successfully parsed data:\n",
                  "\ttype: ", type, "\n",
                  ifelse(interpolate,
                         paste0("\ttime.range rule:", time.range, "\n"),""),
                  "\n"))

    class(data) <- "platedata"
    data
}

#' Read Simple Plate Data Tables
#'
#' Read in simple platereader data with only one data type, time
#' in the first column and data for all wells in the following columns,
#' where the column header must correspond to wells
#' in \code{\link{readPlateMap}}.
#' @param nrow organize data into a matrix with this number of rows
#' @param time.format format of the time, see \code{\link[base:strptime]{strptime}}); only used if time column is not numeric!
#' @param ... arguments passed to \code{\link{read.csv}}
#' @inheritParams readPlateData
readSimplePlate <- function(files, data.ids, 
                            nrow, time.format="%M:%S", verb=TRUE, ...) {

    if ( verb )
      cat(paste("Parsing file:", files, "\n"))

    ## defaults
    if ( missing(data.ids) )
      data.ids <- "data"

    if ( verb  )
      cat(paste0("\tloading data: \"", data.ids, "\"\n"))

    ## parse file and get data
    dat <- read.csv(files, ...)

    ## split into time (first column) and well data
    time <- dat[,1]
    ## attempt format time, if not numeric
    if (  !typeof(dat[,1])%in%c("double","integer") )
        time <- as.numeric(strptime(time,format=time.format)) # time in seconds
    time <- time - time[1]
    dat <- dat[, -1]

    ## for convenience replace leading 0, e.g. A07 -> A7
    ## in column namnes
    ## TODO: generalize somehow? 
    colnames(dat) <- gsub("([A-Z])0(.*)", "\\1\\2", colnames(dat))
    
    ## generate local data
    data <-  list()
    data[[data.ids]] <- list()
    data[[data.ids]]$data <- data.matrix(dat)

    ## set dataID
    data$dataIDs <- data.ids

    ## generate wells: factorize and take middle factors
    N <- ncol(dat)
    if ( missing(nrow) ) {
        dim <- tosquare(N)
        nrow <- dim[1]
        ncol <- dim[2]
    }
    if ( !exists("ncol", mode="numeric" ) ) ncol <- N/nrow
    cat(paste(nrow, N, "\n"))
    
    data$wells$plate <- t(matrix(colnames(data[[data$dataIDs[1]]]$data),
                                 nrow=ncol, ncol=nrow))

    ## ROWS: are used in viewPlate
    ## TODO: avoid to force letters for rows and numbers for columns?
    data$wells$rows <- toupper(letters[1:nrow])
    data$wells$cols <- as.character(1:ncol)

    ## or alternatively from well names, assuming rows in
    ## first letter, and columns in rest!?
    if ( FALSE ) {
        wells <- colnames(data[[data$dataIDs[1]]]$data)
        rows <- do.call(rbind,strsplit(wells,""))[,1]
        cols <- unlist(lapply(strsplit(wells,""),
                              function(x) paste0(x[2:length(x)])))
    }
    
    ## add global time and xids
    data$Time <- time
    data$xids <- "Time"
    
    class(data) <- "platedata"
    data
}


## TODO: get rid of german format
## TODO: what's wrong with GFP?
## TODO: get temperature and humidity
## headerline <- 23 ## line of data header in BioLector result file
readBioLectorPlate <- function(files, data.ids,
                               headerline=23, hnrw=20, sep=";", dec=",",
                               replace.zero=TRUE,
                               verb=TRUE) {


    if ( verb )
      cat(paste("Parsing file:", files, "\n"))

    ## parse header to get experiment information
    filt <- read.csv(files, nrow=hnrw, sep = sep, dec = dec,
                     stringsAsFactors=FALSE, header=FALSE, fill=TRUE)

    nrw <- as.numeric(filt[filt[,1]=="MTP ROWS",2])
    ncl <- as.numeric(filt[filt[,1]=="MTP COLUMNS",2])
    nfl <- as.numeric(filt[filt[,1]=="FILTERSETS",2])
    
    skip <- which(filt[,1]=="FILTERSET") -2

    ## parse header again to get filter information
    filters <- read.csv(files, skip=skip, nrow=nfl, sep = sep, dec = dec,
                         stringsAsFactors=FALSE, header=TRUE)
              


    dat <- read.csv(files, skip=headerline, sep = sep, dec = dec,
                    stringsAsFactors=FALSE, fill=TRUE)

    if ( verb )
      cat(paste("Read:", nrow(dat), "rows and", ncol(dat),"columns \n"))

    fidx <- grep("READING", colnames(dat)) # column with filter info

    ## get time
    tidx <- grep("TIME", dat[,fidx])
    ## convert time to seconds, SI and in-line with other formats
    time <- unlist(dat[tidx, (fidx+1):ncol(dat)])*3600
    
    ## get calibrated rows
    didx <- grep("Cal.", dat[,fidx])
    data <- as.data.frame(t(dat[didx,fidx:ncol(dat)]), stringsAsFactors=FALSE)
    colnames(data) <- dat[didx,1]

    data <- list()
    for ( i in 1:nrow(filters) ) {
        if ( verb ) 
            cat(paste0("\tloading data: \"", trimws(filters[i,2]), "\""))
        didx <- grep(paste0("Cal..*FS=",filters[i,1]), dat[,fidx])
        ## take raw values if no calibration exists
        if ( length(didx)==0 ) {
            didx <- which(dat[,fidx]==filters[i,1])
            if ( verb) cat(" - RAW VALUES")
        }
        dt <- t(dat[didx,(fidx+1):ncol(dat)])
        colnames(dt) <- dat[didx,1]
        data[[i]] <- list()
        data[[i]]$time <- time
        data[[i]]$data <- as.matrix(dt)
        if ( replace.zero )
            colnames(data[[i]]$data) <- sub("0","", colnames(data[[i]]$data))
                                            
        if ( verb ) cat("\n")
    }
    names(data) <- trimws(filters[,2])
    
    dataIDs <- names(data)

    ## filter only requested data
    if ( !missing(data.ids) ) {
        if ( verb )
            cat(paste("skipping",
                      paste(dataIDs[!dataIDs%in%data.ids],collapse=";"), "\n"))
        data <- data[dataIDs%in%data.ids]
        dataIDs <- dataIDs[dataIDs%in%data.ids]
    }
    data$dataIDs <- dataIDs
    
    data$Time <- time
    data$xids <- "Time"

    ## TODO: use this for all data parsers, although
    ## only required for viewPlate; but could be used for
    ## auto-grouping
    ## TODO: generalize? split well strings?
    data$wells$plate <- t(matrix(colnames(data[[data$dataIDs[1]]]$data),
                                 nrow=ncl,ncol=nrw))
    data$wells$rows <- toupper(letters[1:nrw])
    if ( replace.zero ) 
        data$wells$cols <- as.character(1:ncl)
    else data$wells$cols <- sprintf("%02d", 1:ncl)
        
    class(data) <- "platedata"
    ## TODO: get temperature and humidity 
    data
}

##if ( FALSE ) {
##    DATPATH <- "~/work/hhu_talks/uebung_201812/BIQ980-2018/data/BIQ980_groupI_20181206/biolector"
##    file <- "6_BIQ980_gI_20181206_REDUCTION-1.csv"
##     DATPATH <- "~/work/hhu_talks/uebung_201812/BIQ980-2018/data/BIQ980_groupII_20181207/biolector"
##    file <- "7_BIQ980_gI_20181206_REDUCTION-1.csv"
##   files <- file.path(DATPATH, file)
##    sep=";";dec="."
##}
readBioLectorProPlate <- function(files, data.ids,
                                  sep=";", dec=".",
                                  replace.zero=TRUE,
                                  verb=TRUE) {

    if ( verb )
      cat(paste("Parsing file:", files, "\n"))

    ## first parse header to get experiment information
    ## TODO: get and use more info
    dat <- read.csv(files, sep = sep, dec = dec,
                    stringsAsFactors=FALSE, header=FALSE, fill=TRUE)

    if ( verb )
      cat(paste("Preprocessing:",nrow(dat),"rows and",ncol(dat),"columns \n"))

    ## data start, for second parsing step
    dstart <- which(dat[,1] == "=====data=====")
   
    nrw <- as.numeric(sub(".*]","",grep("mtp-rows", dat[,1], value=TRUE)))
    ncl <- as.numeric(sub(".*]","",grep("mtp-columns", dat[,1], value=TRUE)))
    nfl <- as.numeric(sub(".*]","",grep("no_filterset", dat[,1], value=TRUE)))

    ## get filters
    filters <- cbind(1:nfl, rep(NA,nfl))
    for ( i in 1:nfl ) {
        fid <- paste0("\\[",sprintf("%02d",i),"_name\\]")
        filters[i,2] <- sub(fid,"",grep(fid,dat[,1],value=T))
    }

    ## parse again without header to get columns properly
    dat <- read.csv(files, sep = sep, dec = dec, skip = dstart,
                    stringsAsFactors=FALSE, header=FALSE, fill=TRUE)
    dend <- tail(which(dat[,1] == "M"), 1)
    
    ## parse again without header to get columns properly
    dat <- read.csv(files, sep = sep, dec = dec, skip = dstart, nrow = dend,
                    stringsAsFactors=FALSE, header=FALSE, fill=TRUE)

   if ( verb )
      cat(paste("Read:", nrow(dat), "rows and", ncol(dat),"columns \n"))

    ## debug
    ##return(dat)
    ## skip last column, only NA, since lines end with ;
    dat <- dat[,-ncol(dat)]

 
    
    ## get time of main measurements
    ## NOTE: second T is time for user comments and system events
    tidx <- which(dat[,1] == "T")[1]
    fidx <- which(dat[tidx,] == "Time [h]->")
    ## convert time to seconds, SI and in line with other formats!
    time <- unlist(dat[tidx, (fidx+1):ncol(dat)])*3600
    
    ## get calibrated rows
    didx <- grep("Cali.", dat[,fidx])
    data <- as.data.frame(t(dat[didx,fidx:ncol(dat)]), stringsAsFactors=FALSE)
    colnames(data) <- dat[didx,1]

    data <- list()
    for ( i in 1:nrow(filters) ) {
        if ( verb ) 
            cat(paste0("\tloading data: \"", trimws(filters[i,2]), "\""))

        ## first search calibrated version
        fid <- paste0("Cali.",filters[i,2])
        didx <- grep(fid, dat[,fidx], fixed=TRUE)
        ## if not present, use raw
        if ( length(didx)==0 ) {
            didx <- grep(filters[i,2], dat[,fidx], fixed=TRUE)
            if ( verb) cat(" - RAW VALUES")
        }
        dt <- t(dat[didx,(fidx+1):ncol(dat)])
        colnames(dt) <- dat[didx,2]
        data[[i]] <- list()
        data[[i]]$time <- time
        data[[i]]$data <- as.matrix(dt)
        if ( replace.zero )
            colnames(data[[i]]$data) <- sub("0","", colnames(data[[i]]$data))
                                            
        if ( verb ) cat("\n")
    }
    names(data) <- trimws(filters[,2])
    
    dataIDs <- names(data)

    ## filter only requested data
    if ( !missing(data.ids) ) {
        if ( verb )
            cat(paste("skipping",
                      paste(dataIDs[!dataIDs%in%data.ids],collapse=";"), "\n"))
        data <- data[dataIDs%in%data.ids]
        dataIDs <- dataIDs[dataIDs%in%data.ids]
    }
    data$dataIDs <- dataIDs
    
    data$Time <- time
    data$xids <- "Time"

    ## TODO: use this for all data parsers, although
    ## only required for viewPlate; but could be used for
    ## auto-grouping
    ## TODO: generalize? split well strings?
    data$wells$plate <- t(matrix(colnames(data[[data$dataIDs[1]]]$data),
                                 nrow=ncl,ncol=nrw))
    data$wells$rows <- toupper(letters[1:nrw])
    if ( replace.zero ) 
        data$wells$cols <- as.character(1:ncl)
    else data$wells$cols <- sprintf("%02d", 1:ncl)
        
    class(data) <- "platedata"
    ## TODO: get temperature and humidity 
    data
}

#' Read Synergy Mx Plate Data
#'
#' Parses date exported from the Excel file that can be exported
#' from the Biotek Synergy Mx platereader. A parameter that often changes
#' is \code{skip}, the number of lines before the data starts,
#' here before the ID of a measurement in the first column, eg.
#' "600" for OD measurement at 600 nm.
#' @param skip lines to skip before parsing data
#' @param time.format format of the time, e.g., "%H:%M"%S" (see
#' \code{strptime}), or "numeric" if the time is provided in normal numbers
#' @param sep field separator used in the input tabular file
#' @param dec decimal operator used in data file (e.g., ',' if data export
#' was with german language settings)
#' @param skiplastcol the last column is often empty, set to TRUE if
#' this is the case for the current data set
#' @inheritParams readPlateData
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @seealso \code{\link{readPlateData}}
#' @export
readSynergyPlate <- function(files, data.ids,
                             skip, sep=";", dec=".", skiplastcol=FALSE,
                             time.format="%H:%M:%S",
                             verb=TRUE) {

    if ( verb )
      cat(paste("Parsing file:", files, "\n"))

    ## defaults
    if ( missing(skip) )
      skip <- 58


    if ( length(grep("\\.xlsx?$", files)) ) {
        indat <- data.frame(readxl::read_excel(files,
                                               col_names=FALSE, skip=skip),
                            stringsAsFactors=FALSE)
    } else {
        indat <- read.csv(files, header=FALSE,stringsAsFactors=FALSE,
                          sep=sep, dec=dec, skip=skip)
    }
    
    ## data IDs are in column 1, followed by data matrices starting
    ## in column 2; skip internal result calculation
    yidx <- c(which(indat[,1]!="" & indat[,1]!="Results"))

    ## filter only requested data
    dataIDs <- indat[yidx,1]
    yidx <- c(yidx, which(indat[,1]=="Results")) # add "Results" index
    names(yidx) <- c(dataIDs,"end")

    if ( !missing(data.ids) )
      dataIDs <- dataIDs[dataIDs%in%data.ids]

    data <- rep(list(NA),length(dataIDs))
    names(data) <- dataIDs
    for ( dataID in dataIDs  ) {

        if ( verb  )
            cat(paste0("\tloading data: \"", dataID, "\"\n"))

        ## get DATA:
        ## rows: the header is 2 rows after the ID, data starts 3 rows after
        ## and ends 2 rows before next
        i <- which(names(yidx)==dataID)
        hidx <- yidx[i]+2
        sidx <- yidx[i]+3
        eidx <- yidx[i+1]-2
        ## columns:
        ## time: column 2; temperature: column 3;
        ## data: columns 4 to last-1; last data set ends with nrow
        lcol <- ncol(indat)
        if ( skiplastcol  ) lcol <- lcol -1
        cols <- 4:lcol
        time <- indat[sidx:eidx,2]
        #cat(paste("time format", time.format, time[1], "\n"))
        if ( time.format!="numeric" )
          time <- as.numeric(strptime(time,format=time.format))
        else
          time <- as.numeric(sub(",",".",time))
         #cat(paste("time format", time.format, time[1], "\n"))

        temp <-  as.numeric(sub(",",".",indat[sidx:eidx,3]))
        ## convert german decimal ',' to '.'
        ## NOTE: align this with command-line argument 'dec'
        dat <- matrix(as.numeric(sub(",",".",unlist(indat[sidx:eidx,cols]))),
                      ncol=length(cols),nrow=length(sidx:eidx))
        ## get columns and change Temperature ID
        colnames(dat) <- as.character(indat[hidx,cols])
        ## check last columns (sometimes empty, sometimes not)
        emptycols <- which(colnames(dat)=="NA")
        if ( length(emptycols)>0 )
          dat <- dat[,-emptycols]
        ## check last rows (sometimes empty, sometimes not)
        ## -> either NA values in time or in all wells
        present <- (!is.na(time) &
                    ncol(dat) > apply(dat,1,function(x) sum(is.na(x))))

        dat  <- dat[ present,]
        temp <- temp[present]
        time <- time[present]
        data[[dataID]] <- list(time=time, temp=temp, data=dat)
    }

    ## since time here comes formatted, the current data is
    ## added -> subtract minimal time
    t0 <- min(unlist(lapply(data, function(x) x$time)),na.rm=TRUE)
    for ( i in 1:length(data) )
      data[[i]]$time <- data[[i]]$time - t0

    ## SET DATA ID
    data$dataIDs <- names(data)

    data

}


## interpolates all data to a master time
## and reduced redundant temperature information
## TODO: correct times for reading delay in platereader
##       - perhaps newer versions can give exact time for each
#' Read BMG Optima/MARS vR3.01 R2 Plate Data
#'
#' Parses date from a CSV file that can be exported
#' from the MARS (vR3.01 R2) analysis software of BMG plate readers. Different
#' measurements (eg. OD and fluorescence) are exported separately
#' and should all be liste in parameter \code{files}. Later
#' versions of MARS (vXXX) have a simpler file format, and can be
#' parsed with \code{\link{readBMG2Plate}}
#' @param skip lines to skip before parsing data; if missing it will
#' be set to 5
#' @param sep field separator used in the input tabular file
#' @param dec decimal number symbol (e.g., ',' if data export was with
#' german language settings)
#' @inheritParams readPlateData
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @seealso \code{\link{readPlateData}}, \code{\link{readBMG2Plate}}
#' @export
readBMGPlate <- function(files, data.ids,
                         skip, sep=";", dec=".", verb=TRUE) {

    if ( missing(skip) )
      skip <- 5

    data <- list()
    ## 1) PARSE ALL DATA FILES and collect the individual measurements
    for ( i in 1:length(files) ) {
        file <- files[i]
        #id <- names(files)[i]
        if ( verb )
          cat(paste("Parsing file:", file , "\n"))
        dat <- read.csv(file,header=FALSE,stringsAsFactors=FALSE,
                        skip=skip, sep=sep)

        ## TODO: scan to parse header
        ## for rownames "Well" or "Well Row", "Well Col"
        ## and "Content"; and then scan again with adjusted skip
        ## -> started in function readbmg

        ## last row in BMG files is usually empty, remove
        if ( sum(dat[nrow(dat),]=="")==ncol(dat) )
          dat <- dat[2:nrow(dat)-1,]

        ## convert well/row info from rows 1:2 to column names
        colnames(dat) <- paste(dat[1,],dat[2,],sep="")

        ## columns 1 and 2 are data type and measurement time
        ## and ID is in row 3
        colnames(dat)[1:2] <- as.character(dat[3,1:2])

        ## get sample/blank information from row 3, substituting "Sample "
        ## also converts to character
        samples <- sub("Blank ","",sub("Sample ","",dat[3,3:ncol(dat)]))

        ## remove first three rows from which info has been parsed
        dat <- dat[4:nrow(dat),]

        ## GET ALL DATA

        ## data type identified by first column
        ## substituting common stuff "Raw Data (XXX \d)"
        dtype <- sub("\\)","",sub(" .*", "",sub("Raw Data \\(", "", dat[,1])))
        ## the rest is numeric information, convert now
        dat <- data.matrix(dat[,2:ncol(dat)])

        ## collect all data
        types <- unique(dtype)
        dlst <- rep(list(NA),length(types))
        names(dlst) <- types
        for ( dtyp in types ) {
            if ( verb )
             cat(paste0("\tloading data: \"", dtyp, "\"\n"))
            tdat <- dat[dtype==dtyp,]
            dlst[[dtyp]] <- list(time=tdat[,1],
                                 data=tdat[,2:ncol(tdat)])
        }

        ## handle temperature
        ## BMG gives T for each well, but actually it's the same for all
        ## add this temperature time-course to each other data
        ## and remove from list
        ## NOTE: temperature is not exported after fusing wells in BMG Mars!
        ## TODO: find out which time is used when fusing wells in BMG Mars!
        tidx <- which(names(dlst)=="Temperature")
        if ( length(tidx)>1 )
          warning(paste("BMG Temperature format has changed, multiple entries",
                        "in one data file:", file, "\n\t",
                        "Perhaps check validity of code\n"))
        for ( i in tidx ) {
            temp <- dlst[[i]]$data
            if ( any(apply(temp,1,sd)>0 ) ) {
                warning(paste("BMG format changed; different temperatures for",
                              "different wells noted. UPDATE R CODE\n"))
            }
            temp <- c(matrix(apply(temp,1,mean),ncol=1))
            ## TODO: check whether times are ok, but so far, BMG/Mars
            ## don't give different times
        }

        ## rm from data list
        dlst <- dlst[names(dlst)!="Temperature"]
        ## .. and add to each other data (if it was present
        if ( length(tidx)>1 )
            dlst <- lapply(dlst, function(x) {x$temp=temp; x} )

        ## append data
        data <- append(data,dlst)
    }

    ## NOTE: at this stage, data between different plate-readers
    ## should already look similar; each entry containing separate
    ## time and temperature vectors

    ## SET DATA ID
    data$dataIDs <- names(data)

    data
}

#' Read BMG Optima/MARS vXXX Plate Data
#'
#' Parses date from a CSV file that can be exported
#' from the MARS (vXXX) analysis software of BMG plate readers. Different
#' measurements (eg. OD and fluorescence). Data exported from an earlier
#' version of MARS (vR3.01 R2) can be parsed with \code{\link{readBMGPlate}}
#' @param skip lines to skip before parsing data
#' @param sep field separator used in the input tabular file
#' @param dec decimal number symbol (e.g., ',' if data export was with
#' german language settings)
#' @param time.format format of the time, e.g., "%H:%M"%S" (see
#' \code{strptime}),
#' @inheritParams readPlateData
#' @author Rainer Machne \email{raim@tbi.univie.ac.at}
#' @seealso \code{\link{readPlateData}}, \code{\link{readBMGPlate}}
#' @export
readBMG2Plate <- function(files, data.ids, time.format=" %H h %M min",
                          skip=6, sep=";", dec=",", verb=TRUE) {



  data <- list()
  ## 1) PARSE ALL DATA FILES and collect the individual measurements
  for ( i in 1:length(files) ) {
    file <- files[i]
    #id <- names(files)[i]
    if ( verb )
      cat(paste("Parsing file:", file , "\n"))

    ## parse header and data separately, to obtain
    ## clean numeric data
    hdat <- read.csv(file,header=FALSE,stringsAsFactors=FALSE,
                     skip=skip,sep=sep, dec=dec)
    hdat <- hdat[1:2,]
    dat <- read.csv(file,header=FALSE,stringsAsFactors=FALSE,
                    skip=skip+2,sep=sep,dec=dec)

    ## last column in BMG2 files is usually empty, remove
    if ( sum(is.na(dat[,ncol(dat)]))==nrow(dat) ) {
      dat <- dat[,-ncol(dat)]
      hdat <- hdat[,-ncol(hdat)]
    }


    ## convert well info from column 1 to column names
    rownames(dat) <- sub("([A-Z])0","\\1",(dat[,1]))
    ## rm column 1 (well ID) and column 2 (sample ID, not used here)
    dat <- dat[,-(1:2)]
    hdat <- hdat[,-(1:2)]

    ## 20181209 : FIX MARS BUG: "Shake" instead of number in first timepoint
    dat[] <- apply(dat, 2, as.numeric)

    ## GET ALL DATA

    ## split data into  merged measurements, using header in row 1
    dtype <- trimws(sub("\\)","",sub("Raw Data \\(", "", hdat[1,])))
    types <- unique(dtype)

    ## collect all data
    types <- unique(dtype)
    dlst <- rep(list(NA),length(types))
    names(dlst) <- types
    for ( dtyp in types ) {
      if ( verb )
          cat(paste0("\tloading data: \"", dtyp, "\"\n"))
      tdat <- dat[,dtype==dtyp]
      htdat <- hdat[,dtype==dtyp]

      ## parse times in row 2
      ## TODO: use time.format and strptime!!
      hours <- as.numeric(sub(" h.*","",htdat[2,]))
      minutes <- as.numeric(sub(" min","",sub(".*h","",htdat[2,])))
      minutes[is.na(minutes)] <- 0
      times <- hours*3600 + minutes*60 # time in seconds

      dlst[[dtyp]] <- list(time=times,
                           data=t(as.matrix(tdat)))
    }
    ## append data
    data <- append(data,dlst)
  }

  ## NOTE: at this stage, data between different plate-readers
  ## should already look similar; each entry containing separate
  ## time and temperature vectors

  ## SET DATA ID
  data$dataIDs <- names(data)

  data
}

readbmg <- function(files, time.format=" %H h %M min", sep=";", dec=",", verb=TRUE) {

    data <- list()
    ## 1) PARSE ALL DATA FILES and collect the individual measurements
    for ( i in 1:length(files) ) {
        file <- files[i]
        ##id <- names(files)[i]
        if ( verb )
            cat(paste("Parsing file:", file , "\n"))

        ## 1) parse header
        dat <- read.csv(file,header=FALSE,stringsAsFactors=FALSE,
                        sep=sep, skip=5)

        widx <- grep("Well",dat[,1])
        cidx <- grep("Well Col",dat[,1])
        ridx <- grep("Well Row",dat[,1])
        iidx <- grep("Content", dat[,1])

    }

}


#' writes out all data from the `platexpress` object
#'
#' calls \code{\link{write.csv}} for each data type and writes
#' them to separate files names `<file>_<dataID>.csv`.
#' @param data \code{platexpress} data object
#' @param file either a character string naming a file or a connection open
#' for writing.  ‘""’ indicates output to the console.
#' @param yids IDs of the data to be written; if missing, all data will
#' written
#' @param xid ID of a data-set in the input data that can be used as x-axis
#' instead of the default Time vector
#' @param dtype type of the data to be plotted, default is the main 'data', but
#' e.g. 'orig' allows to plot the original data w/o processing (e.g.
#' interpolation, blank correction, etc.)
#' @param verb print messages if \code{>1}
#' @param ... arguments passed to write.csv
#' @export
writeData <- function(data, file, yids, xid, dtype="data", verb=1, ...) {

    ## which data to write?
    if ( missing(yids) ) yids <- data$dataIDs
    ## get x-axis data: time and temperature or another data set
    if ( missing(xid) )
        xid <- data$xids[1]
    x <- data[[xid]]

    for ( id in yids ) {
        file.name <- paste0(file, "_", make.names(id), ".csv")
        if ( verb ) cat(paste("writing", file.name, "\n"))
        dat <- data[[id]][[dtype]]
        dat <- cbind(x, dat)
        colnames(dat)[1] <- xid
        write.csv(dat, file.name, row.names=FALSE, ...)
    }
}
raim/platexpress documentation built on Jan. 18, 2022, 1:41 p.m.