R/mergePdata.R

Defines functions mergePdata

Documented in mergePdata

#' Merge Phenotype and Image Data
#'
#' Merge "phenotype" data describing the samples in an image data.frame.
#'
#' @param phenoData A \code{data.frame} containing variables for the multiplicity,
#'   typically \code{moi} and either \code{well} or \code{file} associated with
#'   each unique multiplicity.
#' @param imageData A \code{data.frame} generated by \code{\link{parseImages}}.
#' @param moi Character name for multiplicity variable, either "moi" or "x".
#' @param stringsAsFactors Convert string variables to factors if \code{TRUE}
#'   (default value). Note that if present, \code{column} and \code{row} are
#'   preserved as factors regardless of this conversion.
#' @param formatString Character string as \code{\link{sprintf}} format for the
#'   column; default value of \code{NULL} pads the column value with zeros. 
#'   See details for fixing a mismatch between imageData and phenoData
#'
#' @details
#'
#' The format for well names (and column names) may differ between the imageData
#' and phenoData. An "Error in mergePData(pd, df)" might report "A1, A2, <...>
#' are missing from pd." This can happen if, for example, the imageData uses only
#' a few of the possible 96 wells in phenoData. Change the \code{formatString} 
#' parameter to a value such as "\%02d" to force a consistent format. This value
#' is passed to the \code{\link{well.info}} as the parameter \code{format}.
#'
#' This function performs a \code{data.frame} merge with additional checks. It
#' does not (yet) respect the 'plate' variable and only merges according to the
#' \code{well} or \code{file} variables. For now, it is necessary to split
#' the phenoData and imageData by \code{plate}, merge each separately,
#' and then join the data back together. The following might work with phenoData
#' in \code{pd} and imageData in \code{df}:
#'
#' \preformatted{
#'   g <- pd$plate
#'   pd <- split(pd, g)
#'   df <- split(df, df$plate)
#'   df <- Map(mergePdata, pd, df)
#'   df <- do.call(rbind, df)
#'   rownames(df) <- NULL
#'   pd <- unsplit(pd, g)
#' }
#'
#' @return
#'
#' Merged \code{data.frame} with harmonized well information (if appropriate)
#' and additional data provided in \code{phenoData}. The variables \code{row},
#' \code{column}, and \code{well} are expected to be factors. 
#'
#' @export
#'
mergePdata <- function(phenoData, imageData, moi = c("moi", "x"),
	stringsAsFactors = TRUE, formatString = NULL)
{
# check formatString argument
	if (!is.null(formatString))
		formatString <- as.character(formatString)[1]

# intercept data with 'plate' present
	if ("plate" %in% names(phenoData) | "plate" %in% names(imageData)) {
		if (nlevels(factor(phenoData$plate)) > 1 | nlevels(factor(imageData$plate)) > 1)
			warning("see mergePdata() help to use 'plate' variable for merging")
	}
			
# process and check data according to the presence of "well"
	if ("well" %in% names(imageData) && !"well" %in% names(phenoData))
		stop("'well' found in ", deparse(substitute(imageData)),
			" but not in ", deparse(substitute(phenoData)))

	if ("well" %in% names(imageData)) {
	# process harmonized well value
		pdWell <- unique(well.info(phenoData$well, format = formatString)$well)
		idWell <- unique(well.info(imageData$well, format = formatString)$well)
		diffWell <- setdiff(idWell, pdWell)
		if (length(diffWell) != 0) {
			message("Perhaps 'formatString' should be set to \"%02d\"?")
			stop(paste(diffWell, collapse = ", "), " are missing from ",
				deparse(substitute(phenoData)))
		}

#	# process plate
#		pdPlate <- unique(phenoData$plate)
#		idPlate <- unique(imageData$plate)
#		diffPlate <- setdiff(idWell, pdWell)
#		if (length(diffPlate) != 0)
#			stop(paste(diffPlate, collapse = ", "), " are missing from ",
#				deparse(substitute(phenoData)))

	# replace (or add) harmonized row and column information to phenotype data
		phenoData$column <- well.info(phenoData$well, format = formatString)$column
		phenoData$row <- well.info(phenoData$well, format = formatString)$row

	# replace well names in phenodata with harmonized well names
		phenoData$well <- pdWell
	}
	else if ("file" %in% names(phenoData)) {
		if (!"file" %in% names(imageData))
			stop("A 'file' or 'well' variable must be present in ",
				deparse(substitute(imageData)))
	}

# check for 'unit' variable and assign if needed
	if (!"unit" %in% names(phenoData))
		phenoData$unit <- "[no unit]"

# check for 'type' variable and assign if needed
	if (!"type" %in% names(phenoData)) {
		type <- rep("standard", nrow(phenoData))
		if ("moi" %in% names(phenoData))
			type[phenoData$moi == 0] <- "control"
		else if ("x" %in% names(phenoData))
			type[phenoData$x == 0] <- "control"
		phenoData$type <- type
	}

# remove any variables in imageData that are present in phenoData EXCEPT
# for variables used to merge data frames: 'file' and/or 'well'
	vars <- names(imageData)[!names(imageData) %in% names(phenoData)]
	if ("file" %in% names(imageData)) vars <- c(vars, "file")
	if ("well" %in% names(imageData)) vars <- c(vars, "well")
	vars <- unique(vars)
	imageData <- imageData[vars]

# add temporary sorting variable for imageData and merge
	sval <- tail(make.unique(c(names(imageData), "srt")), 1)
	imageData[[sval]] <- seq_len(nrow(imageData))
	res <- merge(phenoData, imageData, all.y = TRUE) # Ugh...need all.y = TRUE
	ord <- order(res[[sval]])
	res <- res[ord, ]
	res <- res[names(res) != sval]

# convert strings to factors if requested
	if (stringsAsFactors == TRUE) {
		sel <- sapply(res, is.character)
		res[sel] <- lapply(res[sel], as.factor)
	}

# reorganize data
#	pdnames <- c("dir","file","plate","well","column","row","frame","type","unit")
	pdnames <- c("dir","file","well","column","row","frame","type","unit")
	first <- pdnames[pdnames %in% names(res)]
	last <- names(res)[!names(res) %in% pdnames]
	res <- res[c(first, last)]

# exclude unused levels (allows pd to hold more information than required)
	res <- droplevels(res)
	rownames(res) <- NULL
	return(res)
}
ornelles/virustiter documentation built on March 29, 2024, 8:30 p.m.