R/check_experiment.R

Defines functions check_experiment

Documented in check_experiment

#' Check experiment data.
#'
#' Checks that the experiment description is well-formed and complete.
#'
#' Information about a full experiment can be assembled into a spreadsheet
#' (Excel, CSV and tab-delimited text formats are supported) and used to process
#' large numbers of files in one batch. This function checks the spreadsheet to
#' make sure that it is properly formed and that all the data files referred to
#' are present.
#'
#' The function can (and ideally should) be run with the same parameters as will
#' be used to call \code{\link{read_experiment}}, although many of the
#' parameters are not required for the check.
#'
#' The content of the spreadsheet, the presence and the content of any
#' supporting files are also checked. Checks do not cover validity of the raw
#' data, so it is still possible to have invalid data even if
#' \code{\link{check_experiment}} returns \code{TRUE} (although this suggests an
#' underlying problem with the raw data). Warning and error messages are
#' intended to be useful and help any format issues be quickly resolved.
#'
#' @param filename A spreadsheet file containing a description of the experiment
#'   or a trackxf file containing an exported experiment archive.
#' @param format An experiment description for reading raw data can be provided
#'   as an Excel spreadsheet ("excel") or as a comma-delimited ("csv") or
#'   tab-delimited ("tab", "tsv", "txt" or "text") text file. The value
#'   "trackxf" indicates that the file is an archived experiment in the trackxf
#'   format (as generated by \code{\link{export_data}}). Default (\code{NA}) is
#'   to guess the format from the file extension.
#' @param interpolate Ignored. For compatibility with
#'   \code{\link{read_experiment}}.
#' @param project.dir A directory path specifying where the files needed for
#'   processing the experiment are stored. Ignored if \code{format = "trackxf"}.
#' @param data.dir A directory path specifying where the raw data are stored.
#'   This is a folder root and all paths specified in the spreadsheet. Ignored
#'   if \code{format = "trackxf"}.
#' @param author.note Ignored. For compatibility with
#'   \code{\link{read_experiment}}.
#' @param threads Ignored. For compatibility with \code{\link{read_experiment}}.
#' @param verbose Ignored. For compatibility with \code{\link{read_experiment}}.
#'
#' @return Invisibly returns \code{TRUE} for a successful check or \code{FALSE}
#'   otherwise.
#'
#' @seealso \code{\link{read_experiment}}, \code{\link{export_data}}.
#'
#' @examples
#' require(Rtrack)
#' experiment.description <- system.file("extdata", "Minimal_experiment.xlsx",
#'   package = "Rtrack")
#' check_experiment(experiment.description)

#' @importFrom readxl read_excel
#' @importFrom utils read.csv read.table
#' @importFrom stats na.omit
#' @importFrom crayon bold red green
#' @importFrom rjson fromJSON
#' @importFrom RCurl url.exists
#'
#' @export
check_experiment = function(filename, format = NA, interpolate = FALSE, project.dir = NA, data.dir = project.dir, author.note = "", threads = NULL, verbose = FALSE){
	if(is.na(project.dir)) project.dir = dirname(filename)
	if(is.na(data.dir)) data.dir = dirname(filename)
	if(unlist(strsplit(project.dir, ""))[nchar(project.dir)] != "/") project.dir = paste0(project.dir, "/")
	if(unlist(strsplit(data.dir, ""))[nchar(data.dir)] != "/") data.dir = paste0(data.dir, "/")

	# Check if this is actually a raw experiment or if it is an archive of some sort (not supported).
	# Support for checking validity of trackxf archives may be added in the future - but is not currently implemented.
	# It would be necessary to restore the entire archive, so there doesn't seem to be any gain over 'read_experiment'.
	# Also, archives should have been generated by a sane algorithm from a functioning experiment, so checking is no really useful.
	format = tolower(format)
	if(is.na(format)){
		if(RCurl::url.exists(filename)){
			format = "http"
		}else if(tools::file_ext(filename) %in% c("trackxf")){
			format = "trackxf"
		}else if(tools::file_ext(filename) %in% c("json")){
			format = "json"
		}else if(tools::file_ext(filename) %in% c("xls", "xlsx")){
			format = "excel"
		}else if(tools::file_ext(filename) %in% c("csv")){
			if(ncol(read.csv(filename)) > 4){
				format = "csv"
			}else{
				format = "csv2"
			}
		}else if(tools::file_ext(filename) %in% c("tab", "tsv", "txt")){
			format = "tab"
		}else{
			message("The file format cannot be established automatically. Please specify the 'format' parameter.")
			check.pass = FALSE
		}
	}
	
	experiment.data = NULL
	experiment.info = NULL
	check.pass = TRUE

 	if(format == "json"){
		if(!file.exists(filename)){
			message(paste0("The experiment archive file ('", filename, "') does not exist!"))
			check.pass = FALSE
		}else{
			error.message = tryCatch({
				experiment.data = rjson::fromJSON(file = filename, simplify = FALSE)
				TRUE
			}, error = function(e){
				FALSE
			})
			message(paste0("The file '",  filename, "' is not properly formed."))
			check.pass = FALSE
		}
		
		experiment.info = experiment.data[[1]]
		experiment.data = experiment.data[[2]]
		
		# Run check for required features
		required.fields = c("id", "target", "day", "trial", "arena_name", "raw.t", "raw.x", "raw.y", "t", "x", "y", "arena")
		for(field in required.fields){
			field.data = sapply(experiment.data, "[[", field)
			field.check = !(field.data == "" | sapply(field.data, is.null))
			if(!all(field.check)){
				if(length(which(!field.check)) > 1){
					message(paste0("The experiment is missing the field '", field, "' for elements: ", paste(which(!field.check), collapse = "', '"), "."))
					check.pass = FALSE
				}else if(length(which(!field.check)) == 1){
					message(paste0("The experiment is missing the field '", field, "' for element ", which(!field.check), "."))
					check.pass = FALSE
				}
			}
		}
		required.arena.fields = c("type", "time.units", "arena.bounds") # goal and old.goal are not actually required
		arenas = lapply(experiment.data, "[[", "arena")
		for(field in required.arena.fields){
			field.data = sapply(arenas, "[[", field)
			field.check = !(field.data == "" | sapply(field.data, is.null))
			if(!all(field.check)){
				if(length(which(!field.check)) > 1){
					message(paste0("The experiment is missing the field 'arena$", field, "' for elements: ", paste(which(!field.check), collapse = "', '"), "."))
					check.pass = FALSE
				}else if(length(which(!field.check)) == 1){
					message(paste0("The experiment is missing the field 'arena$", field, "' for element ", which(!field.check), "."))
					check.pass = FALSE
				}
			}
		}
	} # end json check
	
 	if(format != "json"){
 		# if(!(dir.exists(project.dir) & dir.exists(data.dir))){
 		# 	message(paste0("The project and/or data directories do not exist."))
 		# 	check.pass = FALSE
 		# }
 		
		if(format == "xls" | format == "xlsx" | format == "excel"){
			if(!file.exists(filename)){
				message(paste0("The experiment description file ('", filename, "') does not exist!"))
				check.pass = FALSE
			}else{
				experiment.data = suppressMessages(as.data.frame(readxl::read_excel(filename, col_types = 'text'), stringsAsFactors = F))
				rownames(experiment.data) = experiment.data$TrackID
			}
		}else if(format == "csv"){
			if(!file.exists(filename)){
				message(paste0("The experiment description file ('", filename, "') does not exist!"))
				check.pass = FALSE
			}else{
				experiment.data = utils::read.csv(filename, stringsAsFactors = F, check.names = FALSE)
				rownames(experiment.data) = experiment.data$TrackID
			}
		}else if(format == "csv2"){
			if(!file.exists(filename)){
				message(paste0("The experiment description file ('", filename, "') does not exist!"))
				check.pass = FALSE
			}else{
				experiment.data = utils::read.csv2(filename, stringsAsFactors = F, check.names = FALSE)
				rownames(experiment.data) = experiment.data$TrackID
			}
		}else if(format == "tab"){
			if(!file.exists(filename)){
				message(paste0("The experiment description file ('", filename, "') does not exist!"))
				check.pass = FALSE
			}else{
				experiment.data = utils::read.delim(filename, stringsAsFactors = F, check.names = FALSE)
				rownames(experiment.data) = experiment.data$TrackID
			}
		}else if(format == "trackxf"){
			message("Checking trackxf archives is not currently supported.")
			check.pass = FALSE
		}else if(format == "http"){
			message("Checking URLs is not supported. The experiment files must be downloaded (and the archive unpacked if necessary) first.")
			check.pass = FALSE
		}else{
			if(!file.exists(filename)){
				message(paste0("The experiment description file ('", filename, "') does not exist!"))
				check.pass = FALSE
			}else{ # An unknown file extension will be assumed to be a tab-delimited file. If that fails, crash out.
				success = tryCatch({
					utils::read.delim(filename, stringsAsFactors = F)
					TRUE
				}, error = function(e){
					FALSE
				}, warning = function(e){
					FALSE
				})
				if(!success){
					message(paste0("The file '",  filename, "' is not properly formed."))
				}else{
					experiment.data = utils::read.delim(filename, stringsAsFactors = F, check.names = FALSE)
					rownames(experiment.data) = experiment.data$TrackID
				}
				check.pass = FALSE
			}
		} # end file type check
		
		if(check.pass & !is.null(experiment.data)){
			# Run check for required features
			required.fields = c("_TrackID", "_TargetID", "_Day", "_Trial", "_Arena", "_TrackFile", "_TrackFileFormat")
			field.check = required.fields %in% colnames(experiment.data)
			if(!all(field.check)){
				if(length(which(!field.check)) > 1){
					message(paste0("The experiment description is missing the required columns: ", paste(required.fields[!field.check], collapse = "', '"), "."))
					check.pass = FALSE
				}else if(length(which(!field.check)) == 1){
					message(paste0("The experiment description is missing the required column ", paste(required.fields[!field.check], collapse = "', '"), "."))
					check.pass = FALSE
				}
			} # end field.check
			optional.fields = c("_TrackIndex") # Valid but not always required
			# Check remaining columns for valid names (compatible with JSON schema)
			user.fields = colnames(experiment.data)[!(colnames(experiment.data) %in% c(required.fields, optional.fields))]
			user.field.check = grepl("^[a-zA-Z][a-zA-Z0-9_]*$", user.fields) & !duplicated(user.fields)
			if(!all(user.field.check)){
				if(length(which(!user.field.check)) > 1){
					user.field.suggestions = gsub("^([0-9])", "X\\1", gsub("([^a-zA-Z0-9_])", "_", make.unique(user.fields)))[!user.field.check]
					message(paste0("The following column names are not valid: ", paste0("'", user.fields[!user.field.check], "'", collapse = ", "), ". Consider replacing them with: ", paste0("'", user.field.suggestions, "'", collapse = ", "), "."))
					check.pass = FALSE
				}else if(length(which(!user.field.check)) == 1){
					user.field.suggestions = gsub("^([0-9])", "X\\1", gsub("([^a-zA-Z0-9_])", "_", make.unique(user.fields)))[!user.field.check]
					message(paste0("The following column name '", user.fields[!user.field.check], "' is not valid. Consider replacing it with '", user.field.suggestions, "'."))
					check.pass = FALSE
				}
			}
			
			if(!file.exists(project.dir)){
				message(paste0("The project directory '", project.dir, "'does not exist."))
				check.pass = FALSE
			}else{
				arenas.present = sapply(paste0(project.dir, stats::na.omit(unique(experiment.data[, "_Arena"]))), simplify = T, USE.NAMES = T, function(arenafile) file.exists(arenafile) )
				if(length(which(!arenas.present)) > 1){
					message(paste0("The following arena description files cannot be found: ", paste(paste0("'", project.dir, names(arenas.present)[!arenas.present], "'"), collapse = ", "), "."))
					check.pass = FALSE
				}else if(length(which(!arenas.present)) == 1){
					message(paste0("The arena description file '", names(arenas.present)[!arenas.present], "' does not exist."))
					check.pass = FALSE
				}else{
					# Check the arena files
					for(arena.name in stats::na.omit(unique(experiment.data[, "_Arena"]))){
						description = tryCatch(
							as.data.frame(t(utils::read.delim(paste0(project.dir, arena.name), header = F, sep = "=", strip.white = T, comment.char = "#", stringsAsFactors = F)), stringsAsFactors = F),
							error = function(e){
								message(paste0("There was a problem reading the arena description file '", filename, "'."))
								check.pass = FALSE
							}
						)
						# Workaround because multiple 'hole' definitions lead to duplicated names.
						if(nrow(description) == 2) description = setNames(description[2, , drop = FALSE], as.character(description[1, ]))
						# Convert time.units info into seconds.
						if(description$time.units == "us" | description$time.units == "micros"){
							description$time.units = 1e-6
						}else if(description$time.units == "ms"){
							description$time.units = 0.001
						}else if(description$time.units == "s"){
							description$time.units = 1
						}else if(description$time.units == "min"){
							description$time.units = 60
						}else if(description$time.units == "h"){
							description$time.units = 3600
						}else if(description$time.units == "d"){
							description$time.units = 86400
						}else if(description$time.units == "y"){
							description$time.units = 31536000
						}else{
							x = eval(parse(text = description$time.units))
							if(is.numeric(x)){
								description$time.units = x
							}else{
								message("The 'time.units' parameter is not valid. Please check the documentation for creating the arena file.")
							}
						}
						#
						zones.check = NULL
						if(description$type == "mwm"){
							time.units.check = tryCatch({
								!is.na(as.numeric(description$time.units))
							}, error = function(e){
								FALSE
							}, warning = function(e){
								FALSE
							})
							zones = intersect(c("arena.bounds", "goal", "old.goal"), colnames(description))
							required.zones.present = TRUE
							if(!"arena.bounds" %in% zones){
								message(paste0("The 'arena.bounds' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							if(!"goal" %in% zones){
								message(paste0("The 'goal' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							#
							zones.check = sapply(zones, function(zone){
								bounds = unlist(strsplit(description[[zone]], "\\s+"))
								if(bounds[1] == "circle"){
									bounds.check = tryCatch({
										all(length(bounds) == 4, !is.na(as.numeric(bounds[2])), !is.na(as.numeric(bounds[3])), !is.na(as.numeric(bounds[4])))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else{
									message(paste0("The '", zone, "' definition for arena '", arena.name, "' is not valid."))
									bounds.check = FALSE
								}
								bounds.check
							})
						}else if(description$type == "oft"){
							time.units.check = tryCatch({
								!is.na(as.numeric(description$time.units))
							}, error = function(e){
								FALSE
							}, warning = function(e){
								FALSE
							})
							zones = intersect(c("arena.bounds"), colnames(description))
							required.zones.present = TRUE
							if(!"arena.bounds" %in% zones){
								message(paste0("The 'arena.bounds' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							#
							zones.check = sapply(zones, function(zone){
								bounds = unlist(strsplit(description[[zone]], "\\s+"))
								if(bounds[1] == "square"){
									bounds.check = tryCatch({
										all(length(bounds) %in% c(4, 5, 9), all(suppressWarnings(!is.na(as.numeric(bounds[-1])))))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else{
									message(paste0("The '", zone, "' definition for arena '", arena.name, "' is not valid."))
									bounds.check = FALSE
								}
								bounds.check
							})
						}else if(description$type == "nor"){
							time.units.check = tryCatch({
								!is.na(as.numeric(description$time.units))
							}, error = function(e){
								FALSE
							}, warning = function(e){
								FALSE
							})
							zones = intersect(c("arena.bounds", "object.1", "object.2"), colnames(description))
							required.zones.present = TRUE
							if(!"arena.bounds" %in% zones){
								message(paste0("The 'arena.bounds' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							if(!"object.1" %in% zones){
								message(paste0("The definition of 'object.1' for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							if(!"object.2" %in% zones){
								message(paste0("The definition of 'object.2' for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							#
							zones.check = sapply(zones, function(zone){
								bounds = unlist(strsplit(description[[zone]], "\\s+"))
								if(bounds[1] == "circle"){
									bounds.check = tryCatch({
										all(length(bounds) == 4, !is.na(as.numeric(bounds[2])), !is.na(as.numeric(bounds[3])), !is.na(as.numeric(bounds[4])))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else if(bounds[1] == "square"){
									bounds.check = tryCatch({
											all(length(bounds) %in% c(4, 5, 9), all(suppressWarnings(!is.na(as.numeric(bounds[-1])))))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else{
									message(paste0("The '", zone, "' definition for arena '", arena.name, "' is not valid."))
									check.pass = FALSE
								}
								bounds.check
							})
						}else if(description$type == "barnes"){
							time.units.check = tryCatch({
								!is.na(as.numeric(description$time.units))
							}, error = function(e){
								FALSE
							}, warning = function(e){
								FALSE
							})
							zones = intersect(c("arena.bounds", "hole", "holes", "goal", "old.goal"), colnames(description))
							required.zones.present = TRUE
							if(!"arena.bounds" %in% zones){
								message(paste0("The 'arena.bounds' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							if(!("holes" %in% zones | "hole" %in% zones)){
								message(paste0("The 'hole' definitions for arena '", arena.name, "' are missing."))
								required.zones.present = FALSE
							}
							if(!"goal" %in% zones){
								message(paste0("The 'goal' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							#
							zones.check = sapply(zones, function(zone){
								bounds = unlist(strsplit(description[[zone]], "\\s+"))
								if(bounds[1] == "circle"){
									bounds.check = tryCatch({
										all(length(bounds) == 4, !is.na(as.numeric(bounds[2])), !is.na(as.numeric(bounds[3])), !is.na(as.numeric(bounds[4])))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else if(bounds[1] == "square"){
									bounds.check = tryCatch({
										all(length(bounds) %in% c(4, 5, 9), all(suppressWarnings(!is.na(as.numeric(bounds[-1])))))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else{
									message(paste0("The '", zone, "' definition for arena '", arena.name, "' is not valid."))
									check.pass = FALSE
								}
								bounds.check
							})
						}else if(description$type == "apa"){
							time.units.check = tryCatch({
								!is.na(as.numeric(description$time.units))
							}, error = function(e){
								FALSE
							}, warning = function(e){
								FALSE
							})
							zones = intersect(c("arena.bounds", "aversive.zone"), colnames(description))
							required.zones.present = TRUE
							if(!"arena.bounds" %in% zones){
								message(paste0("The 'arena.bounds' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							if(!"aversive.zone" %in% zones){
								message(paste0("The 'aversive.zone' definition for arena '", arena.name, "' is missing."))
								required.zones.present = FALSE
							}
							#
							zones.check = sapply(zones, function(zone){
								bounds = unlist(strsplit(description[[zone]], "\\s+"))
								if(bounds[1] == "circle"){
									bounds.check = tryCatch({
										all(length(bounds) == 4, !is.na(as.numeric(bounds[2])), !is.na(as.numeric(bounds[3])), !is.na(as.numeric(bounds[4])))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else if(bounds[1] == "sector"){
									bounds.check = tryCatch({
										all(length(bounds) == 3, !is.na(as.numeric(bounds[2])), !is.na(as.numeric(bounds[3])))
									}, error = function(e){
										FALSE
									}, warning = function(e){
										FALSE
									})
								}else{
									message(paste0("The '", zone, "' definition for arena '", arena.name, "' is not valid."))
									check.pass = FALSE
								}
								bounds.check
							})
						}else{
							message(paste0("In '", arena.name, "', the arena type '", description$type, "' is not supported."))
							check.pass = FALSE
						}
						#
						if(!all(zones.check)){
							if(length(which(!zones.check)) > 1){
								message(paste0("In '", arena.name, "', the zone descriptions for the following zones are not valid: ",  paste0("'", names(zones.check)[!zones.check], "'", collapse = ", "), "."))
								check.pass = FALSE
							}else if(length(which(!zones.check)) == 1){
								message(paste0("In '", arena.name, "', the zone description for '", names(zones.check)[!zones.check], "' is not valid."))
								check.pass = FALSE
							}
						}
						#
					}
				}	
			} # end project.dir check
			
			if(!file.exists(data.dir)){
				message(paste0("The data directory '", data.dir, "'does not exist."))
				check.pass = FALSE
			}else{
				track.format.info = unique(experiment.data[, "_TrackFileFormat"])
				supported.formats = suppressMessages(identify_track_format())
				track.formats = sapply(track.format.info, function(info) strsplit(info, "_")[[1]][1])
				paths.formatted = sapply(track.formats, simplify = T, USE.NAMES = T, function(trackformat) trackformat %in% supported.formats )
				if(length(which(!paths.formatted)) > 1){
					message(paste0("The track formats ", paste(paste0("'", names(paths.formatted)[!paths.formatted], "'"), collapse = ", "), " are not supported."))
					check.pass = FALSE
				}else if(length(which(!paths.formatted)) == 1){
					message(paste0("The track format '", names(paths.formatted)[!paths.formatted], "' is not supported."))
					check.pass = FALSE
				}
				paths.present = sapply(paste0(data.dir, stats::na.omit(unique(experiment.data[, "_TrackFile"]))), simplify = T, USE.NAMES = T, function(trackfile) file.exists(trackfile) )
				if(length(which(!paths.present)) > 1){
					message(paste0("The following track files cannot be found: ", paste(paste0("'", data.dir, names(paths.present)[!paths.present], "'"), collapse = ", "), "."))
					check.pass = FALSE
				}else if(length(which(!paths.present)) == 1){
					message(paste0("The track file ", paste(paste0("'", data.dir, names(paths.present)[!paths.present], "'"), collapse = ", "), "  cannot be found."))
					check.pass = FALSE
				}
			} # end data.dir check
			
		}else if(check.pass){
			message(paste0("There was a problem reading the experiment file.")) # This should never happen. The file is valid but the reading into a data frame returned NULL.
			check.pass = FALSE
		} # end experiment.data check
 	} # end non-json check
	
	if(check.pass){
		message(paste0(crayon::green("\u2714"), crayon::green(" This experiment appears to be valid and complete.")))
	}else{
		if(format != "trackxf"){ # This is not an error - just not supported.
			message(paste0(crayon::red("\u2716"), crayon::red(" This experiment has some problems. Please check the documentation at https://rupertoverall.net/Rtrack.")))
		}
	}
	invisible(check.pass)
}

Try the Rtrack package in your browser

Any scripts or data that you put into this service are public.

Rtrack documentation built on Aug. 10, 2023, 9:10 a.m.