R/read_experiment.R
In Rtrack: Spatial Navigation Strategy Analysis

Documented in read_experiment

#' Read experiment data.
#'
#' Reads a spreadsheet containing a description of all the files required for an
#' experiment to allow batch execution.
#'
#' Information about a full experiment can be assembled into a spreadsheet (
#' currently Excel and CSV formats are supported) and used to process large
#' numbers of files in one batch. The project directory (\code{project.dir}) is
#' where the arena description files are found. This will typically be the same
#' place as the experiment description file (and is set to be this by default).
#' This does not need to be the same as the current working directory. An
#' optional data directory (\code{data.dir}) can also be specified separately
#' allowing the storage-intensive raw data to be kept in a different location
#' (for example on a remote server). Together, these options allow for
#' flexibility in managing your raw data storage. Individual tracks are
#' associated with their raw data file, experimental group metadata, an arena
#' and any other parameters that the strategy-calling methods require. Required
#' columns are "_TrackID", "_TargetID", "_Day", "_Trial", "_Arena" "_TrackFile"
#' and "_TrackFileFormat" (note the leading underscore "_"). Any additional
#' columns (without a leading underscore) will be interpreted as user-defined
#' factors or other metadata and will be passed on to the final analysis objects
#' and thus be available for statistical analysis.
#'
#' For details on how interpolation is performed (if \code{interpolate} is set
#' to \code{TRUE}), see the documentation for \code{\link{read_path}}.
#'
#' For larger experiments, it might be helpful to run the experiment processing
#' on multiple CPU cores in parallel. To do this, simply specify the number of
#' processes ("threads") to use.
#'
#' @param filename A spreadsheet file containing a description of the experiment
#'   or a trackxf file containing an exported experiment.
#' @param format An experiment description for reading raw data can be provided
#'   as an Excel spreadsheet ("excel") or as a comma-delimited ("csv") or
#'   tab-delimited ("tab", "tsv", "txt" or "text") text file. The value "trackxf"
#'   indicates that the file is an archived experiment in the trackxf format (as
#'   generated by \code{\link{export_data}}). Default (\code{NA}) is to guess
#'   the format from the file extension.
#' @param interpolate This is passed to the \code{\link{read_path}} function and
#'   specifies whether missing data points will be interpolated when reading raw
#'   swim path data. Default is \code{FALSE}.
#' @param project.dir A directory path specifying where the files needed for
#'   processing the experiment are stored. Default (\code{NA}) means the project
#'   files are in the same directory as the experiment description (specified by
#'   \code{filename}). Ignored if \code{format = "trackxf"}.
#' @param data.dir A directory path specifying where the raw data are stored.
#'   All paths specified in the experiment description spreadsheet are
#'   interpreted as being relative to the \code{data.dir} directory. Default is
#'   the same directory as \code{project.dir}. Ignored if \code{format =
#'   "trackxf"}.
#' @param author.note Optional text describing the experiment. This might be
#'   useful if the data is to be published or otherwise shared. Appropriate
#'   information might be author names and a link to a publication or website.
#' @param threads The number of CPU threads/processes to run in parallel. The
#'   default is 1, which will use just one single thread. A value of 0 will try
#'   to use the maximum number of available cores (using multi-threading if
#'   available). Using all of the available threads/logical cores may not be
#'   sensible though, depending on your hardware. Note that for some Linux
#'   machines with multi-threading capabilities, the number of threads detected
#'   might be the same as the number of physical CPU cores. Negative values will
#'   start the default number of threads minus the given number.
#' @param verbose Should feedback be printed to the console. This is only useful
#'   for debugging and takes a little longer to run. Default is \code{FALSE}.
#'
#' @return An \code{rtrack_experiment} object containing a complete description
#'   of the experiment.
#'
#' @seealso \code{\link{read_path}}, \code{\link{read_arena}},
#'   \code{\link{identify_track_format}} to identify the format of your raw
#'   track files, and \code{\link{check_experiment}}.
#'
#' @examples
#' require(Rtrack)
#' experiment.description = system.file("extdata", "Minimal_experiment.xlsx",
#'   package = "Rtrack")
#' experiment = read_experiment(experiment.description)
#'
#' @importFrom tools file_ext
#' @importFrom zip zip_list unzip
#' @importFrom readxl read_excel
#' @importFrom utils read.csv read.table
#' @importFrom stats na.omit
#' @importFrom pbapply pboptions pblapply
#' @importFrom parallel makeForkCluster detectCores stopCluster
#' @importFrom rjson fromJSON
#' @importFrom stringi stri_enc_mark
#' @importFrom RCurl url.exists
#'
#' @export
read_experiment = function(filename, format = NA, interpolate = FALSE, project.dir = NA, data.dir = project.dir, author.note = "", threads = 1, verbose = FALSE){
	filepath = filename # Full path (if URL, then will be resolved to a temporary file).
	filename = basename(filename) # Just name.
	tempdir = tempdir()
	if(RCurl::url.exists(filepath)){ # From a URL, only an archive can be read.
		tempfile = file.path(tempdir, filename)
		utils::download.file(filepath, tempfile, method = "libcurl", mode = "wb")
		filepath = tempfile
	}else{ # On local filesystem
		if(is.na(project.dir)) project.dir = dirname(filepath)
		if(is.na(data.dir)) data.dir = dirname(filepath)
		if(unlist(strsplit(project.dir, ""))[nchar(project.dir)] != "/") project.dir = paste0(project.dir, "/")
		if(unlist(strsplit(data.dir, ""))[nchar(data.dir)] != "/") data.dir = paste0(data.dir, "/")
	}
	# Is it compressed? Only valid if trackxf (so archive only contains a single file with the same name as the archive (but without '.trackxf' extension)).
	if(suppressWarnings(tryCatch({zip::zip_list(filepath)$filename[1] == paste0(filename, ".json")}, error = function(e) FALSE ))){
		tempfile = paste0(filename, ".json")
		zip::unzip(filepath, files = tempfile, exdir = tempdir)
		filepath = file.path(tempdir, tempfile)
	}

	format = tolower(format)
	if(is.na(format)){
		file.header = suppressWarnings(tryCatch({readLines(filepath, n = 5)}, error = function(e) stop("Experiment file does not exist or is corrupt")))
		non.text = any(suppressWarnings(stringi::stri_enc_mark(file.header)) %in% c("bytes", "native"))
		trackxf.header = ifelse(!non.text, length(grep('rupertoverall\\.net\\/trackxf\\/trackxf_schema', file.header)), 0)
		#
		if(tools::file_ext(filename) %in% c("trackxf") | trackxf.header > 0){
			version.text = grep("rupertoverall\\.net\\/trackxf\\/trackxf_schema", file.header, value = TRUE)
			trackxf.version = gsub("^.*trackxf_schema_(v[0-9]*).*$", "\\1", version.text)
			if(trackxf.version == "v0"){
				format = "trackxf"
			}else{
				stop("This release of Rtrack does not support trackxf version '", trackxf.version, "'.") 
			}
		}else if(tools::file_ext(filename) %in% c("json")){
			format = "json"
		}else if(tools::file_ext(filename) %in% c("xls", "xlsx")){
			format = "excel"
		}else if(tools::file_ext(filename) %in% c("csv")){
			if(ncol(read.csv(filepath)) > 4){
				format = "csv"
			}else{
				format = "csv2"
			}
		}else if(tools::file_ext(filename) %in% c("tab", "tsv", "txt")){
			format = "tab"
		}else{
			stop("The file format cannot be established. Please specify the 'format' parameter.")
		}
	}
	
	available.cores = max(parallel::detectCores(logical = TRUE), na.rm = TRUE, 1)
	if(threads == 0){
		threads = available.cores
	}else if(threads < 0){ 
		threads = max(available.cores + threads, 1)
	} # Also, if threads is set higher than the number of jobs, then only n.jobs threads will be used (handled when cluster is created).
	progress = function(text){cat("    ", crayon::blue(text), "\n", sep = "")}
	experiment = NULL
	
	if(format == "trackxf"){
		progress(paste0("Restoring archived experiment."))
		experiment = read_trackxf(filepath, threads, verbose)
	}else	if(format == "json"){ # For backward compatibility. Deprecated.
		progress(paste0("Restoring archived experiment."))
		experiment = read_json(filepath, threads, verbose)
	}else{
		experiment.data = NULL
		experiment.info = NULL
		#
		if(format == "xls" | format == "xlsx" | format == "excel"){
			experiment.data = suppressMessages(as.data.frame(readxl::read_excel(filepath, col_types = 'text'), stringsAsFactors = F))
			rownames(experiment.data) = experiment.data$TrackID
		}else if(format == "csv"){
			experiment.data = utils::read.csv(filepath, header = TRUE, stringsAsFactors = F, check.names = FALSE)
			rownames(experiment.data) = experiment.data$TrackID
		}else if(format == "csv2"){
			experiment.data = utils::read.csv2(filepath, header = TRUE, stringsAsFactors = F, check.names = FALSE)
			rownames(experiment.data) = experiment.data$TrackID
		}else{
			experiment.data = utils::read.table(filepath, sep = "\t", header = TRUE, stringsAsFactors = F, check.names = FALSE)
			rownames(experiment.data) = experiment.data$TrackID
		}
		# Run check for required columns
		core.columns = c("_TargetID", "_Day", "_Trial", "_Arena", "_TrackFile", "_TrackFileFormat")
		check = core.columns %in% colnames(experiment.data)
		if(!all(check)) stop(paste0("The experiment description is missing the required column/s '", paste(c("_TargetID", "_Day", "_Trial", "_Arena", "_TrackFile", "_TrackFileFormat")[!check], collapse = "', '"), "'."))
		if("_TrackIndex" %in% colnames(experiment.data)) experiment.data$`_TrackIndex` = as.numeric(experiment.data$`_TrackIndex`)
		time.bounds = cbind(rep(NA, nrow(experiment.data)), rep(NA, nrow(experiment.data)))
		if(!"_PathStart" %in% colnames(experiment.data)) experiment.data$`_PathStart` = NA
		if(!"_PathEnd" %in% colnames(experiment.data)) experiment.data$`_PathEnd` = NA
		# Extract user columns to a data.frame of factors
		user.columns = colnames(experiment.data)[sapply(sapply(colnames(experiment.data), strsplit, ""), "[", 1) != "_"]
		user.factors = data.frame(experiment.data[, user.columns, drop = FALSE], stringsAsFactors = F)
		factors = data.frame(experiment.data[, c("_TargetID", "_Day", "_Trial", "_Arena")], user.factors, stringsAsFactors = F, check.names = F)
		arenas = sapply(stats::na.omit(unique(experiment.data[, "_Arena"])), simplify = F, USE.NAMES = T, function(arenafile){
			arena = read_arena(paste0(project.dir, arenafile))
			return(arena)
		})
		factors[, "_Arena"] = sapply(arenas[factors[, "_Arena"]], "[[", "id") # Use the id.
		factors[, "_Day"] = as.numeric(factors[, "_Day"])
		factors[, "_Trial"] = as.numeric(factors[, "_Trial"])
		experiment.data = experiment.data[, c("_TrackID", core.columns, "_PathStart", "_PathEnd", user.columns)] # Reorder.

		# Calculate metrics for whole experiment.
		if(threads == 1){ # Non-parallel code.
			progress("Processing tracks.")
			metrics = vector("list", nrow(experiment.data))
			pbapply::pboptions(type = "timer", txt.width = 50, style = 3, char = "=")
			pb = pbapply::startpb(min = 0, max = nrow(experiment.data))
			for(i in seq_len(nrow(experiment.data))){
				this.arena = arenas[[as.character(experiment.data[i, "_Arena"])]] # Pre-loaded
				this.path = read_path(paste0(data.dir, as.character(experiment.data[i, "_TrackFile"])), this.arena, id = as.character(experiment.data[i, "_TrackID"]), track.format = as.character(experiment.data[i, "_TrackFileFormat"]), interpolate = interpolate, time.bounds = unlist(experiment.data[i, c("_PathStart", "_PathEnd")]))   
				if(length(this.path$t) > 1){
					metrics[[i]] = calculate_metrics(this.path, this.arena)
				}else{
					metrics[[i]] = NULL
				}
				pbapply::setpb(pb, i)
			}
		}else{ # Parallel code (threads > 1).
			progress("Initialising cluster.")
			pbapply::pboptions(type = "timer", txt.width = 50, style = 3, char = "=")
			cluster = parallel::makePSOCKcluster(min(threads, nrow(experiment.data)))
			parallel::clusterExport(cl = cluster, list("project.dir", "data.dir", "interpolate", "arenas"), envir = environment())
			#. = parallel::clusterEvalQ(cl = cluster, require("Rtrack"))
			progress(paste0("Processing tracks using ", length(cluster), " threads."))
			metrics = pbapply::pbapply(experiment.data, 1, function(track){
				this.arena = Rtrack::read_arena(paste0(project.dir, as.character(track["_Arena"])))
				this.path = Rtrack::read_path(paste0(data.dir, as.character(track["_TrackFile"])), this.arena, id = as.character(track["_TrackID"]), track.format = as.character(track["_TrackFileFormat"]), interpolate = interpolate, time.bounds = unlist(track[c("_PathStart", "_PathEnd")]))
				if(length(this.path$t) > 1){
					return(Rtrack::calculate_metrics(this.path, this.arena))
				}else{
					return(NULL)
				}
			}, cl = cluster)
			parallel::stopCluster(cluster)
		}
		# Resize list to remove any missing data (e.g. from non-existent files)
		keep = !sapply(metrics, is.null)
		metrics = metrics[keep]
		names(metrics) = experiment.data[keep, "_TrackID"]
		factors = factors[keep, ]
		rownames(factors) = experiment.data[keep, "_TrackID"]
		info = list(
			author.note = author.note,
			processing.note = list(
				software = "Rtrack",
				version = as.character(utils::packageVersion("Rtrack")),
				date = format(Sys.time(), format = "%Y-%m-%dT%H:%M:%S%z"), # ISO 8601 
				url = "https://rupertoverall.net/Rtrack"
			),
			export.note = list(
				software = "",
				version = "",
				date = "", 
				url = ""
			)
		)
		experiment = list(metrics = metrics, factors = factors, summary.variables = names(metrics[[1]]$summary), info = info)
	}
	
	class(experiment) = "rtrack_experiment"
	return(experiment)
}