#' Check consistency of data in a project
#'
#' @param project path to the project directory
#' @param nmax maximum number of problematic items to print out (50 by default)
#'
#' @details
#' This function
#' \enumerate{
#' \item Reads the metadata table of the project
#' \item Lists and reads all \code{dat1.pid} files in the \code{work} directory
#' \item Lists and reads all \code{dat1.tx} files in \code{PID_process/Pid_results/Dat1_validated} (or \code{Dat1_extracted} if the previous one is empty)
#' \item Lists all images in the \code{PID_process/Sorted_vignettes} directory
#' }
#' Then it
#' \enumerate{
#' \item Compares the list of samples in the metadata, \code{dat1.pid}, and \code{dat1.txt} files
#' \item Compares the list of objects in the \code{dat1.pid}, \code{dat1.txt} files, and present as images (i.e. vignettes)
#' \item For images present both in the \code{dat1.txt} files and as vignettes, checks that the location of a vignette (its classification) matches the last column in the \code{dat1.txt} file
#' }
#'
#' @return
#' Returns a list (of class \code{check}) containing all problematic items (sample or object names) that can be printed with various levels of detail (depending on \code{nmax})
#'
#' @export
check_project <- function(project) {
project <- project_class(project)
check_project_dispatcher(project=project)
}
check_project_dispatcher <- function(project) {
UseMethod("check_project_dispatcher")
}
#' @importFrom stringr str_c str_replace str_split
#' @importFrom plyr ldply
check_project_dispatcher.zooscan <- function(project) {
# message("CHECK SAMPLES")
# message("Gather data")
# list all data referencing samples
meta <- read_meta(project)
pid_files <- list.files(str_c(project, "/Zooscan_scan/_work"), pattern=glob2rx("*.pid"), full=TRUE, recursive=TRUE)
dat1_files <- list.files(str_c(project, "/PID_process/Pid_results/Dat1_validated"), pattern=glob2rx("*_dat1.txt"), full=TRUE)
if (length(dat1_files) == 0) {
dat1_files <- list.files(str_c(project, "/PID_process/Pid_results/Dat1_extracted"), pattern=glob2rx("*_dat1.txt"), full=TRUE)
dat1_files <- dat1_files[!str_detect(dat1_files, fixed("/Analysis_"))]
}
# get sample identifiers from names
pids <- str_replace(basename(pid_files), "_dat1.pid", "")
dat1 <- str_replace(basename(dat1_files), "_dat1.txt", "")
in_meta_no_pid <- setdiff(meta$id, pids)
attr(in_meta_no_pid, "message") <- "Sample in meta but no .pid file"
in_meta_no_dat1 <- setdiff(meta$id, dat1)
attr(in_meta_no_dat1, "message") <- "Sample in meta but no .txt file"
pid_not_in_meta <- str_replace(pid_files[which(! pids %in% meta$id)], project, "")
attr(pid_not_in_meta, "message") <- "Sample with .pid file but not in meta"
dat1_not_in_meta <- str_replace(dat1_files[which(! dat1 %in% meta$id)], project, "")
attr(dat1_not_in_meta, "message") <- "Sample with .txt file but not in meta"
pid_no_dat1 <- setdiff(pids, dat1)
attr(pid_no_dat1, "message") <- "Sample with .pid file but no .txt file"
dat1_no_pid <- setdiff(dat1, pids)
attr(dat1_no_pid, "message") <- "Sample width .txt file but no .pid file"
# message("\nCHECK OBJECTS")
# message("Gather data")
# read pid and dat1 data records
dpids <- ldply(pid_files, read_pid, .inform=TRUE)
ddat1 <- ldply(dat1_files, read_pid, .inform=TRUE)
# get image names from records
dpids$img <- str_c(dpids$Label, "_", dpids$Item)
ddat1$img <- str_c(ddat1$Label, "_", ddat1$Item)
# get images on disk
img_files <- list.files(str_c(project, "/PID_process/Sorted_vignettes"), pattern=glob2rx("*.jpg"), recursive=TRUE)
imgs <- str_replace(basename(img_files), ".jpg", "")
# Check image presence
pid_no_img <- setdiff(dpids$img, imgs)
attr(pid_no_img, "message") <- "Object referenced in .pid files but no vignette"
dat1_no_img <- setdiff(ddat1$img, imgs)
attr(dat1_no_img, "message") <- "Object referenced in .txt file but no vignette"
img_not_in_pid <- img_files[! imgs %in% dpids$img]
attr(img_not_in_pid, "message") <- "Vignette present but no object in .pid files"
img_not_in_dat1 <- img_files[! imgs %in% ddat1$img]
attr(img_not_in_dat1, "message") <- "Vignette present but no object in .txt files"
# Check identifications
# reduce to vignettes on disk and in dat1 (and keep order)
ddat1_common <- ddat1[ddat1$img %in% imgs,]
img_files_common <- img_files[match(ddat1_common$img, imgs)]
# extract identifications
img_ids <- str_split(img_files_common, "/")
img_ids <- sapply(img_ids, function(x) {x[length(x)-1]})
ids <- ddat1_common[,c("img", "Valid")]
names(ids) <- c("object", "from_dat1")
ids$from_img <- img_ids
# detect problems and report them
wrong_ids <- ids[ids$from_dat1 != ids$from_img,]
if ( nrow(wrong_ids) > 0) {
wrong_ids <- str_c(wrong_ids$object, " should be ", wrong_ids$from_dat1, " but vignette is in ", wrong_ids$from_img)
} else {
wrong_ids <- character()
}
attr(wrong_ids, "message") <- "Identification mismatch between .txt files and vignette location"
out <- list(
in_meta_no_pid=in_meta_no_pid,
in_meta_no_dat1=in_meta_no_dat1,
pid_not_in_meta=pid_not_in_meta,
dat1_not_in_meta=dat1_not_in_meta,
pid_no_dat1=pid_no_dat1,
dat1_no_pid=dat1_no_pid,
pid_no_img=pid_no_img,
dat1_no_img=dat1_no_img,
img_not_in_pid=img_not_in_pid,
img_not_in_dat1=img_not_in_dat1,
wrong_ids=wrong_ids
)
class(out) <- c("checklist", class(out))
return(out)
}
#' @importFrom plyr l_ply
#' @export
#' @rdname check_project
print.checklist <- function(x, nmax=50) {
l_ply(x, function(X) {
n <- length(X)
if (n > 0) {
title <- attr(X, "message")
message("\n", title, " (",n ," elements)")
if (n > nmax) {
X <- X[1:nmax]
}
X <- str_c(" ", X, collapse="\n")
if ( n > nmax) {
X <- str_c(X, "\nand ", (n - nmax), " others...")
}
message(X)
}
})
return(invisible(x))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.