Nothing
#' @title ID Check
#' @description This function checks that the first column of the data set is the primary ID for each participant labeled as `SUBJECT_ID`, that values contain no illegal characters or padded zeros, and that each participant has an ID.
#' @param DS.data Data set.
#' @param verbose When TRUE, the function prints the Message out, as well as more detailed diagnostic information.
#' @return Tibble, returned invisibly, containing: (1) Time (Time stamp); (2) Name (Name of the function); (3) Status (Passed/Failed); (4) Message (A copy of the message the function printed out); (5) Information (Detailed information about the four ID checks that were performed).
#' @export
#' @details Subject IDs should be an integer or string value. Integers should not have zero padding. IDs should not have spaces. Specifically, only the following characters can be included in the ID: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). All IDs should be filled in (i.e., no misisng IDs are allowed).
#' @seealso \code{\link{id_first_data}}
#' @seealso \code{\link{id_first_dict}}
#' @examples
#' # Example 1: Fail check, 'SUBJECT_ID' not present
#' data(ExampleO)
#' id_check(DS.data.O)
#' print(id_check(DS.data.O, verbose=FALSE))
#'
#' # Example 2: Fail check, 'SUBJECT_ID' includes illegal spaces
#' data(ExampleP)
#' id_check(DS.data.P)
#' results <- id_check(DS.data.P)
#' results$Information[[1]]$details
#' print(id_check(DS.data.P, verbose=FALSE))
#'
#' # Example 3: Pass check
#' data(ExampleA)
#' id_check(DS.data.A)
#' print(id_check(DS.data.A, verbose=FALSE))
id_check <- function(DS.data, verbose=TRUE){
# Function contains 5 checks to meet dbGaP requirements that "The primary
# ID in each subject phenotypes file should be the SUBJECT_ID."
# 1. Is column 1 SUBJECT_ID?
# 2. If not, is SUBJECT_ID in the data set?
# 3. SUBJECT_ID can contain only the following characters: English letters,
# Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@),
# and the pound sign (#).
# 4. Zero padding is not allowed for SUBJECT_ID.
# 5. All IDs are filled in (missing values not allowed).
# Check 1: Column 1 is labeled as 'SUBJECT_ID'
x <- names(DS.data)[1]
y <- which(names(DS.data)=="SUBJECT_ID")
if (x=="SUBJECT_ID"){
check1 <- TRUE
check2 <- TRUE
} else {
check1 <- FALSE
# Check 2: If 'SUBJECT_ID' is not the name of column 1,
# is SUBJECT_ID any column name in the data set?
if (c("SUBJECT_ID") %in% names(DS.data)) {
check2 <- TRUE
} else {
check2 <- FALSE
}
}
# Create information reports
# Check 1
check.name <- "Check 1"
check.description <- "Column 1 is labeled as 'SUBJECT_ID'."
if (check1==TRUE) {
check.status <- "Passed"
details <- paste0("The first column name is ", x, ".")
} else {
check.status <- "Failed"
details <- paste0("The first column name is ", x, ". The name of the first column should be 'SUBJECT_ID'.")
}
check1.final <- tibble(check.name, check.description, check.status, details)
# Check 2
check.name <- "Check 2"
check.description <- "'SUBJECT_ID' is a column name in the data set."
if (check2==TRUE) {
check.status <- "Passed"
if (check1==FALSE) {
details <- paste0("'SUBJECT_ID' is the name of column ", y, ". Please reoder data set so 'SUBJECT_ID' is the name of column 1.")
} else {
details <- paste0("'SUBJECT_ID' is the name of column ", y, ".")
}
} else {
check.status <- "Failed"
details <- paste0("'SUBJECT_ID' is not a column name in this data set.")
}
check2.final <- tibble(check.name, check.description, check.status, details)
# Check 3: If checks 1 or 2 pass, check that there are no illegal characters
# within the 'SUBJECT_ID' formatting
check.name <- "Check 3"
if (check1==TRUE | check2==TRUE) {
#trouble_rows <- grep(" ", DS.data$SUBJECT_ID)
trouble_rows <- grep(pattern = "([^A-Za-z0-9\\.\\-_@#])", DS.data$SUBJECT_ID, perl=TRUE, value=TRUE)
if (length(trouble_rows)==0) {
check3 <- TRUE
check.status <- "Passed"
details <- "No illegal characters detected in 'SUBJECT_ID'."
} else {
check3 <- FALSE
check.status <- "Failed"
details <- paste0("Illegal characters detected in 'SUBJECT_ID' for ", length(trouble_rows), " row(s). SUBJECT_ID may contain only: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). No spaces or other characters are allowed.")
}
} else {
check3 <- FALSE
check.status <- "Failed"
details <- "Checks 1 and 2 failed, so Check 3 was not performed."
}
check3.final <- tibble(check.name, check.description, check.status, details)
# Check 4: If checks 1 or 2 pass, check that there are no padded zeros
# within the 'SUBJECT_ID' formatting
check.name <- "Check 4"
check.description <- "No leading zeros detected in 'SUBJECT_ID' column."
if (check1==TRUE | check2==TRUE) {
trouble_rows <- grep("^0", DS.data$SUBJECT_ID)
if (length(trouble_rows)==0) {
check4 <- TRUE
check.status <- "Passed"
details <- paste0("No leading zeros detected in 'SUBJECT_ID'.")
} else {
check4 <- FALSE
check.status <- "Failed"
details <- paste0("Leading zeros detected in 'SUBJECT_ID' for ", length(trouble_rows), " row(s).")
}
} else {
check4 <- FALSE
check.status <- "Failed"
details <- "Checks 1 and 2 failed, so Check 4 was not performed."
}
check4.final <- tibble(check.name, check.description, check.status, details)
# Check 5: If checks 1 or 2 pass, check that all IDs are filled in.
check.name <- "Check 5"
check.description <- "No missing values for 'SUBJECT_ID'."
if (check1==TRUE | check2==TRUE) {
trouble_rows <- which(is.na(DS.data$SUBJECT_ID))
if (length(trouble_rows)==0) {
check5 <- TRUE
check.status <- "Passed"
details <- paste0("No missing values detected for 'SUBJECT_ID'.")
} else {
check5 <- FALSE
check.status <- "Failed"
details <- paste0("Missing values detected in 'SUBJECT_ID' for ", length(trouble_rows), " row(s).")
}
} else {
check5 <- FALSE
check.status <- "Failed"
details <- "Checks 1 and 2 failed, so Check 5 was not performed."
}
check5.final <- tibble(check.name, check.description, check.status, details)
# Create information summary
Information <- bind_rows(check1.final, check2.final, check3.final, check4.final, check5.final)
# Compile report for return to user
Time <- Sys.time()
Function <- "id_check"
if (all(Information$check.status=="Passed")) {
Status <- "Passed"
Message <- c("Passed: All ID variable checks passed.")
return_to_user <- lst(Message, Information)
} else {
Status <- "Failed"
Message <- c("ERROR: not all ID variable requirements are met. See Information for more details.")
return_to_user <- lst(Message, Information)
}
report <- tibble(Time, Function, Status, Message, Information=lst(Information))
# If verbose=TRUE, print message
if (verbose==TRUE){
print(return_to_user)
}
# Invisible return of report
return(invisible(report))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.