#' @name Check_spread_file
#' @title Check the readable of spread files
#' @importFrom readxl excel_format
#' @param fn fn
#' @return Logical
#' @export
#'
Check_spread_file <- function(fn){
if(excel_format(fn) != 'xls' & excel_format(fn) != 'xlsx')
stop("The selected input should be an Excel file with `xls` or `xlsx` format!")
return(TRUE)
}
#' @name Check_sheet_name
#' @title Check the format of sheet names of the spread
#'
#' @importFrom stringr str_detect
#' @importFrom cli cat_bullet
#' @importFrom readxl excel_sheets read_excel
#' @importFrom openxlsx read.xlsx
#' @importFrom magrittr %>%
#' @importFrom stringi stri_escape_unicode
#'
#' @param fn fn
#' @param dataset_format dataset_format (default as 1)
#' @param excel_option Which package used for opening the spread: `readxl` or `openxlsx`
#' @return List
#' @export
#' @encoding UTF-8
#'
Check_sheet_name <- function(fn, dataset_format = 1, excel_option = "readxl"){
cli_control("head", "Sheet name check")
cat("\n")
num_er <- num_wr <- 0
if(!Check_spread_file(fn)) stop("File check failed!")
Sheets_found <- excel_sheets(fn)
cli_control("info",
c("Following sheets are found: ", paste(Sheets_found, collapse = " ")))
if(dataset_format == 1){
Sheets_required <- c(stri_unescape_unicode("\\u57fa\\u672c\\u53c2\\u6570"),
"1.Rrs", "2.ap", "3.aph", "4.anap", "5.aCDOM", "6.apc")
general_name <- c("base", "Rrs", "ap", "aph", "anap", "aCDOM", "apc")
base_colnames_type <- dataset_format_1$base_info[,"base_colnames_type"]
}
# showNonASCIIfile(file = './R/DAMATO.R')
Sheets_used <- NULL
for(Sheet in Sheets_required){
w <- stringi::stri_detect_regex(Sheets_found, Sheet, case_insensitive = TRUE)
if(!any(w)){
cli_control("error", c("[", Sheet, "] does not detected!"))
num_er = num_er + 1
stop()
}
if(sum(w) > 1){
cli_control("error",
c(Sheet, " should be only one matched element! ",
"But [", paste(Sheets_found[w], collapse = " "), "] were found!"))
num_er = num_er + 1
stop()
}
# Sheets_used <- c(Sheets_used, Sheet)
Sheets_used <- c(Sheets_used, Sheets_found[w])
}
cli_control("right",
c("Following sheets are used for the next step: ", paste(Sheets_used, collapse = " ")))
dt <- list()
# for(Sheet in Sheets_used){
# if(Sheet == stri_unescape_unicode("\\u57fa\\u672c\\u53c2\\u6570") | Sheet == "base"){
# dt[[Sheet]] <- read_excel(fn, sheet=Sheet, col_types = base_colnames_type)
# }else{
# dt[[Sheet]] <- read_excel(fn, sheet=Sheet, col_types = "numeric")
# }
# }
for(Sheet in Sheets_used){
if(excel_option == "openxlsx") {
dt[[Sheet]] <- openxlsx::read.xlsx(fn, sheet = Sheet)
} else if(excel_option == "readxl") {
dt[[Sheet]] <- readxl::read_excel(fn, sheet = Sheet)
} else {
stop("Error input `excel_option`! Use `openxlsx` or `readxl`")
}
}
names(dt) <- general_name
cli_control("info",
c("To simplify the follow-up process, change the sheet name to: ",
paste(general_name, collapse = " ")))
# To remove the return line of colnames
if(any(str_detect(names(dt[['base']]), "\\r\\n"))){
old_name <- names(dt[['base']])
names(dt[['base']]) <- names(dt[['base']]) %>% gsub("\\r\\n",'', .)
cli_control("warning",
c("We found carriage return or newline in the sheet [base]: ",
paste(names(dt[['base']])[which(str_detect(old_name ,"\\r\\n"))], collapse = " ")))
cli_control("warning", "Fully recommand to use only one line character to describe colnames in [base].")
cli_control("warning", "Anyway, the carriage return or newline symbols are removed in this process!")
num_wr = num_wr + 1
}
# final report
Status <- ifelse(num_er == 0, "Pass", "Error")
col <- ifelse(num_er == 0, {ifelse(num_wr == 0, "green", "yellow")}, "red")
cat("\n")
cli_control("status", c(
"Sheet name status: [", Status,"] ", "Error: [", num_er, "] Warning: [", num_wr, "]"
), col_status = col)
cat("\n")
cli_control("end", "Sheet name check")
result <- list(
fn = fn,
Sheets_used = Sheets_used,
general_name = general_name,
dt = dt,
Status_sheet_name = Status
)
return(result)
}
#' @name Check_sample_id
#' @title Check the format of sample ids of the spread
#'
#' @importFrom cli cat_bullet
#' @importFrom stringr str_detect str_match str_c
#' @importFrom stringi stri_unescape_unicode
#'
#' @param input input
#' @param dataset_format dataset_format (default as 1)
#' @param fix_Rrs_sample_id fix_Rrs_sample_id (default as \code{TRUE})
#' @return List
#'
#' @export
#' @encoding UTF-8
#'
Check_sample_id <- function(input, dataset_format = 1, fix_Rrs_sample_id = TRUE){
# sample id should be checked into twp parts
# 1) generally, the number of samples should follow some rules;
# 2) for each sheet, check the string of samples about unmatched regrexp pattern such as "(" or "["
# symbols: good-tick green; error-cross red; warning-bullet yellow
cli_control("head", "Sample ID check")
cat("\n")
num_er <- num_wr <- 0
dt <- input$dt
general_name <- input$general_name
list_sample_id <- list()
# SampleID head
for(name in names(dt)){
dt_sub <- dt[[name]]
if(name == general_name[1] | name == stri_unescape_unicode("\\u57fa\\u672c\\u53c2\\u6570")){ # for base sheet
pattern <- str_c(stri_unescape_unicode("\\u7f16\\u53f7"),"|",stri_unescape_unicode("\\u901a\\u7528"),"|SampleID") #Bianhao | Tongyong
w <- str_detect(names(dt_sub), pattern) %>% which()
if(sum(w) == 1){
list_sample_id[[name]] <- dt_sub[, w] %>% as.matrix %>% c
cli_control("right", c("'SampleID' found in [", name, "]"))
}else{
cli_control("error", c("'SampleID' CANNOT found in [", name, "]"))
num_er = num_er + 1
}
}else{ # for following sheets
if(!all(str_detect(names(dt_sub), pattern))){
cli_control("warning", c("'SampleID' CANNOT be found in [", name, "]."))
cli_control("warning", "The first colname is used here")
cli_control("warning", c("Recommand to change '", names(dt_sub)[1], "' as 'SampleID'."))
num_wr = num_wr + 1
}
wrong_pattern <- str_c(stri_unescape_unicode("\\u5907\\u6ce8"),
'|Note|note|\\[|\\]|\\{|\\}|\\(|\\)|',
stri_unescape_unicode("\\uff08"),'|',
stri_unescape_unicode("\\uff08"),'|-')
if(any(str_detect(names(dt_sub)[-1], wrong_pattern))){
cli_control("error",
c("Found '",
paste(names(dt_sub)[-1][str_detect(names(dt_sub)[-1], wrong_pattern)], collapse = " "),
"' in colnames of [", name,
"] with the wrong format 'YYYYMMXXNN_Z'. They have been removed. Please check and re-submit!"))
num_er = num_er + 1
w <- which(str_detect(names(dt_sub), str_c(stri_unescape_unicode("\\u5907\\u6ce8"),'|Note|note')))
list_sample_id[[name]] <- names(dt_sub)[c(-1, -w)]
}else{
list_sample_id[[name]] <- names(dt_sub)[-1]
}
}
}
# SampleID details - format
length_sample_id <- lapply(list_sample_id, length) %>% as.data.frame
if(names(length_sample_id)[which.max(length_sample_id)] != 'base'){
cli_control("error",
c("Error! Full sample id was selected by the sheet [",
names(length_sample_id)[which.max(length_sample_id)], '].',
" It shoud be [base]!"))
sample_id_full <- list_sample_id[['base']]
num_er = num_er + 1
}else{
cli_control("okay", c(
"Full sample id was selected by the sheet [", names(length_sample_id)[which.max(length_sample_id)], '].'
))
sample_id_full <- list_sample_id[[which.max(length_sample_id)]]
}
# The sample id definition of base and Rrs is a kind of different. Have to check.
w = which(!(list_sample_id$Rrs %in% list_sample_id$base))
if(length(w) != 0){
cli_control("error",c(
"In Rrs sheet, the detected sample id(s) are different from the base: ", paste(list_sample_id$Rrs[w], collapse = " ")
))
for(i in 1:length(w)){
w_maybe = which(str_detect(list_sample_id$base, str_c(list_sample_id$Rrs[w][i],"_1")))
if(length(w_maybe) == 0){
cat_bullet("No.", i, " Cannot find matched '", list_sample_id$Rrs[w][i], "' from the base sample id!",
bullet = "radio_on", bullet_col = "red", col = 'red')
stop("Check process exist! See info and resubmit!")
}else{
cat_bullet("No.", i, " Is '", list_sample_id$Rrs[w][i], "'matched with '",
paste(list_sample_id$base[w_maybe], collapse = ", ") ,"' from the base sample id?",
bullet = "radio_on", bullet_col = "yellow", col = 'yellow')
if(fix_Rrs_sample_id & length(w_maybe) == 1){
colnames(dt$Rrs)[-1][w][i] <- str_c(list_sample_id$Rrs[w][i],"_1") # replace the former one
cli_control("okay", c(
"Automatically modified as ", str_c(list_sample_id$Rrs[w][i],"_1")
))
}else{
stop("Check process exist! See info and resubmit!")
}
}
}
}
# Detect whether the input spread satisfy the format of the required.
for(i in names(length_sample_id)[!{names(length_sample_id) %in% c("base","Rrs")}]){
sample_missing <- sample_id_full %in% list_sample_id[[i]] %>% {!.} %>% sample_id_full[.] %>%
paste0(., collapse = ", ")
if(sample_missing == ""){
cli_control("right", c(
"Sheet [", i, '] includes same samples with [base]!'
))
}else{
cli_control("error", c(
"Sheet [", i, '] includes missing sample ids: ', paste(sample_missing, collapse = " ")
))
num_er = num_er + 1
}
}
# final report
Status <- ifelse(num_er == 0, "Pass", "Error")
col <- ifelse(num_er == 0, {ifelse(num_wr == 0, "green", "yellow")}, "red")
cat("\n")
cli_control("status", c(
"Sample ID status: [", Status,"] ", "Error: [", num_er, "] Warning: [", num_wr, "]"
), col_status = col)
cat("\n")
cli_control("end", "Sample ID check")
result <- input
result$Status_sample_id = Status
result$dt = dt
return(result)
}
#' @name Check_base_info
#' @title Check the format of sheet 'base'
#' @importFrom cli cat_bullet
#' @importFrom readxl excel_sheets cell_cols
#' @import stringr
#' @import stringi
#' @import dplyr
#' @param input input
#' @param dataset_format dataset_format (default as 1)
#' @param as_EN_colnames Convert colnames to EN format (default as \code{TRUE})
#' @return List
#' @export
#'
Check_base_info <- function(input, dataset_format = 1, as_EN_colnames = TRUE){
cli_control("head", "Base info check")
cat("\n")
num_er <- num_wr <- 0
dt = input$dt
dt_base <- dt[['base']]
name_ <- colnames(dt_base)
unit <- str_extract_all(name_, "(\\[.*\\])", simplify = T) %>% c
name <- str_replace_all(name_, "(\\[.*\\])", "")
cli_control("okay", c(
"Found colnames (unit removed) are: ", paste(name, collapse = ", ")
))
if(dataset_format == 1){
base_colnames_CN <- dataset_format_1$base_info[,"base_colnames_CN"]
base_colnames_EN <- dataset_format_1$base_info[,"base_colnames_EN"]
base_colnames_type = dataset_format_1$base_info[,"base_colnames_type"]
}
cli_control("info", c(
"Formatted colnames (CN): ", paste(base_colnames_CN, collapse = ", ")
))
cli_control("info", c(
"Formatted colnames (EN): ", paste(base_colnames_EN, collapse = ", ")
))
# The function will detect the out-formatted columns in the raw spread and stop checking
# The user requires to inspect these columns manually.
if(length(name) > length(base_colnames_CN)){
w = ((str_to_lower(name) %in% str_to_lower(base_colnames_CN)) | (str_to_lower(name) %in% str_to_lower(base_colnames_EN)))
w = which(!w)
cli_control("error",c(
paste(name[w], collapse = ", "), " found in the current spread. ",
"They are not required colnames. If you want to save them, just keep them into a new sheet. ",
"Or if you confirm these parameters as the often-used. Please contact me to modify the code. ",
"Now please check and re-submit!"
))
stop("Exit check process!")
}
w = which(!{{str_to_lower(base_colnames_CN) %in% str_to_lower(name)} | {str_to_lower(base_colnames_EN) %in% str_to_lower(name)}})
if(length(w) == 0){
cli_control("right", "Great! All required colnames in [base] are detected!")
w_sort = seq(length(base_colnames_CN))
for(i in 1:length(w_sort)){
w_sort[i] = which(base_colnames_CN[i] == name | str_to_lower(base_colnames_EN[i]) == str_to_lower(name))
}
dt_base <- dt_base[, w_sort]
if(as_EN_colnames){
names(dt_base) <- base_colnames_EN
cli_control("info", "Colnames of [base] are converted to EN characters.")
}else{
names(dt_base) <- base_colnames_CN
cli_control("info", "Colnames of [base] are converted to CN characters.")
}
}else{
cli_control("error",c(
paste(base_colnames_CN[w], collapse = ", "), ' (or ',
paste(base_colnames_EN[w], collapse = ", "),
") CANNOT be found in [base] correctly. Please check and re-submit!"
))
num_er = num_er + 1
cat(" Try to find the matched pattern (just guess) ...\n")
for(i in 1:length(w)){
w_CN = which(!is.na(str_match(name, base_colnames_CN[w[i]] %>%
stri_split_boundaries(., type="word", skip_word_none = TRUE, simplify = T) %>%
str_c(., collapse = "|"))))
w_EN = which(!is.na(str_match(str_to_lower(name),
str_to_lower(base_colnames_EN[w[i]]))))
w_ = union(w_CN, w_EN)
if(length(w_) != 0){
cat_bullet("No.", i, " Is '", base_colnames_CN[w][i], "' or ('", base_colnames_EN[w][i],
"') matched with '", paste(name[w_], collapse = ", ") ,"' from the input colnames?",
bullet = "radio_on", bullet_col = "yellow", col = 'yellow')
}else{
cat_bullet("No.", i, " Could not find matched patterns for '",
base_colnames_CN[w][i], "' or ('", base_colnames_EN[w][i], "')", " from the input colnames.",
bullet = "radio_off", bullet_col = "red", col = "red")
}
}
}
# final report
Status <- ifelse(num_er == 0, "Pass", "Error")
col <- ifelse(num_er == 0, {ifelse(num_wr == 0, "green", "yellow")}, "red")
# If base info check pass, the base data should be re-read as the col_types should be defined again.
if(Status == "Pass"){
Sheets = excel_sheets(input$fn)
w_sheet = which(Sheets == stri_unescape_unicode("\\u57fa\\u672c\\u53c2\\u6570") | Sheets == "base")
dt_base <- read_excel(input$fn, sheet=w_sheet,
cell_cols(w_sort),
col_types = base_colnames_type)[, w_sort]
if(as_EN_colnames){
names(dt_base) <- base_colnames_EN
cli_control("info", "Colnames of [base] are converted to EN characters.")
}else{
names(dt_base) <- base_colnames_CN
cli_control("info", "Colnames of [base] are converted to CN characters.")
}
}
cat("\n")
cli_control("status", c(
"Base info status: [", Status,"] ", "Error: [", num_er, "] Warning: [", num_wr, "]"
), col_status = col)
cat("\n")
cli_control("end", "Base info check")
result <- input
result$dt$base <- dt_base
result$Status_base_info = Status
result$dt$unit <- data.frame(name = name, unit = unit, stringsAsFactors = FALSE)
result$base_colnames_CN = base_colnames_CN
result$base_colnames_EN = base_colnames_EN
result$as_EN_colnames = as_EN_colnames
return(result)
}
#' @name Check_format
#' @title Main function to check the spread format using Check_sheet_name, Check_sample_id,
#' and Check_base_info.
#'
#' @param fn Filename
#' @param dataset_format The format of dataset to be checked.
#' @param as_EN_colnames Convert colnames to EN format (default as \code{TRUE})
#' @return A list
#' @export
#'
#' @importFrom cli cat_boxx
#'
Check_format <- function(fn,
dataset_format = 1,
as_EN_colnames = TRUE){
# cat_boxx("Format check begin!")
res_1 = Check_sheet_name(fn, dataset_format = dataset_format)
cat("\n\n")
# cat_boxx("Check_sheet_name() finished!")
res_2 = Check_sample_id(res_1, dataset_format = dataset_format)
cat("\n\n")
# cat_boxx("Check_sample_id() finished!")
res_3 = Check_base_info(res_2, dataset_format = dataset_format, as_EN_colnames = as_EN_colnames)
cat("\n\n")
# cat_boxx("Check_base_info() finished!")
return(res_3)
}
# files <- lapply(list.files(system.file('extdata', package = 'DAMATO'), full.names = TRUE), read.csv)
Check_folder <- function(){
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.