#' Title
#'
#' @param df_patients tibble
#'
#' @export
check_patient_names <- function(df_patients) {
# Get patients table from SQL
cn <- connect_db("mysql_covid_seq")
on.exit(DBI::dbDisconnect(cn))
sql_patients <- dplyr::tbl(cn, "patients")
# To lower and remove existent patient ids
df_patients <- df_patients %>%
dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
stringr::str_to_lower(x)
})) %>%
dplyr::filter(!patient_id %in% (sql_patients %>% dplyr::pull(patient_id)))
# nrow of input data and output table
df_rows <- nrow(df_patients)
checks_df <- dplyr::tibble()
# Check patient_id
checks_df <- df_patients %>%
dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_id",
message = "Invalid patient_id or already present in the database",
out_table = checks_df)
# Check host
checks_df <- df_patients %>%
dplyr::filter(host %in% c("homo_sapiens", "felis_catus", "canis_lupus", "panthera_leo")) %>%
tidyr::drop_na(host) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "host",
message = "Invalid host", ### Report in which samples invalid_hosts are found
### report which hosts are valid
out_table = checks_df)
# Check host_comment
checks_df <- df_patients %>%
tidyr::replace_na(list(host_comment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(host_comment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "host_comment",
message = "host_comment must be text or empty",
out_table = checks_df)
# Check gender
checks_df <- df_patients %>%
dplyr::filter(gender %in% c("male", "female", "unknown")) %>%
tidyr::drop_na(gender) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "gender",
message = "Invalid gender",
out_table = checks_df)
# Check age
checks_df <- df_patients %>%
dplyr::filter(age %in% 0:130) %>%
tidyr::drop_na(age) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "age",
message = "Invalid age",
out_table = checks_df)
# Check age units
checks_df <- df_patients %>%
dplyr::filter(age_units %in% c("years", "months")) %>%
tidyr::drop_na(age_units) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "age_units",
message = "Invalid age_units",
out_table = checks_df)
# Check residence_location
checks_df <- df_patients %>%
tidyr::drop_na(residence_location) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "residence_location",
message = "NAs in residence_location", ### location maybe not known, use unknown in those cases
out_table = checks_df)
# Check infection_location
checks_df <- df_patients %>%
tidyr::drop_na(infection_location) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "infection_location",
message = "NAs in infection_location", ### What does infection_location refer to?
out_table = checks_df)
# Check location_comment
checks_df <- df_patients %>%
tidyr::replace_na(list(location_comment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(location_comment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "location_comment",
message = "location_comment must be text or empty",
out_table = checks_df)
# Check patient_status
opts <- c("home","hospitalized","released","live","deceased","uci","unknown")
checks_df <- df_patients %>%
dplyr::filter(patient_status %in% opts) %>%
tidyr::drop_na(patient_status) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_status",
message = "Invalid patient_status", ### Report which samples have this field invalid
out_table = checks_df)
# Check outbreak
checks_df <- df_patients %>%
tidyr::replace_na(list(outbreak = "text to detect")) %>%
dplyr::filter(stringr::str_detect(outbreak, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "outbreak",
message = "outbreak must be text or empty",
out_table = checks_df)
# Check vaccinated
checks_df <- df_patients %>%
dplyr::filter(vaccinated %in% c("true", "false", "unknown")) %>%
tidyr::drop_na(vaccinated) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "vaccinated",
message = "Invalid vaccinated (true, false or unknown)",
out_table = checks_df)
# Check vaccinated_date
checks_df <- df_patients %>%
nrow() %>% #### Need to make sure we can import usual date formats.
report_checks(original_nrows = df_rows,
var = "vaccinated_date",
message = "Invalid vaccinated_date",
out_table = checks_df)
# Check treatment
checks_df <- df_patients %>%
tidyr::replace_na(list(treatment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(treatment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "treatment",
message = "treatment must be text or empty",
out_table = checks_df)
# Check clinical_trial_name
checks_df <- df_patients %>%
tidyr::replace_na(list(clinical_trial_name = "text to detect")) %>%
dplyr::filter(stringr::str_detect(clinical_trial_name, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "clinical_trial_name",
message = "clinical_trial_name must be text or empty",
out_table = checks_df)
checks_df <- checks_df %>%
dplyr::mutate(idx = 1:nrow(.)) %>%
dplyr::arrange(dplyr::desc(idx)) %>%
dplyr::select(-idx)
}
#' Title
#'
#' @param df_samples tibble
#'
#' @export
check_samples_names <- function(df_samples) {
# Get patients table from SQL
cn <- connect_db("mysql_covid_seq")
on.exit(DBI::dbDisconnect(cn))
sql_samples <- dplyr::tbl(cn, "samples")
# To lower
df_samples <- df_samples %>%
dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
stringr::str_to_lower(x)
}))
# nrow of input data and output table
df_rows <- nrow(df_samples)
checks_df <- tibble::tibble()
# Check original_sample_id
checks_df <- df_samples %>%
dplyr::filter(!original_sample_id %in% (sql_samples %>% dplyr::pull(original_sample_id))) %>%
tidyr::drop_na(original_sample_id) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "sample_id",
message = "Invalid sample_id or already present in the database",
out_table = checks_df)
# Check patient_id
checks_df <- df_samples %>%
dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
tidyr::drop_na(patient_id) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_id",
message = "Invalid patient_id",
out_table = checks_df)
# Check sample_shared_type
opts <- c("saliva", "rna", "cdna", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
checks_df <- df_samples %>%
dplyr::filter(sample_shared_type %in% opts) %>%
tidyr::drop_na(sample_shared_type) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "sample_shared_type",
message = "Invalid sample_shared_type",
out_table = checks_df)
# Check original_source
opts <- c("saliva", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
checks_df <- df_samples %>%
dplyr::filter(original_source %in% opts) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "original_source",
message = "Invalid original_source",
out_table = checks_df)
# Check collection_date
checks_df <- df_samples %>%
tidyr::drop_na(collection_date) %>%
nrow() %>% ### Need to make sure we import usual date formats
report_checks(original_nrows = df_rows,
var = "collection_date",
message = "Missing values on collection_date",
out_table = checks_df)
# Check pcr_cycle_threshold
checks_df <- df_samples %>%
dplyr::filter(pcr_cycle_threshold %in% 0:40) %>%
tidyr::drop_na(pcr_cycle_threshold) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "pcr_cycle_threshold",
message = "Missing or invalid pcr_cycle_threshold",
out_table = checks_df)
# Check submitting_lab_name
checks_df <- df_samples %>%
dplyr::filter(submitting_lab_name %in% c("cr_microbiology")) %>%
tidyr::drop_na(submitting_lab_name) %>%
nrow() %>% ### Lab Name should be inferred from username, info should be queried upon registration
report_checks(original_nrows = df_rows,
var = "submitting_lab_name",
message = "Invalid submitting_lab_name",
out_table = checks_df)
# Check submitting_lab_address
checks_df <- df_samples %>%
tidyr::replace_na(list(submitting_lab_address = "text to detect")) %>%
dplyr::filter(stringr::str_detect(submitting_lab_address, "[:alpha:]")) %>%
nrow() %>% ### Same as above
report_checks(original_nrows = df_rows,
var = "submitting_lab_address",
message = "submitting_lab_address must be text or empty",
out_table = checks_df)
# Check submitting_lab_authors
checks_df <- df_samples %>%
tidyr::replace_na(list(submitting_lab_authors = "text to detect")) %>%
dplyr::filter(stringr::str_detect(submitting_lab_authors, "[:alpha:]")) %>%
nrow() %>% ### same as above
report_checks(original_nrows = df_rows,
var = "submitting_lab_authors",
message = "submitting_lab_authors must be text or empty",
out_table = checks_df)
checks_df <- checks_df %>%
dplyr::mutate(idx = 1:nrow(.)) %>%
dplyr::arrange(dplyr::desc(idx)) %>%
dplyr::select(-idx)
}
#' Title
#'
#' @param df tibble
#'
#' @export
new_sample_id <- function(df) {
# Get patients table from SQL
cn <- connect_db("mysql_covid_seq")
on.exit(DBI::dbDisconnect(cn))
sql_df <- dplyr::tbl(cn, "samples")
set.seed(2021)
ids <- tibble::tibble(
type = "S",
str = stringi::stri_rand_strings(
n = 1e5,
length = 3,
pattern = "[A-Z]"
)
) %>%
tidyr::unite("sample_id", type:str) %>%
dplyr::filter(!duplicated(sample_id)) %>%
dplyr::filter(!sample_id %in% (sql_df %>% dplyr::pull(sample_id))) %>%
dplyr::slice_sample(n = nrow(df)) %>%
dplyr::pull(sample_id)
df %>%
dplyr::rename(original_sample_id = sample_id) %>%
dplyr::mutate(original_sample_id = glue::glue("{submitting_lab_name}_{original_sample_id}")) %>%
dplyr::mutate(sample_id = ids, .before = 1,
submit_date = Sys.Date())
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.