#' Title
#'
#' @param object
#'
#' @return
#' @export
#'
#' @examples
export_to_s3 <- function(object) {
aws.s3::s3save(
object,
object = glue::glue(
"original_metadata_{Sys.Date() %>% str_replace_all('-', '_')}.rds"
),
bucket = "fcatala-09142020-eu-west-1"
)
}
#' Title
#'
#' @param dbname
#'
#' @return
#' @export
#'
#' @examples
connect_db <- function(dbname) {
DBI::dbConnect(
drv = RMySQL::MySQL(),
username = "admin",
password = "c0v1drul3s",
host = "mncovidseqdb-instance.cyu1oiz4la9s.eu-west-1.rds.amazonaws.com",
port = 3306,
dbname = dbname,
":memory:"
)
}
#' Title
#'
#' @param table
#' @param data
#'
#' @return
#' @export
#'
#' @examples
ingest_db <- function(data, table, dbname = "mysql_covid_seq") {
cn <- connect_db(dbname)
on.exit(DBI::dbDisconnect(cn))
DBI::dbWriteTable(
conn = cn,
name = table,
value = data,
append = TRUE,
row.names = FALSE
)
}
#' Title
#'
#' @param x
#'
#' @return
#' @export
#'
#' @examples
gicon <- function(x) {
as.character(icon(x, lib = "glyphicon"))
}
#' Title
#'
#' @param res_nrwos
#' @param original_nrows
#' @param var
#' @param message
#' @param out_table
#'
#' @return
#' @export
#'
#' @examples
report_checks <- function(res_nrwos, original_nrows, var, message, out_table) {
tibble::tibble(
var = var,
icon = ifelse(original_nrows != res_nrwos, gicon("remove"), gicon("ok")),
msg = ifelse(icon == gicon("ok"), "", message)
) %>%
dplyr::bind_rows(out_table)
}
#' Title
#'
#' @param df_patients
#'
#' @return
#' @export
#'
#' @examples
check_patient_names <- function(df_patients) {
# Get patients table from SQL
cn <- connect_db("mysql_covid_seq")
on.exit(DBI::dbDisconnect(cn))
sql_patients <- tbl(cn, "patients")
# To lower
df_patients <- df_patients %>%
dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
stringr::str_to_lower(x)
}))
# nrow of input data and output table
df_rows <- nrow(df_patients)
checks_df <- dplyr::tibble()
# Check patient_id
checks_df <- df_patients %>%
dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
dplyr::filter(!patient_id %in% (sql_patients %>% dplyr::pull(patient_id))) %>%
tidyr::drop_na(patient_id) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_id",
message = "Invalid patient_id or already present in the database",
out_table = checks_df)
# Check host
checks_df <- df_patients %>%
dplyr::filter(host %in% c("homo_sapiens", "felis_catus", "canis_lupus")) %>%
tidyr::drop_na(host) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "host",
message = "Invalid host",
out_table = checks_df)
# Check host_comment
checks_df <- df_patients %>%
tidyr::replace_na(list(host_comment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(host_comment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "host_comment",
message = "host_comment must be text or empty",
out_table = checks_df)
# Check gender
checks_df <- df_patients %>%
dplyr::filter(gender %in% c("male", "female", "unknown")) %>%
tidyr::drop_na(gender) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "gender",
message = "Invalid gender",
out_table = checks_df)
# Check age
checks_df <- df_patients %>%
dplyr::filter(age %in% 0:130) %>%
tidyr::drop_na(age) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "age",
message = "Invalid age",
out_table = checks_df)
# Check age units
checks_df <- df_patients %>%
dplyr::filter(age_units %in% c("years", "months")) %>%
tidyr::drop_na(age_units) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "age_units",
message = "Invalid age_units",
out_table = checks_df)
# Check residence_location
checks_df <- df_patients %>%
tidyr::drop_na(residence_location) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "residence_location",
message = "NAs in residence_location",
out_table = checks_df)
# Check infection_location
checks_df <- df_patients %>%
tidyr::drop_na(infection_location) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "infection_location",
message = "NAs in infection_location",
out_table = checks_df)
# Check location_comment
checks_df <- df_patients %>%
tidyr::replace_na(list(location_comment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(location_comment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "location_comment",
message = "location_comment must be text or empty",
out_table = checks_df)
# Check patient_status
opts <- c("home","hospitalized","released","live","deceased","uci","unknown")
checks_df <- df_patients %>%
dplyr::filter(patient_status %in% opts) %>%
tidyr::drop_na(patient_status) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_status",
message = "Invalid patient_status",
out_table = checks_df)
# Check outbreak
checks_df <- df_patients %>%
tidyr::replace_na(list(outbreak = "text to detect")) %>%
dplyr::filter(stringr::str_detect(outbreak, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "outbreak",
message = "outbreak must be text or empty",
out_table = checks_df)
# Check vaccinated
checks_df <- df_patients %>%
dplyr::filter(vaccinated %in% c("true", "false", "unknown")) %>%
tidyr::drop_na(vaccinated) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "vaccinated",
message = "Invalid vaccinated (true, false or unknown)",
out_table = checks_df)
# Check vaccinated_date
checks_df <- df_patients %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "vaccinated_date",
message = "Invalid vaccinated_date",
out_table = checks_df)
# Check treatment
checks_df <- df_patients %>%
tidyr::replace_na(list(treatment = "text to detect")) %>%
dplyr::filter(stringr::str_detect(treatment, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "treatment",
message = "treatment must be text or empty",
out_table = checks_df)
# Check clinical_trial_name
checks_df <- df_patients %>%
tidyr::replace_na(list(clinical_trial_name = "text to detect")) %>%
dplyr::filter(stringr::str_detect(clinical_trial_name, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "clinical_trial_name",
message = "clinical_trial_name must be text or empty",
out_table = checks_df)
checks_df <- checks_df %>%
dplyr::mutate(idx = 1:nrow(.)) %>%
dplyr::arrange(dplyr::desc(idx)) %>%
dplyr::select(-idx)
}
#' Title
#'
#' @param df_samples
#'
#' @return
#' @export
#'
#' @examples
check_samples_names <- function(df_samples) {
# Get patients table from SQL
cn <- connect_db("mysql_covid_seq")
on.exit(DBI::dbDisconnect(cn))
sql_samples <- dplyr::tbl(cn, "samples")
# To lower
df_samples <- df_samples %>%
dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
stringr::str_to_lower(x)
}))
# nrow of input data and output table
df_rows <- nrow(df_samples)
checks_df <- tibble::tibble()
# Check sample_id
checks_df <- df_samples %>%
dplyr::filter(stringr::str_detect(sample_id, "samp_[:digit:]{3}")) %>%
dplyr::filter(!sample_id %in% (sql_samples %>% dplyr::pull(sample_id))) %>%
tidyr::drop_na(sample_id) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "sample_id",
message = "Invalid sample_id or already present in the database",
out_table = checks_df)
# Check patient_id
checks_df <- df_samples %>%
dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
tidyr::drop_na(patient_id) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "patient_id",
message = "Invalid patient_id",
out_table = checks_df)
# Check sample_shared_type
opts <- c("saliva", "rna", "cdna", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
checks_df <- df_samples %>%
dplyr::filter(sample_shared_type %in% opts) %>%
tidyr::drop_na(sample_shared_type) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "sample_shared_type",
message = "Invalid sample_shared_type",
out_table = checks_df)
# Check original_source
opts <- c("saliva", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
checks_df <- df_samples %>%
dplyr::filter(original_source %in% opts) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "original_source",
message = "Invalid original_source",
out_table = checks_df)
# Check collection_date
checks_df <- df_samples %>%
tidyr::drop_na(collection_date) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "collection_date",
message = "Missing values on collection_date",
out_table = checks_df)
# Check pcr_cycle_threshold
checks_df <- df_samples %>%
dplyr::filter(pcr_cycle_threshold %in% 0:80) %>%
tidyr::drop_na(pcr_cycle_threshold) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "pcr_cycle_threshold",
message = "Missing or invalid pcr_cycle_threshold",
out_table = checks_df)
# Check submitting_lab_name
checks_df <- df_samples %>%
dplyr::filter(submitting_lab_name %in% c("cr_microbiology")) %>%
tidyr::drop_na(submitting_lab_name) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "submitting_lab_name",
message = "Invalid submitting_lab_name",
out_table = checks_df)
# Check submitting_lab_address
checks_df <- df_samples %>%
tidyr::replace_na(list(submitting_lab_address = "text to detect")) %>%
dplyr::filter(stringr::str_detect(submitting_lab_address, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "submitting_lab_address",
message = "submitting_lab_address must be text or empty",
out_table = checks_df)
# Check submitting_lab_authors
checks_df <- df_samples %>%
tidyr::replace_na(list(submitting_lab_authors = "text to detect")) %>%
dplyr::filter(stringr::str_detect(submitting_lab_authors, "[:alpha:]")) %>%
nrow() %>%
report_checks(original_nrows = df_rows,
var = "submitting_lab_authors",
message = "submitting_lab_authors must be text or empty",
out_table = checks_df)
checks_df <- checks_df %>%
dplyr::mutate(idx = 1:nrow(.)) %>%
dplyr::arrange(dplyr::desc(idx)) %>%
dplyr::select(-idx)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.