R/utils.R

Defines functions check_patient_names report_checks gicon ingest_db connect_db export_to_s3

#' Title
#'
#' @param object
#'
#' @return
#' @export
#'
#' @examples
export_to_s3 <- function(object) {
    aws.s3::s3save(
        object,
        object = glue::glue(
            "original_metadata_{Sys.Date() %>% str_replace_all('-', '_')}.rds"
        ),
        bucket = "fcatala-09142020-eu-west-1"
    )
}


#' Title
#'
#' @param dbname
#'
#' @return
#' @export
#'
#' @examples
connect_db <- function(dbname) {
    DBI::dbConnect(
        drv = RMySQL::MySQL(),
        username = "admin",
        password = "c0v1drul3s",
        host = "mncovidseqdb-instance.cyu1oiz4la9s.eu-west-1.rds.amazonaws.com",
        port = 3306,
        dbname = dbname,
        ":memory:"
    )
}


#' Title
#'
#' @param table
#' @param data
#'
#' @return
#' @export
#'
#' @examples
ingest_db <- function(data, table, dbname = "mysql_covid_seq") {
    cn <- connect_db(dbname)
    on.exit(DBI::dbDisconnect(cn))
    DBI::dbWriteTable(
        conn = cn,
        name = table,
        value = data,
        append = TRUE,
        row.names = FALSE
    )
}

#' Title
#'
#' @param x
#'
#' @return
#' @export
#'
#' @examples
gicon <- function(x) {
    as.character(icon(x, lib = "glyphicon"))
}

#' Title
#'
#' @param res_nrwos
#' @param original_nrows
#' @param var
#' @param message
#' @param out_table
#'
#' @return
#' @export
#'
#' @examples
report_checks <- function(res_nrwos, original_nrows, var, message, out_table) {
    tibble::tibble(
        var = var,
        icon = ifelse(original_nrows != res_nrwos, gicon("remove"), gicon("ok")),
        msg = ifelse(icon == gicon("ok"), "", message)
    ) %>%
        dplyr::bind_rows(out_table)
}

#' Title
#'
#' @param df_patients
#'
#' @return
#' @export
#'
#' @examples
check_patient_names <- function(df_patients) {

    # Get patients table from SQL
    cn <- connect_db("mysql_covid_seq")
    on.exit(DBI::dbDisconnect(cn))
    sql_patients <- tbl(cn, "patients")

    # To lower
    df_patients <- df_patients %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
            stringr::str_to_lower(x)
        }))

    # nrow of input data and output table
    df_rows <- nrow(df_patients)
    checks_df <- dplyr::tibble()

    # Check patient_id
    checks_df <- df_patients %>%
        dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
        dplyr::filter(!patient_id %in% (sql_patients %>% dplyr::pull(patient_id))) %>%
        tidyr::drop_na(patient_id) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_id",
                      message = "Invalid patient_id or already present in the database",
                      out_table = checks_df)

    # Check host
    checks_df <- df_patients %>%
        dplyr::filter(host %in% c("homo_sapiens", "felis_catus", "canis_lupus")) %>%
        tidyr::drop_na(host) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "host",
                      message = "Invalid host",
                      out_table = checks_df)

    # Check host_comment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(host_comment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(host_comment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "host_comment",
                      message = "host_comment must be text or empty",
                      out_table = checks_df)

    # Check gender
    checks_df <- df_patients %>%
        dplyr::filter(gender %in% c("male", "female", "unknown")) %>%
        tidyr::drop_na(gender) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "gender",
                      message = "Invalid gender",
                      out_table = checks_df)

    # Check age
    checks_df <- df_patients %>%
        dplyr::filter(age %in% 0:130) %>%
        tidyr::drop_na(age) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "age",
                      message = "Invalid age",
                      out_table = checks_df)

    # Check age units
    checks_df <- df_patients %>%
        dplyr::filter(age_units %in% c("years", "months")) %>%
        tidyr::drop_na(age_units) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "age_units",
                      message = "Invalid age_units",
                      out_table = checks_df)

    # Check residence_location
    checks_df <- df_patients %>%
        tidyr::drop_na(residence_location) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "residence_location",
                      message = "NAs in residence_location",
                      out_table = checks_df)

    # Check infection_location
    checks_df <- df_patients %>%
        tidyr::drop_na(infection_location) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "infection_location",
                      message = "NAs in infection_location",
                      out_table = checks_df)

    # Check location_comment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(location_comment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(location_comment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "location_comment",
                      message = "location_comment must be text or empty",
                      out_table = checks_df)

    # Check patient_status
    opts <- c("home","hospitalized","released","live","deceased","uci","unknown")
    checks_df <- df_patients %>%
        dplyr::filter(patient_status %in% opts) %>%
        tidyr::drop_na(patient_status) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_status",
                      message = "Invalid patient_status",
                      out_table = checks_df)

    # Check outbreak
    checks_df <- df_patients %>%
        tidyr::replace_na(list(outbreak = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(outbreak, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "outbreak",
                      message = "outbreak must be text or empty",
                      out_table = checks_df)

    # Check vaccinated
    checks_df <- df_patients %>%
        dplyr::filter(vaccinated %in% c("true", "false", "unknown")) %>%
        tidyr::drop_na(vaccinated) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "vaccinated",
                      message = "Invalid vaccinated (true, false or unknown)",
                      out_table = checks_df)

    # Check vaccinated_date
    checks_df <- df_patients %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "vaccinated_date",
                      message = "Invalid vaccinated_date",
                      out_table = checks_df)

    # Check treatment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(treatment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(treatment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "treatment",
                      message = "treatment must be text or empty",
                      out_table = checks_df)

    # Check clinical_trial_name
    checks_df <- df_patients %>%
        tidyr::replace_na(list(clinical_trial_name = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(clinical_trial_name, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "clinical_trial_name",
                      message = "clinical_trial_name must be text or empty",
                      out_table = checks_df)

    checks_df <- checks_df %>%
        dplyr::mutate(idx = 1:nrow(.)) %>%
        dplyr::arrange(dplyr::desc(idx)) %>%
        dplyr::select(-idx)
}

#' Title
#'
#' @param df_samples
#'
#' @return
#' @export
#'
#' @examples
check_samples_names <- function(df_samples) {

    # Get patients table from SQL
    cn <- connect_db("mysql_covid_seq")
    on.exit(DBI::dbDisconnect(cn))
    sql_samples <- dplyr::tbl(cn, "samples")

    # To lower
    df_samples <- df_samples %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
            stringr::str_to_lower(x)
        }))

    # nrow of input data and output table
    df_rows <- nrow(df_samples)
    checks_df <- tibble::tibble()

    # Check sample_id
    checks_df <- df_samples %>%
        dplyr::filter(stringr::str_detect(sample_id, "samp_[:digit:]{3}")) %>%
        dplyr::filter(!sample_id %in% (sql_samples %>% dplyr::pull(sample_id))) %>%
        tidyr::drop_na(sample_id) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "sample_id",
                      message = "Invalid sample_id or already present in the database",
                      out_table = checks_df)

    # Check patient_id
    checks_df <- df_samples %>%
        dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
        tidyr::drop_na(patient_id) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_id",
                      message = "Invalid patient_id",
                      out_table = checks_df)

    # Check sample_shared_type
    opts <- c("saliva", "rna", "cdna", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
    checks_df <- df_samples %>%
        dplyr::filter(sample_shared_type %in% opts) %>%
        tidyr::drop_na(sample_shared_type) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "sample_shared_type",
                      message = "Invalid sample_shared_type",
                      out_table = checks_df)

    # Check original_source
    opts <- c("saliva", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
    checks_df <- df_samples %>%
        dplyr::filter(original_source %in% opts) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "original_source",
                      message = "Invalid original_source",
                      out_table = checks_df)

     # Check collection_date
    checks_df <- df_samples %>%
        tidyr::drop_na(collection_date) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "collection_date",
                      message = "Missing values on collection_date",
                      out_table = checks_df)

    # Check pcr_cycle_threshold
    checks_df <- df_samples %>%
        dplyr::filter(pcr_cycle_threshold %in% 0:80) %>%
        tidyr::drop_na(pcr_cycle_threshold) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "pcr_cycle_threshold",
                      message = "Missing or invalid pcr_cycle_threshold",
                      out_table = checks_df)

    # Check submitting_lab_name
    checks_df <- df_samples %>%
        dplyr::filter(submitting_lab_name %in% c("cr_microbiology")) %>%
        tidyr::drop_na(submitting_lab_name) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_name",
                      message = "Invalid submitting_lab_name",
                      out_table = checks_df)

    # Check submitting_lab_address
    checks_df <- df_samples %>%
        tidyr::replace_na(list(submitting_lab_address = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(submitting_lab_address, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_address",
                      message = "submitting_lab_address must be text or empty",
                      out_table = checks_df)

    # Check submitting_lab_authors
    checks_df <- df_samples %>%
        tidyr::replace_na(list(submitting_lab_authors = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(submitting_lab_authors, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_authors",
                      message = "submitting_lab_authors must be text or empty",
                      out_table = checks_df)

    checks_df <- checks_df %>%
        dplyr::mutate(idx = 1:nrow(.)) %>%
        dplyr::arrange(dplyr::desc(idx)) %>%
        dplyr::select(-idx)
}
xec-cm/metaChecker documentation built on Jan. 18, 2021, 12:40 a.m.