R/lab_submission_checks.R

Defines functions check_patient_names

Documented in check_patient_names

#' Title
#'
#' @param df_patients tibble
#'
#' @export
check_patient_names <- function(df_patients) {

    # Get patients table from SQL
    cn <- connect_db("mysql_covid_seq")
    on.exit(DBI::dbDisconnect(cn))
    sql_patients <- dplyr::tbl(cn, "patients")

    # To lower and remove existent patient ids
    df_patients <- df_patients %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
            stringr::str_to_lower(x)
        })) %>%
        dplyr::filter(!patient_id %in% (sql_patients %>% dplyr::pull(patient_id)))

    # nrow of input data and output table
    df_rows <- nrow(df_patients)
    checks_df <- dplyr::tibble()

    # Check patient_id
    checks_df <- df_patients %>%
        dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_id",
                      message = "Invalid patient_id or already present in the database",
                      out_table = checks_df)

    # Check host
    checks_df <- df_patients %>%
        dplyr::filter(host %in% c("homo_sapiens", "felis_catus", "canis_lupus", "panthera_leo")) %>%
        tidyr::drop_na(host) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "host",
                      message = "Invalid host", ### Report in which samples invalid_hosts are found
                                                ### report which hosts are valid
                      out_table = checks_df)

    # Check host_comment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(host_comment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(host_comment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "host_comment",
                      message = "host_comment must be text or empty",
                      out_table = checks_df)

    # Check gender
    checks_df <- df_patients %>%
        dplyr::filter(gender %in% c("male", "female", "unknown")) %>%
        tidyr::drop_na(gender) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "gender",
                      message = "Invalid gender",
                      out_table = checks_df)

    # Check age
    checks_df <- df_patients %>%
        dplyr::filter(age %in% 0:130) %>%
        tidyr::drop_na(age) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "age",
                      message = "Invalid age",
                      out_table = checks_df)

    # Check age units
    checks_df <- df_patients %>%
        dplyr::filter(age_units %in% c("years", "months")) %>%
        tidyr::drop_na(age_units) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "age_units",
                      message = "Invalid age_units",
                      out_table = checks_df)

    # Check residence_location
    checks_df <- df_patients %>%
        tidyr::drop_na(residence_location) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "residence_location",
                      message = "NAs in residence_location", ### location maybe not known, use unknown in those cases
                      out_table = checks_df)

    # Check infection_location
    checks_df <- df_patients %>%
        tidyr::drop_na(infection_location) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "infection_location",
                      message = "NAs in infection_location", ### What does infection_location refer to?
                      out_table = checks_df)

    # Check location_comment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(location_comment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(location_comment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "location_comment",
                      message = "location_comment must be text or empty",
                      out_table = checks_df)

    # Check patient_status
    opts <- c("home","hospitalized","released","live","deceased","uci","unknown")
    checks_df <- df_patients %>%
        dplyr::filter(patient_status %in% opts) %>%
        tidyr::drop_na(patient_status) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_status",
                      message = "Invalid patient_status", ### Report which samples have this field invalid
                      out_table = checks_df)

    # Check outbreak
    checks_df <- df_patients %>%
        tidyr::replace_na(list(outbreak = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(outbreak, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "outbreak",
                      message = "outbreak must be text or empty",
                      out_table = checks_df)

    # Check vaccinated
    checks_df <- df_patients %>%
        dplyr::filter(vaccinated %in% c("true", "false", "unknown")) %>%
        tidyr::drop_na(vaccinated) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "vaccinated",
                      message = "Invalid vaccinated (true, false or unknown)",
                      out_table = checks_df)

    # Check vaccinated_date
    checks_df <- df_patients %>%
        nrow() %>%  #### Need to make sure we can import usual date formats.
        report_checks(original_nrows = df_rows,
                      var = "vaccinated_date",
                      message = "Invalid vaccinated_date",
                      out_table = checks_df)

    # Check treatment
    checks_df <- df_patients %>%
        tidyr::replace_na(list(treatment = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(treatment, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "treatment",
                      message = "treatment must be text or empty",
                      out_table = checks_df)

    # Check clinical_trial_name
    checks_df <- df_patients %>%
        tidyr::replace_na(list(clinical_trial_name = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(clinical_trial_name, "[:alpha:]")) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "clinical_trial_name",
                      message = "clinical_trial_name must be text or empty",
                      out_table = checks_df)

    checks_df <- checks_df %>%
        dplyr::mutate(idx = 1:nrow(.)) %>%
        dplyr::arrange(dplyr::desc(idx)) %>%
        dplyr::select(-idx)
}

#' Title
#'
#' @param df_samples tibble
#'
#' @export
check_samples_names <- function(df_samples) {

    # Get patients table from SQL
    cn <- connect_db("mysql_covid_seq")
    on.exit(DBI::dbDisconnect(cn))
    sql_samples <- dplyr::tbl(cn, "samples")

    # To lower
    df_samples <- df_samples %>%
        dplyr::mutate(dplyr::across(dplyr::everything(), function(x) {
            stringr::str_to_lower(x)
        }))

    # nrow of input data and output table
    df_rows <- nrow(df_samples)
    checks_df <- tibble::tibble()

    # Check original_sample_id
    checks_df <- df_samples %>%
        dplyr::filter(!original_sample_id %in% (sql_samples %>% dplyr::pull(original_sample_id))) %>%
        tidyr::drop_na(original_sample_id) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "sample_id",
                      message = "Invalid sample_id or already present in the database",
                      out_table = checks_df)

    # Check patient_id
    checks_df <- df_samples %>%
        dplyr::filter(stringr::str_detect(patient_id, "pat_[:digit:]{3}")) %>%
        tidyr::drop_na(patient_id) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "patient_id",
                      message = "Invalid patient_id",
                      out_table = checks_df)

    # Check sample_shared_type
    opts <- c("saliva", "rna", "cdna", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
    checks_df <- df_samples %>%
        dplyr::filter(sample_shared_type %in% opts) %>%
        tidyr::drop_na(sample_shared_type) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "sample_shared_type",
                      message = "Invalid sample_shared_type",
                      out_table = checks_df)

    # Check original_source
    opts <- c("saliva", "oro_pharyngeal_swab", "naso_pharyngeal_swab")
    checks_df <- df_samples %>%
        dplyr::filter(original_source %in% opts) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "original_source",
                      message = "Invalid original_source",
                      out_table = checks_df)

    # Check collection_date
    checks_df <- df_samples %>%
        tidyr::drop_na(collection_date) %>%
        nrow() %>% ### Need to make sure we import usual date formats
        report_checks(original_nrows = df_rows,
                      var = "collection_date",
                      message = "Missing values on collection_date",
                      out_table = checks_df)

    # Check pcr_cycle_threshold
    checks_df <- df_samples %>%
        dplyr::filter(pcr_cycle_threshold %in% 0:40) %>%
        tidyr::drop_na(pcr_cycle_threshold) %>%
        nrow() %>%
        report_checks(original_nrows = df_rows,
                      var = "pcr_cycle_threshold",
                      message = "Missing or invalid pcr_cycle_threshold",
                      out_table = checks_df)

    # Check submitting_lab_name
    checks_df <- df_samples %>%
        dplyr::filter(submitting_lab_name %in% c("cr_microbiology")) %>%
        tidyr::drop_na(submitting_lab_name) %>%
        nrow() %>%  ### Lab Name should be inferred from username, info should be queried upon registration
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_name",
                      message = "Invalid submitting_lab_name",
                      out_table = checks_df)

    # Check submitting_lab_address
    checks_df <- df_samples %>%
        tidyr::replace_na(list(submitting_lab_address = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(submitting_lab_address, "[:alpha:]")) %>%
        nrow() %>%   ### Same as above
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_address",
                      message = "submitting_lab_address must be text or empty",
                      out_table = checks_df)

    # Check submitting_lab_authors
    checks_df <- df_samples %>%
        tidyr::replace_na(list(submitting_lab_authors = "text to detect")) %>%
        dplyr::filter(stringr::str_detect(submitting_lab_authors, "[:alpha:]")) %>%
        nrow() %>%   ### same as above
        report_checks(original_nrows = df_rows,
                      var = "submitting_lab_authors",
                      message = "submitting_lab_authors must be text or empty",
                      out_table = checks_df)

    checks_df <- checks_df %>%
        dplyr::mutate(idx = 1:nrow(.)) %>%
        dplyr::arrange(dplyr::desc(idx)) %>%
        dplyr::select(-idx)
}

#' Title
#'
#' @param df tibble
#'
#' @export
new_sample_id <- function(df) {

    # Get patients table from SQL
    cn <- connect_db("mysql_covid_seq")
    on.exit(DBI::dbDisconnect(cn))
    sql_df <- dplyr::tbl(cn, "samples")

    set.seed(2021)
    ids <- tibble::tibble(
        type = "S",
        str = stringi::stri_rand_strings(
            n = 1e5,
            length = 3,
            pattern = "[A-Z]"
        )
    ) %>%
        tidyr::unite("sample_id", type:str) %>%
        dplyr::filter(!duplicated(sample_id)) %>%
        dplyr::filter(!sample_id %in% (sql_df %>% dplyr::pull(sample_id))) %>%
        dplyr::slice_sample(n = nrow(df)) %>%
        dplyr::pull(sample_id)

    df %>%
        dplyr::rename(original_sample_id = sample_id) %>%
        dplyr::mutate(original_sample_id = glue::glue("{submitting_lab_name}_{original_sample_id}")) %>%
        dplyr::mutate(sample_id = ids, .before = 1,
                      submit_date = Sys.Date())
}
MicrobialGenomics/covidseq documentation built on Jan. 27, 2021, 7:58 p.m.