R/check_miss_data.R

Defines functions check_miss_data

Documented in check_miss_data

#' Remove SNPs with missing data
#'
#' Drops every row of \code{sumstats_dt} that has an \code{NA} in any of the
#' checked columns. By default all columns are checked except bookkeeping
#' columns (those starting with \code{IMPUTATION_} or \code{convert_}, plus
#' \code{flipped} and \code{SNP_INFO}). When \code{drop_na_cols} is supplied,
#' only those of its entries that are present in \code{sumstats_dt} (and are
#' not bookkeeping columns) are checked. If no column is left to check, the
#' check is skipped and the input is returned unchanged.
#'
#' When \code{log_folder_ind} is \code{TRUE} the removed rows are written to a
#' "missing_data" log file and its path is added to \code{log_files}.
#'
#' @inheritParams format_sumstats
#' @param log_files list of log file locations
#' @return list containing sumstats_dt, the modified summary statistics data
#' table object and a log file list.
#' @keywords internal
#' @importFrom stats complete.cases
check_miss_data <- function(sumstats_dt, path, log_folder_ind, check_save_out,
                            tabix_index, nThread, log_files,
                            drop_na_cols) {
    message("Checking for missing data.")
    col_headers <- names(sumstats_dt)
    # Columns never checked for missing values: imputation flags, the
    # flipped/convert_ markers and SNP_INFO.
    ignore_cols <- c(
        col_headers[grepl("^IMPUTATION_", col_headers)],
        "flipped"["flipped" %in% col_headers],
        col_headers[grepl("^convert_", col_headers)],
        "SNP_INFO"["SNP_INFO" %in% col_headers]
    )

    if (!is.null(drop_na_cols)) {
        # Keep only the requested columns that exist and are not ignored.
        drop_na_cols_in_sumstats <-
            c(drop_na_cols)[drop_na_cols %in% col_headers]
        incl_cols <-
            drop_na_cols_in_sumstats[
                !drop_na_cols_in_sumstats %in% ignore_cols
            ]
        if (length(incl_cols) < 1) {
            msg <- paste0(
                "WARNING: None of the inputted columns:\n",
                paste(drop_na_cols, collapse = " "), "\n",
                "To be checked for missing data were found in the sumstats. ",
                "Sumstats",
                " columns:\n",
                paste(col_headers, collapse = " "), "\n",
                "This check will not be run."
            )
            message(msg)
        }
    } else {
        incl_cols <- col_headers[!col_headers %in% ignore_cols]
    }

    # Nothing to check: complete.cases() errors on a zero-column table
    # ("no input has determined the number of cases"), so honour the
    # "check will not be run" promise and return the input untouched.
    if (length(incl_cols) < 1) {
        return(list("sumstats_dt" = sumstats_dt, "log_files" = log_files))
    }

    # Compute the row mask once and reuse it for counting, logging and
    # filtering.
    rows_complete <- complete.cases(sumstats_dt[, incl_cols, with = FALSE])
    rows_missing <- !rows_complete
    n_missing <- sum(rows_missing)

    if (n_missing > 0) {
        msg <- paste0(
            "WARNING: ",
            formatC(n_missing, big.mark = ","),
            " rows in sumstats file are missing data and will ",
            "be removed."
        )
        message(msg)
        # If user wants log, save it to there
        if (log_folder_ind) {
            name <- "missing_data"
            name <- get_unique_name_log_file(
                name = name,
                log_files = log_files
            )
            log_path <- paste0(
                check_save_out$log_folder,
                "/", name,
                check_save_out$extension
            )
            write_sumstats(
                sumstats_dt = sumstats_dt[rows_missing],
                save_path = log_path,
                sep = check_save_out$sep,
                # don't tabix-index: the removed rows may have missing
                # values, which could cause an error
                # tabix_index = tabix_index,
                nThread = nThread
            )
            log_files[[name]] <- log_path
        }
        sumstats_dt <- sumstats_dt[rows_complete]
        if (nrow(sumstats_dt) == 0) {
            stop_msg <- paste(
                "All SNPs have been filtered out of",
                " your summary statistics dataset"
            )
            stop(stop_msg)
        }
    }

    list("sumstats_dt" = sumstats_dt, "log_files" = log_files)
}
neurogenomics/MungeSumstats documentation built on Aug. 10, 2024, 5:59 a.m.