snpcheck: Package to determine sample swaps from SNP data.

Documented in perform_snpcheck

#' Perform SNP check
#'
#' @param snp_table A number
#' @param study Study identifier to include
#' @param sample Sample identifier to include
#' @param usable_minimum Minimum usable SNP's to include in analysis
#' @param identical_minimum Minimum identical SNP's to determine similarity
#' @param distance_max Maximun distance of changing SNP loci
#' @return Dataframe of pairwise comparisons
#' @export
perform_snpcheck = function(snp_table, study = NULL, sample = NULL, vecode = NULL, usable_minimum = 18, identical_minimum = 18, distance_max = 4, check_blood = F) {

  # Error checking (TODO)
  message("Importing global SNP data...")

  # Get information about VEcode sample
  if(!is.null(vecode)){
    temp = snp_table %>%
      filter(CODE == vecode)
    snps = snp_table %>%
      filter(STUDY %in% temp$STUDY & SAMPLE %in% temp$SAMPLE)
    message(paste0("Comparing to: ", paste0(snps$CODE, collapse = ",")))
  } else {
    # Perform sample-sample or sample-blood comparisons
    if(!check_blood){
      # Filter to keep a selected set of samples that have the same ID
      if(!is.null(sample)) {
        snps = snp_table %>%
          filter(STUDY %in% study) %>%
          filter(SAMPLE %in% sample)
        message(paste0("Found ", nrow(snps), " samples..."))
      } else {
        stop("Please enter a sample identifier (e.g 066, 079, 111)")
      }
    } else{
      snps = snp_table %>%
        filter(grepl("BL", Study_ID) | grepl("PBMC", Study_ID) | CODE == vecode)
      message(paste0("Found ", nrow(snp_table), " samples..."))
    }
  }

  # Remove unneeded columns
  snps.unique = snps %>%
    mutate(uniqueId = paste(STUDY, SAMPLE, CODE, sep = "_")) %>%
    relocate(uniqueId) %>%
    group_by(uniqueId) %>%
    mutate(row = row_number()) %>%
    mutate(uniqueId = paste0(uniqueId, "_" ,row)) %>%
    select(-row) %>%
    select(-STUDY:-CODE, -Study_ID)

  # get combinations
  message("Getting pairwise combinations...")
  if(!check_blood & is.null(vecode)){
    combinations = combn(snps.unique$uniqueId, 2) %>%
      t() %>%
      as.data.frame()
  } else if(!is.null(vecode)){
    combinations = combn(snps.unique$uniqueId, 2) %>%
      t() %>%
      as.data.frame() %>%
      filter((grepl(vecode, V1) | (grepl(vecode, V2))))
  } else{
    combinations = combn(snps.unique$uniqueId, 2) %>%
      t() %>%
      as.data.frame() %>%
      filter((grepl(sample, V1) | (grepl(sample, V2))))
   }

  # Iterate over each pairwise combination
  message(paste0("Computing ", nrow(combinations), " combinations"))
  all_comparisons <- list()
  all_tables <- list()
  pb <- progress::progress_bar$new(total = nrow(combinations))
  for(i in 1:nrow(combinations)){
    pb$tick()

    # Get reduced table
    snps.sub = snps.unique %>%
      filter(uniqueId %in% combinations[i,]) %>%
      select(-SAMPLE) %>%
      pivot_longer(cols = -c("uniqueId")) %>%
      pivot_wider(names_from = uniqueId, values_from = value) %>%
      rename(locus = name)

    # Sample names
    sample1.name = colnames(snps.sub)[2]
    sample2.name = colnames(snps.sub)[3]

    # Compute total skipped (needs to be chr)
    total_skipped = 0

    # Compute total usable sites
    total_usable = snps.sub %>%
      rowwise() %>%
      filter(
        if_all(.cols = -contains("locus"),
               .fns = ~ .x != "-")) %>%
      nrow()

    # Computer total failed sites
    total_failed = 26 - total_usable

    # (A-A=0 , A-B=1 , A-C=2 , B-B=0 , C-B=1)
    snps.sub.compare = snps.sub %>%
      filter( if_all(.cols = -contains("locus"),
                     .fns = ~ .x != "-")) %>%
      mutate(Dist = case_when(
        .[[2]] == "A" & .[[3]] == "A" ~ 0,
        .[[2]] == "B" & .[[3]] == "B" ~ 0,
        .[[2]] == "C" & .[[3]] == "C" ~ 0,

        .[[2]] == "A" & .[[3]] == "B" ~ 1,
        .[[2]] == "B" & .[[3]] == "A" ~ 1,

        .[[2]] == "A" & .[[3]] == "C" ~ 2,
        .[[2]] == "C" & .[[3]] == "A" ~ 2,

        .[[2]] == "C" & .[[3]] == "B" ~ 1,
        .[[2]] == "B" & .[[3]] == "C" ~ 1
      )) %>%
      mutate(Match = case_when(
        .[[2]] == "A" & .[[3]] == "A" ~ 1,
        .[[2]] == "C" & .[[3]] == "C" ~ 1,
        .[[2]] == "B" & .[[3]] == "B" ~ 1
      )) %>%
      mutate(Match = replace_na(Match, 0))

    # Compute identical
    total_identical = sum(snps.sub.compare$Match)
    percent_identical = total_identical / total_usable

    # Compute different
    total_different = total_usable - total_identical
    percent_different = total_different / total_usable

    # Compute total distance
    total_distance =  sum(snps.sub.compare$Dist)

    # Compute output
    if(total_usable < usable_minimum){
      FINAL = 'FAILED'
      FINAL_REASON = paste0("Not enough usable positions: ", total_usable, " < ", usable_minimum)
    } else if (total_distance > distance_max) {
      FINAL = "FAILED"
      FINAL_REASON = paste0("Distance too large: ", total_distance, " > ", distance_max)
    } else if (total_identical < identical_minimum) {
      FINAL = "FAILED"
      FINAL_REASON = paste0("Not enough identical calls: ", total_identical, " < ", identical_minimum)
    } else {
      FINAL = "PASS"
      FINAL_REASON = ""
    }

    # Output into dataframe
    res = tibble("Sample_1" = sample1.name,
                 "Sample_2" = sample2.name,
                 "Total_skipped" = total_skipped,
                 "Total_usable" = total_usable,
                 "Total_failed" = total_failed,
                 "Total_identical" = total_identical,
                 "Percent_identical" = percent_identical,
                 "Total_different" = total_different,
                 "Percent_different" = percent_different,
                 "Total_distance" = total_distance,
                 "DATA" = list(tibble(snps.sub.compare)),
                 "FINAL" = FINAL,
                 "Remarks" = FINAL_REASON)
    all_comparisons[[i]] = res
    all_tables[[i]] = snps.sub.compare
  }

  # Merge all results together
  message("Finished...")
  output = do.call(rbind, all_comparisons)
  return(output)
}