# File: R/check_if_fragment_in_gene.R

# Defines function: check_if_fragment_in_gene

## Requirements: the "stringr" and "tibble" packages

#' Check whether each fragment occurs exactly once in its reference sequence
#'
#' Rows of `data` are matched to rows of `reference_data` by accession. For
#' every matched row, the fragment sequence is counted inside the reference
#' sequence(s) sharing the same accession with `stringr::str_count()` (the
#' fragment is therefore interpreted as a regular expression, which is a
#' literal match for plain nucleotide strings). If at least one reference
#' sequence contains the fragment exactly once the count is 1; otherwise the
#' count is the sum over all reference sequences of that accession.
#'
#' @param data Data frame holding the fragments to check. Fully duplicated
#'   rows are dropped before checking.
#' @param reference_data Data frame holding the reference sequences.
#' @param remove_duplicates_index If `TRUE`, only the part of the `data`
#'   accession located before the first `":"` is kept before matching.
#' @param id_col_name Optional name of a column of `data` copied to the
#'   output as `ID`. `NULL` (default) omits the `ID` column.
#' @param sequences_col_name Name of the fragment sequence column in `data`.
#' @param reference_sequences_col_name Name of the sequence column in
#'   `reference_data`.
#' @param accession_col_name Name of the accession column in `data`.
#' @param reference_accession_col_name Name of the accession column in
#'   `reference_data`.
#'
#' @return One of:
#'   * a tibble with columns (`ID`,) `ACCESSION`, `NUMBER_MATCHES` and
#'     `REASON` listing every entry whose number of matches differs from 1
#'     (`"Sequence not found"`, `"Sequence found multiple times"` or
#'     `"Name not found"`);
#'   * a single character string when every entry matches exactly once;
#'   * a single character string (with a warning) when no accession of
#'     `data` exists in `reference_data`.
check_if_fragment_in_gene <- function(data, reference_data, remove_duplicates_index = FALSE, id_col_name = NULL,
                                      sequences_col_name = "SEQUENCES", reference_sequences_col_name = "SEQUENCES",
                                      accession_col_name = "ACCESSION", reference_accession_col_name = "ACCESSION") {

  # Name of the object passed by the caller, used in the messages below.
  reference_name <- paste(deparse(substitute(reference_data)), collapse = " ")

  # Fail early with an explicit message instead of an obscure error later on.
  missing_data_cols <- setdiff(c(accession_col_name, sequences_col_name, id_col_name), names(data))
  if (length(missing_data_cols) > 0) {
    stop("Column(s) missing from `data`: ", paste(missing_data_cols, collapse = ", "), call. = FALSE)
  }
  missing_reference_cols <- setdiff(c(reference_accession_col_name, reference_sequences_col_name),
                                    names(reference_data))
  if (length(missing_reference_cols) > 0) {
    stop("Column(s) missing from `reference_data`: ", paste(missing_reference_cols, collapse = ", "),
         call. = FALSE)
  }

  if (remove_duplicates_index) {
    data[[accession_col_name]] <- stringr::word(data[[accession_col_name]], 1, sep = stringr::fixed(":"))
  }

  data <- data[!duplicated(data), ]

  reference_accessions <- reference_data[[reference_accession_col_name]]
  reference_sequences <- reference_data[[reference_sequences_col_name]]

  # `%in%` never returns NA, so the mask can be used directly for subsetting.
  is_known <- data[[accession_col_name]] %in% reference_accessions
  found_data <- data[is_known, ]
  not_found_data <- data[!is_known, ]

  n_found <- nrow(found_data)

  # Nothing to compare: return the message right away. The original code fell
  # through to `number_matches$NUMBER_MATCHES` on this string and errored.
  if (n_found == 0) {
    warning(paste("No similar names has been found in the column", reference_accession_col_name, "of",
                  reference_name))
    return(paste("No names are present in the", reference_accession_col_name, "of", reference_name))
  }

  # Builds the output rows for a subset of `data` (ID column only on request).
  build_rows <- function(rows, counts, reason) {
    out <- data.frame(ACCESSION = rows[[accession_col_name]],
                      NUMBER_MATCHES = counts,
                      REASON = rep(reason, length.out = nrow(rows)),
                      stringsAsFactors = FALSE)
    if (!is.null(id_col_name)) {
      out <- data.frame(ID = rows[[id_col_name]], out, stringsAsFactors = FALSE)
    }
    out
  }

  found_accessions <- found_data[[accession_col_name]]
  found_sequences <- found_data[[sequences_col_name]]
  show_progress <- n_found > 100

  if (show_progress) cat("-----------------CHECKING NAMES-----------------\n")

  # Preallocated instead of growing a data frame with rbind() at each turn.
  number_time_found <- numeric(n_found)

  for (i in seq_len(n_found)) {

    if (i > 100 && i %% 100 == 0) cat(paste0(round(i / n_found * 100), "% ... "))

    # which() drops NA comparisons; several reference rows may share an accession.
    reference_sequence <- reference_sequences[which(reference_accessions == found_accessions[i])]

    counts <- stringr::str_count(string = reference_sequence, pattern = found_sequences[i])

    # Compare each count to 1 (the original `any(counts) == 1` turned every
    # non-zero count into 1). NA counts (missing sequences) are ignored.
    number_time_found[i] <- if (any(counts == 1, na.rm = TRUE)) 1 else sum(counts, na.rm = TRUE)

  }

  if (show_progress) {
    cat("\n")
    cat("------------------------------------------------\n")
    cat("\n")
  }

  found_reason <- ifelse(number_time_found > 1, "Sequence found multiple times", "Sequence not found")

  number_matches <- tibble::tibble(rbind(
    build_rows(found_data, number_time_found, found_reason),
    build_rows(not_found_data, rep(0, nrow(not_found_data)), "Name not found")
  ))

  if (all(number_matches$NUMBER_MATCHES == 1)) {
    return(paste("All", accession_col_name, "amplicons are present in", reference_name, "sequences."))
  }

  # Only report the problematic entries.
  number_matches[number_matches$NUMBER_MATCHES != 1, ]

}
# Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.