## Requirement: "stringr" + "tibble"
#' Check that each fragment sequence occurs in its reference sequence
#'
#' For every row of `data`, looks up the reference rows carrying the same
#' accession and checks whether the fragment sequence occurs in at least one of
#' the corresponding reference sequences. Rows that pass are dropped from the
#' report; rows that fail are returned together with the reason.
#'
#' @param data Data frame (or tibble) of fragments to check. Exact duplicate
#'   rows are removed before checking.
#' @param reference_data Data frame (or tibble) of reference sequences.
#' @param remove_duplicates_index If `TRUE`, only the part of each accession in
#'   `data` that precedes the first `":"` is kept before matching.
#' @param id_col_name Optional name of a column of `data` to carry into the
#'   report as `ID`. `NULL` (default) omits the `ID` column.
#' @param sequences_col_name Name of the fragment sequence column in `data`.
#'   Values are passed to `stringr::str_count()` as the pattern, so they are
#'   interpreted as regular expressions.
#' @param reference_sequences_col_name Name of the sequence column in
#'   `reference_data`.
#' @param accession_col_name Name of the accession column in `data`.
#' @param reference_accession_col_name Name of the accession column in
#'   `reference_data`.
#'
#' @return One of:
#'   * a character string (with a warning) when no accession of `data` is
#'     present in `reference_data`;
#'   * a character string when every fragment was found in its reference;
#'   * otherwise a tibble with columns `ID` (only if `id_col_name` is given),
#'     `ACCESSION`, `NUMBER_MATCHES` and `REASON`, holding only the rows that
#'     failed. `REASON` is `"Sequence not found"` when the accession exists but
#'     the fragment does not occur in it, and `"Name not found"` when the
#'     accession is absent from the reference.
check_if_fragment_in_gene = function(data, reference_data, remove_duplicates_index = FALSE, id_col_name = NULL,
                                     sequences_col_name = "SEQUENCES", reference_sequences_col_name = "SEQUENCES",
                                     accession_col_name = "ACCESSION", reference_accession_col_name = "ACCESSION"){
  # Caller's expression for `reference_data`, used only in messages. Collapsed
  # so that a long expression still yields a single string.
  reference_name = paste(deparse(substitute(reference_data)), collapse = " ")

  # Fail early on misspelled column names instead of silently working on NULL.
  missing_data_cols = setdiff(c(accession_col_name, sequences_col_name, id_col_name), names(data))
  if(length(missing_data_cols) > 0){
    stop("Column(s) missing from `data`: ", paste(missing_data_cols, collapse = ", "))
  }
  missing_reference_cols = setdiff(c(reference_accession_col_name, reference_sequences_col_name),
                                   names(reference_data))
  if(length(missing_reference_cols) > 0){
    stop("Column(s) missing from `reference_data`: ", paste(missing_reference_cols, collapse = ", "))
  }

  if(remove_duplicates_index){
    data[[accession_col_name]] = stringr::word(data[[accession_col_name]], 1, sep = stringr::fixed(":"))
  }
  data = data[!duplicated(data), , drop = FALSE]

  reference_accessions = reference_data[[reference_accession_col_name]]
  reference_sequences = reference_data[[reference_sequences_col_name]]

  # Split the fragments by whether their accession exists in the reference.
  has_reference = data[[accession_col_name]] %in% reference_accessions
  found_data = data[has_reference, , drop = FALSE]
  not_found_data = data[!has_reference, , drop = FALSE]
  n_found = nrow(found_data)

  # Nothing to compare against: return the message directly. (Falling through
  # to the tabular post-processing would fail on a character value.)
  if(n_found == 0){
    warning(paste("No similar names has been found in the column", reference_accession_col_name, "of",
                  reference_name))
    return(paste("No names are present in the", reference_accession_col_name, "of", reference_name))
  }

  fragment_accessions = found_data[[accession_col_name]]
  fragment_sequences = found_data[[sequences_col_name]]

  show_progress = n_found > 100
  if(show_progress) cat("-----------------CHECKING NAMES-----------------\n")

  # NUMBER_MATCHES is a presence flag: 1 when the fragment occurs at least once
  # in any reference sequence sharing its accession, 0 otherwise.
  matches = numeric(n_found)
  for(i in seq_len(n_found)){
    if(i > 100 && i %% 100 == 0) cat(paste0(round(i / n_found * 100), "% ... "))
    candidates = reference_sequences[which(reference_accessions == fragment_accessions[i])]
    counts = stringr::str_count(string = candidates, pattern = fragment_sequences[i])
    # na.rm: a missing fragment or reference sequence counts as "not found".
    matches[i] = as.numeric(any(counts > 0, na.rm = TRUE))
  }

  if(show_progress){
    cat("\n")
    cat("------------------------------------------------\n")
    cat("\n")
  }

  # Builds the report rows for one group of fragments.
  make_rows = function(rows, n_matches, reason){
    out = data.frame(ACCESSION = rows[[accession_col_name]],
                     NUMBER_MATCHES = n_matches,
                     REASON = rep(reason, nrow(rows)),
                     stringsAsFactors = FALSE)
    if(!is.null(id_col_name)) out = cbind(ID = rows[[id_col_name]], out, stringsAsFactors = FALSE)
    out
  }

  number_matches = rbind(make_rows(found_data, matches, "Sequence not found"),
                         make_rows(not_found_data, rep(0, nrow(not_found_data)), "Name not found"))

  if(all(number_matches$NUMBER_MATCHES == 1)){
    return(paste("All", accession_col_name, "amplicons are present in",
                 reference_name, "sequences."))
  }

  # Report only the fragments that failed the check.
  tibble::as_tibble(number_matches[number_matches$NUMBER_MATCHES != 1, , drop = FALSE])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.