R/pairwise_similarity.R

Defines functions pairwise_similarity

# Requirement: 'tibble' + A .exe (by default called vsearch-2.17.1-win-x86_64.exe) installed in the same folder

# Info: Perform a pairwise aligment, then calculate the percentage of non-matches in the alignment (i.e. indels & mistmatches)

# Note: Number of comparisons should always be divided by 2 since two comparaisons are always performed (e.g. BIN1 - BIN2 + BIN2 - BIN1)

pairwise_similarity = function(data, bin_data, data_taxo, 
         # mode = "inter_bin", taxa = NULL, deduplicate = T, max_ratio_mean_length,
         min_similarity_threshold, max_similarity_threshold, minimum_sequence_length = 0,
         number_hits_considered = c(0,32),
         verbose = T, exe_name = "vsearch-2.17.1-win-x86_64.exe"){
  
  data_splited = split(tibble(data), data$PRIMERS)
  
  data_list = data_splited
  
  length_barcodes = lapply(data_splited, function(x) apply(x[,"FRAGMENT"], 1, nchar))
  
  for(i in 1:length(data_splited)){
    
    start = Sys.time()
    
    if(verbose){
      
      cat(paste0("ANALYSIS OF FRAGMENTS AMPLIFIED BY PRIMER: ", names(data_splited[i]), "\n"))
      
      cat("-------------------------------------------------------------------\n")
      
      cat("\n")
      
    }
    
    fasta_file = c(rbind(paste0(">", word(data_splited[[i]]$ACCESSION, 1, sep = fixed(":"))),
                         FRAGMENT = data_splited[[i]]$FRAGMENT))
    
    write.table(fasta_file, file = "temporary_fasta_file.fasta", row.names = F, col.names = F, quote = F)
    
    if(!verbose) system(paste0(exe_name, ' --usearch_global ', "temporary_fasta_file.fasta", 
                  ' --db ', "temporary_fasta_file.fasta", ' --threads ', 0, ' --maxid ', max_similarity_threshold/100,
                  ' --self --id ', min_similarity_threshold/100, ' --minseqlength ', minimum_sequence_length, 
                  ' --maxaccepts ', number_hits_considered[1], ' --maxrejects ', number_hits_considered[2],
                  ' --iddef 2 --minwordmatches 0 --userfields query+target+id --maxaccepts 0 --userout ', 
                  "temporary_pairwise_similarity.txt"), intern = F, show.output.on.console = F)
    
    else system(paste0(exe_name, ' --usearch_global ', "temporary_fasta_file.fasta", 
                       ' --db ', "temporary_fasta_file.fasta", ' --threads ', 0, ' --maxid ', max_similarity_threshold/100,
                       ' --self --id ', min_similarity_threshold/100, ' --minseqlength ', minimum_sequence_length, 
                       ' --maxaccepts ', number_hits_considered[1], ' --maxrejects ', number_hits_considered[2],
                       ' --iddef 2 --minwordmatches 0 --userfields query+target+id --userout ', 
                       "temporary_pairwise_similarity.txt"), intern = F)
    
    similarity_file = suppressWarnings(fread("temporary_pairwise_similarity.txt"))
    
    file.rename(from = paste0(getwd(), "/temporary_fasta_file.fasta"), to = paste0(tempdir(), "/temporary_fasta_file.fasta"))
    
    file.rename(from = paste0(getwd(), "/temporary_pairwise_similarity.txt"), to = paste0(tempdir(), "/temporary_pairwise_similarity.txt"))
    
    if(nrow(similarity_file) > 0){
      
      similarity_file = merge(subset(bin_data, select = c("ACCESSION", "BIN", "BOLD_SPECIES", "BOLD_GENUS", 
                                                          "BOLD_FAMILY", "BOLD_ORDER")), 
                              similarity_file, by.x = "ACCESSION", by.y = "V1", suffixes = c("1","2"))
      
      colnames(similarity_file)[1:2] = c("ACCESSION2", "BIN2")
      
      similarity_file = tibble(merge(subset(bin_data, select = c("ACCESSION", "BIN", "BOLD_SPECIES", "BOLD_GENUS", 
                                                                 "BOLD_FAMILY", "BOLD_ORDER")), 
                                     similarity_file, by.x = "ACCESSION", by.y = "V2", suffixes = c("1","2")))
      
      colnames(similarity_file)[c(1:2,13)] = c("ACCESSION1", "BIN1", "SIMILARITY")
      
      similarity_file = similarity_file[which(!is.na(similarity_file$BIN1) | !is.na(similarity_file$BIN2)),]
      
      similarity_file = merge(subset(data_taxo, select = c("ACCESSION", "SPECIES", "GENUS", "FAMILY", "ORDER")), 
                              similarity_file, by.x = "ACCESSION", by.y = "ACCESSION1", suffixes = c("1","2"))
      
      colnames(similarity_file)[1] = "ACCESSION1"
      
      similarity_file = tibble(merge(subset(data_taxo, select = c("ACCESSION", "SPECIES", "GENUS", "FAMILY", "ORDER")), 
                                     similarity_file, by.x = "ACCESSION", by.y = "ACCESSION2", suffixes = c("1","2")))
      
      colnames(similarity_file)[1] = "ACCESSION2"
      
      data_list[[i]] = tibble(cbind(similarity_file[,21], similarity_file[,6], similarity_file[,1],
                                    similarity_file[,c(11,16)], similarity_file[,2], similarity_file[,12],
                                    similarity_file[,7], similarity_file[,17], similarity_file[,3],
                                    similarity_file[,13], similarity_file[,8], similarity_file[,18],
                                    similarity_file[,4], similarity_file[,14], similarity_file[,9],
                                    similarity_file[,19], similarity_file[,5], similarity_file[,15],
                                    similarity_file[,10], similarity_file[,20]))
      
      #   ### Fastest way I found to deduplicate. But in the end useless and still very long -> just need to consider numbers only and divide them by 2
      #     
      #   duplicated_index = list()
      #   
      #   for(j in 1:nrow(similarity_file)) { 
      #     
      #     duplicated_index[[j]] = sort(which(similarity_file$ACCESSION2 == similarity_file[j,]$ACCESSION1 & 
      #                               similarity_file$ACCESSION1 == similarity_file[j,]$ACCESSION2 | 
      #                               similarity_file$ACCESSION1 == similarity_file[j,]$ACCESSION1 & 
      #                               similarity_file$ACCESSION2 == similarity_file[j,]$ACCESSION2))
      #     
      #   }
      #   
      #   duplicated_index_dedup = Map(`[`, duplicated_index, relist(!duplicated(unlist(duplicated_index)), skeleton = duplicated_index))
      #   
      #   duplicated_index_dedup = duplicated_index_dedup[lapply(duplicated_index_dedup, length) > 0]
      #   
      #   similarity_file_dedup = similarity_file[unlist(lapply(duplicated_index_dedup, function(x) x[1])),]
      #   
      #   data_list[[i]] = similarity_file_dedup
      
    }
    
    else data_list[[i]] = tibble(data.frame(SIMILARITY = NA, ACCESSION1 = NA, ACCESSION2 = NA, BIN1 = NA,  BIN2 = NA,  SPECIES1 = NA, 
                                            BOLD_SPECIES1 = NA, SPECIES2 = NA, BOLD_SPECIES2 = NA, GENUS1 = NA, BOLD_GENUS1 = NA, 
                                            GENUS2 = NA, BOLD_GENUS2 = NA, FAMILY1 = NA, BOLD_FAMILY1 = NA, FAMILY2 = NA, 
                                            BOLD_FAMILY2 = NA, ORDER1 = NA, BOLD_ORDER1 = NA, ORDER2 = NA, BOLD_ORDER2 = NA))
    
    if(verbose){
      
      end = Sys.time()
      
      duration = difftime(end, start)
      
      cat("\n")
      
      cat("-------------------------------------------------------------------\n")
      
      cat(paste("Time taken:", round(duration[[1]], 2), units(duration), "\n"))
      
      cat("\n")
      
      cat("\n")
      
    }
    
  }
  
  return(data_list)
  
}
Eliot-RUIZ/eDNAevaluation documentation built on Dec. 17, 2021, 6:25 p.m.