smmcdr3: CDR3 sequences analysis

sm_Hsampling <- function(input_file, output_dir, iterations){
  pkgs <- list("vegan", "iNEXT", "ggplot2", "ggpubr")
  pkg_attach <- function(x){
    library(package = x, character.only = T)
  }
  lapply(pkgs, pkg_attach)

  data <- read.table(file = input_file, header = TRUE, comment.char = "", sep = "\t")
  data <- data[,c(6:ncol(data))]

  number_sequences <- apply(data, 2, function(x){which(x > 0)})
  number_sequences <- unlist(lapply(number_sequences, length))
  min_number_sequences <- number_sequences[which(number_sequences == min(number_sequences))]
  sam_number_sequences <- names(number_sequences)[which(number_sequences == min(number_sequences))]

  h_iteration <- list()
  i <- 0
  while (i < iterations) {
    i <- i + 1
    h_iteration[[i]] <- vector()

    for (n in seq_len(ncol(data[, c(1:ncol(data))]))){
      random_choosen <- sample(x = seq_len(nrow(data)), size = min_number_sequences, prob = data[, n] / sum(data[, n]))
      random_diversity <- diversity(t(data[random_choosen, n]))
      h_iteration[[i]][n] <- random_diversity
    }
  }

  diversity_data <- cbind.data.frame(rep(colnames(data)[c(1:ncol(data))], iterations), unlist(h_iteration))
  colnames(diversity_data) <- c("Sample", "Shannon")
  diversity_data$Iteration <- rep(iterations, nrow(diversity_data))

  h_iteration <- as.data.frame(matrix(data = unlist(h_iteration), nrow = iterations, byrow = T))
  colnames(h_iteration) <- colnames(data)[c(1:ncol(data))]
  h_iteration$Iterations <- rep(iterations, nrow(h_iteration))

  h_ditribution_plot <-
    ggplot(data = diversity_data, aes(x = Sample, y = Shannon, fill = Sample)) +
      geom_boxplot() +
      ggtitle(label = paste0("Number of iterations: ", iterations)) +
      ylab(label = "Shannon index") +
      theme_pubr() +
      theme(legend.position = "none", axis.title.x = element_blank(), axis.title.y = element_text(face = "bold"), axis.text.x = element_text(angle = 90)) +

  write.table(x = h_iteration, file = paste0(output_dir, "/", colnames(data[2]), "-", iterations, ".tsv"), append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
  ggsave(plot = h_ditribution_plot, filename = paste0(colnames(data[2]), "-", iterations, ".png"), device = "png", path = output_dir, width = 17, height = 15, units = "cm", dpi = 350)
  print("Info: Shannon index was calculated using the smallest sample size")
  print(paste0("Smallest sample was ", sam_number_sequences, " (", min_number_sequences, ")"))
  print(paste0("Data was wrote in ", output_dir, "/", colnames(data[2]), "-", iterations, ".tsv"))
  print(paste0("Chart was wrote in ",  output_dir, "/", colnames(data[2]), "-", iterations, ".png"))
}