R/sm_lengthprofiler.R

Defines functions smm_lengthprofiler

Documented in smm_lengthprofiler

#' Extract CDR3-length profile
#'
#' @param input_file File containing the CDR3 sequences length and frequency.
#' @param output_dir A directory to write the output (by default current working directory).
#' @param numb_threads Number of threads to use (by default 4).
#' @details Two files containing the CDR3-length profile i.e. like-histogram and tab-separated file from \code{input_file}. Both files are written in \code{output_dir}.
#' @examples smm_lengthprofile(input_file = "example_file.tsv", output_dir = "/path/to/dump/directory", numb_threads = 4)

smm_lengthprofiler <- function(input_file, output_dir, numb_threads){
  # By default settigns:
    # Output directory is the working directory
    output_dir <- getwd()
    # Number of threads to used is 4
    numb_threads <- c(4)

  # Attach required packages
  if (!require(package = "ggplot2", character.only = TRUE)){
    install.packages(pkgs = "ggplot2", repos = "https://cran.rediris.es/", dependencies = T)
    library(package = "ggplot2", character.only = TRUE)
  }

  # Load full file to R enviroment
  data_obj <- read.table(file = input_file, header = TRUE, comment.char = "", sep = "\t")

  # Remove surplus columns
  data_col <- grep(pattern = "avg", x = colnames(data_obj))
  leng_col <- grep(pattern = "length", x = colnames(data_obj))

  data_fre <- list()
  # Generate large naive vectors
  for (i in seq_len(length(data_col))){
    data_fre[[i]] <- cbind.data.frame(data_obj[,leng_col], data_obj[,data_col[i]])
    data_fre[[i]] <- data_fre[[i]][which(data_fre[[i]][,2] > 0),]
    data_fre[[i]] <- as.data.frame(table(data_fre[[i]][,1]))
    colnames(data_fre[[i]]) <- c("CDR3_Length", "Abs_Freq")
  }
  names(data_fre) <- colnames(data_obj)[data_col]

  # Export the files
  for (i in seq_len(length(data_fre))){
    leng_plot <- ggplot(data = data_fre[[i]], aes(x = as.numeric(as.character(data_fre[[i]][,1])), y = data_fre[[i]][,2])) +
      geom_bar(stat = "identity") + xlab("CDR3 Length") + ylab("Frequency") + ggtitle(label = names(data_fre)[i])

    write.table(x = data_fre[i], file = paste0(output_dir, "/", names(data_fre)[i], ".tsv"), append = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
    print(paste("Frequency file from", names(data_fre)[i], "has been written in", paste0(output_dir, "/", names(data_fre[i]), ".tsv")))
    ggsave(plot = leng_plot, filename = paste0(names(data_fre)[i], ".png"), path = output_dir, device = "png", width = 15, height = 20, units = "cm", dpi = 350)
    print(paste("Frequency plot from", names(data_fre)[i], "has been written in", paste0(output_dir, "/", names(data_fre[i]), ".png")))
  }
}
manuelsmendoza/smmcdr3 documentation built on May 4, 2019, 3:09 a.m.