R/getQuantData.R

Defines functions getQuantData

#' @export

# Takes a tx2gene, filepath for quant.sf files, sample prefix (e.g. "HomoSapiens"), and list of sample names (e.g. "Male_1", "12345", "sample_A")
# Returns a tibble of count data for all subjects, and a tibble of TPM data for all subjects

getQuantData <- function(tx2gene_path, salmon_path, species_prefix, sample_names) {

  # Normal installation
  require(BiocManager)
  require(tidyverse)
  require(stringr)

  # BiocManager installation
  require(tximport)
  require(GenomicFeatures) # Need mysql-client installed on system

  # Reading in the tx2gene object
  tx2gene <- as_tibble(read.delim(tx2gene_path, header = TRUE))

  # Looping through the quant.sf output files from salmon, creating tximport files that are better structured
  file_names <- dir(salmon_path, pattern =".sf")
  tximport_list <- vector("list", length = length(file_names))
  for (i in 1:length(file_names)) {
    tximport_list[[i]] <- tximport(paste(salmon_path, file_names[i], sep = ""), type = "salmon", txOut = TRUE, countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
  }

  # Generating names for columns in the tables of quantification data
  col_names    <- vector("list", length = length(sample_names) + 1)
  col_names[1] <- "Gene.name"
  for (i in seq_along(sample_names)) {
    col_names[i+1] <- paste(species_prefix, sample_names[i], sep = "_")
  }

  # Creating a list of the tibble objects with count data for individual subjects
  counts_sep = vector("list", length = length(tximport_list))
  for (i in 1:length(tximport_list)) {
    rownames <- row.names(tximport_list[[i]][['counts']])
    counts_sep[[i]] <- as_tibble(tximport_list[[i]][['counts']]) %>%
      add_column(Gene.name = rownames) %>%
      dplyr::select(Gene.name, V1)
  }

  # Merging all individual subject count data into a single dataframe
  all_counts <- counts_sep[[1]]
  for (i in c(2:length(counts_sep))) {
    all_counts <- full_join(all_counts,
                            counts_sep[[i]],
                            by = "Gene.name")
  }

  colnames(all_counts) <- col_names

  # Creating a list of the tibble objects with TPM data for individual subjects
  tpm_sep = vector("list", length = length(tximport_list))
  for (i in 1:length(tximport_list)) {
    rownames <- row.names(tximport_list[[i]][['counts']])
    tpm_sep[[i]] <- as_tibble(tximport_list[[i]][['counts']]) %>%
      add_column(Gene.name = rownames) %>%
      dplyr::select(Gene.name, V1)
  }

  # Merging all individual subject TPM data into a single dataframe
  all_tpm <- tpm_sep[[1]]
  for (i in c(2:length(tpm_sep))) {
    all_tpm <- full_join(all_tpm,
                         tpm_sep[[i]],
                         by = "Gene.name")
  }

  colnames(all_tpm) <- col_names
  
  data_list   <- list()
  data_list$TPM    <- all_tpm
  data_list$counts <- all_counts

  return(data_list)

}
maddydoak/DoakThesis2020 documentation built on March 5, 2020, 12:53 a.m.