R/addXenLaeData.R

Defines functions addXenLaeData

#' @export

addXenLaeData <- function() {

    library(BiocManager)      # Comp bio superpackage
    library(tidyverse)        # Data analysis/graphing
    library(stringr)
    library(tximport)
    library(GenomicFeatures) # Need mysql-client installed on system

    # Xenopus laevis data - made .csv files with python script (found in Thesis git repo) to make tx2gene file from the output of StringTie's merge function
    # First files added were M_9 and F_1
    # Using a tx2gene file generated using python for "summarizeToGene" function
    xenLae_tx2gene <- as_tibble(read.delim("data-raw/xenLae_txt2gene.csv",header=TRUE))

    # Individual files for each read
    xenLae_M_3_tximport <- tximport(c("data-raw/xenLae_M_3_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_M_4_tximport <- tximport(c("data-raw/xenLae_M_4_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_M_5_tximport <- tximport(c("data-raw/xenLae_M_5_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_M_6_tximport <- tximport(c("data-raw/xenLae_M_6_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_M_9_tximport <- tximport(c("data-raw/xenLae_M_9_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")

    xenLae_F_1_tximport <- tximport(c("data-raw/xenLae_F_1_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_F_2_tximport <- tximport(c("data-raw/xenLae_F_2_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_F_4_tximport <- tximport(c("data-raw/xenLae_F_4_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_F_5_tximport <- tximport(c("data-raw/xenLae_F_5_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")
    xenLae_F_6_tximport <- tximport(c("data-raw/xenLae_F_6_quant.sf"), type = "salmon",txOut=TRUE,countsFromAbundance = "lengthScaledTPM") %>%
      summarizeToGene(tx2gene=xenLae_tx2gene, countsFromAbundance="lengthScaledTPM")

    # Formatting Xenopus laevis data

    # List of all above files
    all_tximport <- list(xenLae_M_3_tximport, xenLae_M_4_tximport, xenLae_M_5_tximport, xenLae_M_6_tximport, xenLae_M_9_tximport,
                         xenLae_F_1_tximport, xenLae_F_2_tximport, xenLae_F_4_tximport, xenLae_F_5_tximport, xenLae_F_6_tximport)

    col_names <- c("Gene.name",
                   "XenLae_Male_Brain_M2_3", "XenLae_Male_Brain_M2_4", "XenLae_Male_Brain_M2_5", "XenLae_Male_Brain_M2_6", "XenLae_Male_Brain_M2_9",
                   "XenLae_Female_Brain_F2_1", "XenLae_Female_Brain_F2_2", "XenLae_Female_Brain_F2_4", "XenLae_Female_Brain_F2_5", "XenLae_Female_Brain_F2_6")

    # Converts each tximport object (a double type) to a tibble/dataframe, converting row names to a column (gene_name), then adds each to a list
    counts_list = list()
    for (i in seq_along(all_tximport)) {
      rownames <- row.names(all_tximport[[i]][['counts']])
      counts_list[[i]] <- as_tibble(all_tximport[[i]][['counts']]) %>%
        add_column(Gene.name = rownames) %>%
        dplyr::select(Gene.name, V1)
    }

    # Iterates through the list of data tibbles, merging into a single set
    xenLae_counts <- counts_list[[1]]
    for (i in c(2:length(counts_list))) {
      xenLae_counts <- full_join(xenLae_counts,
                                 counts_list[[i]],
                                 by = "Gene.name"
      )
    }

    # Renames the columns to their respective sample names
    colnames(xenLae_counts) <- col_names

    # Converting TPM data to tibbles
    tpm_list = list()
    for (i in seq_along(all_tximport)) {
      rownames <- row.names(all_tximport[[i]][['abundance']])
      tpm_list[[i]] <- as_tibble(all_tximport[[i]][['abundance']]) %>%
        add_column(Gene.name = rownames) %>%
        dplyr::select(Gene.name, V1)
    }

    # Merging TPM datasets
    xenLae_tpm <- tpm_list[[1]]
    for (i in c(2:length(tpm_list))) {
      xenLae_tpm <- full_join(xenLae_tpm,
                              tpm_list[[i]],
                              by = "Gene.name"
      )
    }

    # Naming columns
    colnames(xenLae_tpm) <- col_names

    # Creating a table of gene names and their corresponding gene IDs for L, S, and non-L/S genes
    xenLae_orth <- as_tibble(read.delim("data-raw/xenLae_orth_table.csv",header=TRUE,na.strings=c("","NA")))
    xenLae_orth <- dplyr::select(xenLae_orth, Gene.name = gene_name, Ref.ID = ref_ID)

    # Creating a table of just L and S genes, matched when both chromosomal copies are present
    xenLae_LS <- as_tibble(read.delim("data-raw/xenLae_LS_paralogs.csv",header=TRUE,na.strings=c("","NA")))

    # Creating a table that takes L/S genes and only keeps the ones that have both L and S versions present
    xenLae_paralogs <- xenLae_LS %>% filter(!is.na(xenLae_LS$L_chrom) & !is.na(xenLae_LS$S_chrom))

    return(list(xenLae_tpm, xenLae_counts, xenLae_orth, xenLae_LS, xenLae_paralogs))
}
maddydoak/DoakThesis2020 documentation built on March 5, 2020, 12:53 a.m.