R/addSpecies.R

Defines functions addSpecies

#' @export

# Adding new species data to the full set for Naqvi et al. analysis

addSpecies <- function(old_metadata, old_tpm, old_counts, old_orthologs, spec_orth, prefix, spec_tpm, spec_counts, tissues, sexes) {
  library(tidyverse)
  library(tidyselect)
  library(rlang)
  source("R/makeOrthologTable.R")
  
  all_orthologs <- makeOrthologTable(old_orthologs, spec_orth, prefix)
  
  # All genes are matched to standardized human gene names
  ref_genes_tpm = refGeneNamesTPM
  ref_genes_counts = refGeneNamesCounts

  # Creating the column name for the new species, standardized to Naqvi et al. naming conventions
  new_column <- paste(prefix, ".gene.stable.ID", sep = "")

  # Creating tpm dataframe for new species
  spec_tpm            <- merge(spec_orth, spec_tpm, by = "Gene.name")
  spec_tpm$rowname    <- spec_tpm$Ref.ID
  spec_tpm            <- column_to_rownames(spec_tpm)
  spec_tpm            <- spec_tpm %>% dplyr::select(-c(Gene.name, Ref.ID))

  # Creating counts dataframe for new species
  spec_counts         <- merge(spec_orth, spec_counts, by = "Gene.name")
  spec_counts$rowname <- spec_counts$Ref.ID
  spec_counts         <- column_to_rownames(spec_counts)
  spec_counts         <- spec_counts %>% dplyr::select(-c(Gene.name, Ref.ID))

  # Making metadata dataframe
  label   <- colnames(spec_tpm)
  Species <- c(prefix)
  Tissue  <- tissues
  Sex     <- sexes
  species_metadata  <- cbind(label, Species, Tissue, Sex)
  all_metadata      <- rbind(old_metadata, species_metadata)

  # Adding new species TPM data to TPM data matrix
  genes_names_to_use_TPM <- all_orthologs %>%
    filter(Gene.name %in% ref_genes_tpm)
  
  spec_tpm <- spec_tpm %>%
    mutate(Gene.ID = rownames(spec_tpm))
  
  spec_tpm_to_use <- spec_tpm %>%
    semi_join(genes_names_to_use_TPM, by = c("Gene.ID" = new_column)) %>%
    left_join(all_orthologs[, c("Gene.name", new_column)], by = c("Gene.ID" = new_column)) %>%    # Adds 
    rename("Gene.ID" = new_column)
  
  all_tpm <- old_tpm %>%
    as.data.frame() %>%
    mutate(Gene.name = rownames(old_tpm)) %>%
    right_join(spec_tpm_to_use, by = c("Gene.name" = "Gene.name")) %>%    # only use genes from genes_names_to_use_TPM
    column_to_rownames("Gene.name")
  
  # Adding new species counts data to counts data matrix
  genes_names_to_use_counts <- all_orthologs %>%
    filter(Gene.name %in% ref_genes_counts)
  
  spec_counts <- spec_counts %>%
    mutate(Gene.ID = rownames(spec_counts))
  
  spec_counts_to_use <- spec_counts %>%
    semi_join(genes_names_to_use_counts, by = c("Gene.ID" = new_column)) %>%
    left_join(all_orthologs[, c("Gene.name", new_column)], by = c("Gene.ID" = new_column)) %>%    # Adds 
    rename("Gene.ID" = new_column)
  
  all_counts <- old_counts %>%
    as.data.frame() %>%
    mutate(Gene.name = rownames(old_counts)) %>%
    right_join(spec_counts_to_use, by = c("Gene.name" = "Gene.name")) %>%    # only use genes from genes_names_to_use_counts
    column_to_rownames("Gene.name")
  
  # Creating final list of objects to return
  to_return           <- list()
  to_return$metadata  <- as.data.frame(all_metadata)
  to_return$TPM       <- as.matrix(all_tpm)
  to_return$counts    <- as.matrix(all_counts)
  to_return$orthologs <- as.data.frame(all_orthologs)
  
  return(to_return)
}
maddydoak/DoakThesis2020 documentation built on March 5, 2020, 12:53 a.m.