data-raw/DATASET.R

## code to prepare `DATASET` dataset goes here

library(tidyverse)

# Naqvi ortholog table
Naqvi_orth <- read.table("data-raw/one2oneorth_emblids.txt", header = TRUE, stringsAsFactors = FALSE, sep = "\t", fill = TRUE)

# Naqvi metadata table
human_metadata    <- read.table("data-raw/human.metadata.txt",  header = TRUE, stringsAsFactors = FALSE)
nonhuman_metadata <- read.table("data-raw/sample.metadata.txt", header = TRUE, stringsAsFactors = FALSE, sep="\t")
nonhuman_metadata <- subset(nonhuman_metadata, Tissue != "Testis")
nonhuman_metadata$comb_label <- paste(nonhuman_metadata$Species,
                                      nonhuman_metadata$Sex,
                                      nonhuman_metadata$Tissue,
                                      nonhuman_metadata$label,
                                      sep = "_")
Naqvi_metadata <- data.frame(rbind(as.matrix(human_metadata[,c("RUN", "Species", "Tissue", "Sex")]),
                                   as.matrix(nonhuman_metadata[,c("comb_label", "Species", "Tissue", "Sex")])))
colnames(Naqvi_metadata) <- c("label", "Species", "Tissue", "Sex")
Naqvi_metadata$label   <- as.character(Naqvi_metadata$label)
Naqvi_metadata$Species <- as.character(Naqvi_metadata$Species)
Naqvi_metadata$Tissue  <- as.character(Naqvi_metadata$Tissue)
Naqvi_metadata$Sex     <- as.character(Naqvi_metadata$Sex)

# Naqvi count/TPM data
human_counts  <- as.matrix(read.table(gzfile("data-raw/gtex.filt.salmon.tximport.adj.counts.txt.gz"), header = TRUE, row.names = 1, sep = "\t"))    # Over 100MB, go to https://zenodo.org/record/2658829#.XkrmgJNKhBx
human_tpm     <- as.matrix(read.table(gzfile("data-raw/gtex.filt.salmon.tximport.unadj.tpm.txt.gz"),  header = TRUE, row.names = 1, sep = "\t"))
cyno_counts   <- as.matrix(read.table(gzfile("data-raw/cyno.salmon.tximport.counts.txt.gz"),          header = TRUE, row.names = 1, sep = "\t"))
cyno_tpm      <- as.matrix(read.table(gzfile("data-raw/cyno.salmon.tximport.tpm.txt.gz"),             header = TRUE, row.names = 1, sep = "\t"))
mouse_counts  <- as.matrix(read.table(gzfile("data-raw/mouse.salmon.tximport.counts.txt.gz"),         header = TRUE, row.names = 1, sep = "\t"))
mouse_tpm     <- as.matrix(read.table(gzfile("data-raw/mouse.salmon.tximport.tpm.txt.gz"),            header = TRUE, row.names = 1, sep = "\t"))
rat_counts    <- as.matrix(read.table(gzfile("data-raw/rat.salmon.tximport.counts.txt.gz"),           header = TRUE, row.names = 1, sep = "\t"))
rat_tpm       <- as.matrix(read.table(gzfile("data-raw/rat.salmon.tximport.tpm.txt.gz"),              header = TRUE, row.names = 1, sep = "\t"))
dog_counts    <- as.matrix(read.table(gzfile("data-raw/dog.salmon.tximport.counts.txt.gz"),           header = TRUE, row.names = 1, sep = "\t"))
dog_tpm       <- as.matrix(read.table(gzfile("data-raw/dog.salmon.tximport.tpm.txt.gz"),              header = TRUE, row.names = 1, sep = "\t"))

Naqvi_tpm     <- cbind(human_tpm[subset(Naqvi_orth, Gene.name %in% rownames(human_tpm))$Gene.name, ],
                       cyno_tpm[subset(Naqvi_orth,Gene.name %in% rownames(human_tpm))$Crab.eating.macaque.gene.stable.ID,],
                       mouse_tpm[subset(Naqvi_orth,Gene.name %in% rownames(human_tpm))$Mouse.gene.stable.ID,],
                       rat_tpm[subset(Naqvi_orth,Gene.name %in% rownames(human_tpm))$Rat.gene.stable.ID,],
                       dog_tpm[subset(Naqvi_orth,Gene.name %in% rownames(human_tpm))$Dog.gene.stable.ID,])

Naqvi_counts  <- cbind(human_counts[subset(Naqvi_orth,Gene.name %in% rownames(human_counts))$Gene.name,],
                       cyno_counts[subset(Naqvi_orth,Gene.name %in% rownames(human_counts))$Crab.eating.macaque.gene.stable.ID,],
                       mouse_counts[subset(Naqvi_orth,Gene.name %in% rownames(human_counts))$Mouse.gene.stable.ID,],
                       rat_counts[subset(Naqvi_orth,Gene.name %in% rownames(human_counts))$Rat.gene.stable.ID,],
                       dog_counts[subset(Naqvi_orth,Gene.name %in% rownames(human_counts))$Dog.gene.stable.ID,])

# My Xenopus laevis data
source("R/getQuantData.R")
prefix          <- "XenLae"
sample_names    <- c("Male_Brain_M2_3",   "Male_Brain_M2_4",   "Male_Brain_M2_5",   "Male_Brain_M2_6",   "Male_Brain_M2_9",
                     "Female_Brain_F2_1", "Female_Brain_F2_2", "Female_Brain_F2_4", "Female_Brain_F2_5", "Female_Brain_F2_6")
xenLae_data     <- getQuantData("data-raw/xenLae_txt2gene.csv", "data-raw/", prefix, sample_names)
xenLae_counts   <- xenLae_data$counts
xenLae_tpm      <- xenLae_data$TPM

xenLae_orth     <- as_tibble(read_delim("data-raw/xenLae_orth_table.csv", "\t", col_names = TRUE, na = c("", "NA")))
  xenLae_orth   <- dplyr::select(xenLae_orth, Gene.name = gene_name, Ref.ID = ref_ID)
xenLae_LS       <- as_tibble(read_delim("data-raw/xenLae_LS_paralogs.csv", "\t", col_names = TRUE, na = c("", "NA")))
xenLae_paralogs <- xenLae_LS %>% filter(!is.na(xenLae_LS$L_chrom) & !is.na(xenLae_LS$S_chrom))

# Combining my data with Naqvi data
source("R/makeOrthologTable.R")
source("R/addSpecies.R")
tissues <- c("Brain")
sexes   <- c("Male",   "Male",   "Male",   "Male",   "Male",
             "Female", "Female", "Female", "Female", "Female")
refGeneNamesTPM    <- rownames(human_tpm)
refGeneNamesCounts <- rownames(human_counts)

final_data    <- addSpecies(Naqvi_metadata, Naqvi_tpm, Naqvi_counts, 
                            Naqvi_orth, xenLae_orth,
                            prefix, xenLae_tpm, xenLae_counts, tissues, sexes)

doakMetadata  <- final_data$metadata
doakTPM       <- final_data$TPM
doakCounts    <- final_data$counts
doakOrthologs <- final_data$orthologs

source("R/getConservedSexBiasedGenes.R")
conservedGenes <- getConservedSexBiasedGenes(doakMetadata, doakTPM, doakCounts)

usethis::use_data(doakMetadata, doakOrthologs, doakTPM, doakCounts, refGeneNamesTPM, refGeneNamesCounts, conservedGenes, overwrite = TRUE)
maddydoak/DoakThesis2020 documentation built on March 5, 2020, 12:53 a.m.