library(BiocStyle)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Downloading the count data

We obtain a copy of the processed counts from the GitHub repo of the Barton group.

library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
snf_data_path <- bfcrpath(bfc,
    file.path("https://github.com/bartongroup/profDGE48/raw",
              "375dc0d57d9d1fa96a4245a6530e0fda34305891/Preprocessed_data",
              "Snf2_countdata.tar.gz"))
wt_data_path <- bfcrpath(bfc, 
    file.path("https://github.com/bartongroup/profDGE48/raw",
              "375dc0d57d9d1fa96a4245a6530e0fda34305891/Preprocessed_data",
              "WT_countdata.tar.gz"))
bad_repl_path <- bfcrpath(bfc, 
    file.path("https://github.com/bartongroup/profDGE48/raw",
              "375dc0d57d9d1fa96a4245a6530e0fda34305891/Bad_replicate_identification",
              "exclude.lst"))
tmp_file_folder <- file.path(tempdir(), "/count_files")
untar(snf_data_path, exdir=tmp_file_folder)
untar(wt_data_path , exdir=tmp_file_folder)

Load all the files

library(dplyr)
library(stringr)
library(readr)
library(tidyr)
library(purrr)
library(tibble)

wt_cnts <- list.files(tmp_file_folder, pattern = "^WT") %>%
  map_df(function(file_name){
    read_tsv(file.path(tmp_file_folder, file_name), 
             col_names = c("GeneName", "Count"), col_types = "ci") %>%
      mutate(file_name = file_name,
             Condition = "wildtype",
             replicate = as.numeric(str_match(file_name, "rep(\\d{2})_")[,2]))
  })

ko_cnts <- list.files(tmp_file_folder, pattern = "^Snf") %>%
  map_df(function(file_name){
    read_tsv(file.path(tmp_file_folder, file_name), 
             col_names = c("GeneName", "Count"), col_types = "ci") %>%
      mutate(file_name = file_name,
             Condition = "knockout",
             replicate = as.numeric(str_match(file_name, "rep(\\d{2})_")[,2]))
  })

bad_replicates <- paste0(read_lines(bad_repl_path), ".gbgout")
bad_genes <- c("no_feature", "ambiguous", "too_low_aQual", 
               "not_aligned", "alignment_not_unique")
all_cnts <- bind_rows(wt_cnts, ko_cnts) %>%
  filter(! file_name %in% bad_replicates) %>%
  filter(! GeneName %in% bad_genes)

Save the count matrix and the column data separately:

cnt_mat <-  all_cnts %>%
  mutate(name = paste0(Condition, "_", sprintf("%02i", replicate))) %>%
  dplyr::select(GeneName, Count, name) %>%
  pivot_wider(GeneName, names_from = name, values_from = Count, 
              values_fill = list(Count = NA)) %>%
  column_to_rownames("GeneName") %>%
  as.matrix()

col_data <- all_cnts %>%
  dplyr::select(file_name, condition = Condition, replicate) %>%
  mutate(name = paste0(condition, "_", sprintf("%02i", replicate))) %>% 
  distinct %>%
  as.data.frame()
outpath <- file.path("final_data", "schurch")
dir.create(outpath, showWarnings=FALSE, recursive=TRUE)

saveRDS(cnt_mat, file.path(outpath, "count_matrix.rds"))
saveRDS(col_data, file.path(outpath, "column_data.rds"))

Session information

sessionInfo()


const-ae/HighlyReplicatedRNASeq documentation built on Jan. 19, 2021, 12:46 a.m.