data-raw/gtex_bulk_matrix.R

library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(recount)
library(usethis)

dl_recount <- function(sra_id){
  if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) {
    download_study(sra_id)
  }
  load(file.path(sra_id, "rse_gene.Rdata"))
  # no longer need to downloaded data
  unlink(sra_id, recursive = TRUE)
  rse <- scale_counts(rse_gene)
  read_counts <- assay(rse, "counts")
  gene_ids <- rownames(read_counts)
  # get gene symbols, which are stored in rowData
  id2symbol <- data_frame(ids = rowData(rse_gene)$gene_id,
                          symbols = rowData(rse_gene)$symbol@listData) %>%
    mutate(symbols = map_chr(symbols, ~.x[1]))
  
  # clean up metadata into a dataframe
  print("cleaning up meta")
  mdata <- colData(rse)
  mdata_cols <- lapply(mdata$characteristics,
                       function(x){str_match(x, "^([^:]+):")[, 2]}) %>%
    unique() %>%
    unlist()
  
  mdata <- data_frame(run =  mdata$run,
                      all_data = as.list(mdata$characteristics)) %>%
    mutate(out = purrr::map_chr(all_data,
                                ~str_c(.x, collapse = "::"))) %>%
    tidyr::separate(out,
                    sep = "::",
                    into = mdata_cols) %>%
    select(-all_data) %>%
    mutate_at(.vars = vars(-matches("run")),
              .funs = function(x) str_match(x, ": (.+)")[, 2])
  
  # convert ids to symbols
  row_ids_to_symbols <- left_join(data_frame(ids = gene_ids),
                                  id2symbol, by = "ids")
  
  if(length(gene_ids) != nrow(row_ids_to_symbols)) {
    warning("gene id mapping to symbols produce more or less ids")
  }
  
  row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols))
  out_df <- read_counts %>%
    as.data.frame() %>%
    tibble::rownames_to_column("gene_id") %>%
    left_join(., row_ids_to_symbols,
              by = c("gene_id" = "ids")) %>%
    dplyr::select(-gene_id) %>%
    dplyr::select(symbols, everything()) %>%
    filter(!is.na(symbols))
  
  out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>%
    group_by(symbols, library) %>%
    summarize(expr = sum(expr)) %>%
    tidyr::spread(library, expr) %>%
    as.data.frame() %>%
    tibble::column_to_rownames("symbols") %>%
    as.matrix()
  
  list(read_counts = out_matrix,
       meta_data = mdata)
}

gtex_bulk_matrix <- dl_recount("SRP012682")

usethis::use_data(gtex_bulk_matrix, compress = "xz", overwrite = TRUE)
rnabioco/clustifyrdata documentation built on Nov. 8, 2022, 4:26 a.m.