In cmatKhan/bartNP: Execute BART on TF perturbation data

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  echo = TRUE,
  include = TRUE,
  eval = FALSE
)

Setup

library(tidyverse)
library(gprofiler2)
library(rtracklayer)

Raw Data

As of now, the raw data originally published in @pmid24766815 is available through the lab's website. In this example, I am using deleteome_all_mutants_svd_transformed.txt which is the originally published data with an effort made to remove a slow-growth factor of unwanted variance @https://doi.org/10.15252/msb.20145172.

Parsing the kemmeren data

parse_kemmeren_data = function(kemmeren_svd){

  # read kemmeren_svd 
  kemmeren_data_mutants = read_tsv(kemmeren_svd)

  # add gene_id as the column name of the first column
  colnames(kemmeren_data_mutants)[1] = 'gene_id'

  # remove any 'matA' samples
  kemmeren_data_mutants = kemmeren_data_mutants %>%
    select(-all_of(
      colnames(kemmeren_data_mutants)[
        str_detect(colnames(kemmeren_data_mutants), "matA")]))

  # create a table with columns deleted_locus, which is extracted from the colnames 
  # of the kemmeren data, and another column with the corresponding column names
  kemmeren_col_data = tibble(
    deleted_locus = str_extract(
      colnames(kemmeren_data_mutants)[3:ncol(kemmeren_data_mutants)], ".+(?=\\.del)"), 
    colnames = colnames(kemmeren_data_mutants)[3:ncol(kemmeren_data_mutants)]
  )

  # use gprofilr2 to get the corresponding scerevisiae systematic IDs from the
  # gene names in the kemmeren column names
  kemmeren_cols_convert = gconvert(kemmeren_col_data$deleted_locus,
                               organism = "scerevisiae",
                               mthreshold = 1,
                               filter_na = FALSE)

  # create a dataframe which will act as the colData of the NetProphetDataSet
  coldata = kemmeren_col_data %>% left_join(
    select(kemmeren_cols_convert, -all_of(c('input_number', 'target_number'))),
    by = c("deleted_locus" = "input")
  ) %>%
    filter(!is.na(target)) %>%
    select(-deleted_locus) %>%
    dplyr::rename(deleted_locus = target, name = name)

  # return a list with slots coldata and expr. The expr matrix is the kemmeren 
  # data, but filtered for those columns in the coldata
  list(
      coldata = coldata,
      expr = kemmeren_data_mutants[,c('gene_id', 'commonName', coldata$colnames)]
  )

}

Read in s. cerevisiae annotations

Yeast Genome files may be found here

YEAST_GFF = "~/Desktop/rnaseq_pipeline/rnaseq_pipeline/genome_files/S288C_R64/S288C_R64.gff"

# list of top level gene features. Note that the kemmeren data does say that it
# is at the transcript level. However, yeast genes have few exons and these seem 
# to be mostly equivalent to genes. That said, I would suggest doing some investigation 
# on your own

GFF_GENE_FEATURES = c('gene', 'pseudogene', 'tRNA_gene', 
                      'rRNA_gene', 'ncRNA_gene', 'snoRNA_gene', 'snRNA_gene')

yeast_gff = import(YEAST_GFF)

yeast_genes = yeast_gff[yeast_gff$type %in% GFF_GENE_FEATURES]

Read in the list of regulators from the NP2 paper

Parse Uniprot data

To get the data I am using, and more, go to the Uniprot website and enter the following into the search:

taxonomy:"Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast) [559292]" AND reviewed:yes

Note that you can select a different set of columns if you click the 'column' button

UNIPROT_DATA = "data/uniprot_yeast_20211102.tab"

# lots of parsing -- provided as an example, but you will likely need to look 
# at the file and extract what you'd like. This will be required if you go to 
# the website and select/download a different set of columns

uniprot = read_tsv(UNIPROT_DATA) %>%
  dplyr::rename(ID = `Gene names  (ordered locus )`,
                dna_binding = `DNA binding`) %>%
  mutate(dna_binding_interval        = str_extract(dna_binding, "\\d+\\.\\.\\d+"),
         dna_binding_domain          = str_extract(dna_binding, '(?<=note=)".+"'),
         dna_binding_domain_evidence = str_extract(dna_binding, '(?<=evidence=)".+"')) %>%
  separate(dna_binding_interval, sep="\\.\\.", into=c("dna_binding_start", "dna_binding_end")) %>%
  mutate(dna_binding_start = as.integer(dna_binding_start),
         dna_binding_end = as.integer(dna_binding_end),
         dna_binding_domain = trimws(str_remove_all(dna_binding_domain, ';|"|/evidence.+')),
         dna_binding_domain_evidence = str_remove_all(dna_binding_domain_evidence, ';|"'),
         dna_binding_domain = str_replace(dna_binding_domain,"A\\.T hook 1 DNA_BIND 1502\\.\\.1513  /note=A.T hook 2 DNA_BIND 1516\\.\\.1526  /note=A\\.T hook 3", "A.T hook"))

colnames(uniprot) = trimws(str_remove_all(colnames(uniprot), "Cross-reference|\\(|\\)|"), which = "both")

Read in the list of regulators used in the NP2 paper

ROWNAMES_GENE_CONVERT = here("data/gProfiler_scerevisiae_11-1-2021_5-31-10.csv")
regulators_df = read_csv(ROWNAMES_GENE_CONVERT)

create kemmeren granges

regulators_uniprot = uniprot %>%
  filter(ID %in% regulators_df$initial_alias)

cant_find_id = setdiff(kemmeren_data$expr$gene_id, yeast_gff$ID)

# deleted if the annotation has since been removed or merged from the annotations
replace_wrong_ids        = c("YKL047W", "YLR003C", "YAR061W", "DELETED", "DELETED", "DELETED",   "DELETED", "DELETED")
names(replace_wrong_ids) = c("ANR2",    "CMS1",    "YAR062W", "YDL038C", "YGR272C",  "YIL080W",  "YIL168W", "YIR044C")


kemmeren_data$expr[which(kemmeren_data$expr$gene_id %in% names(replace_wrong_ids)), "gene_id"] = replace_wrong_ids

kemmeren_data$expr = kemmeren_data$expr %>%
  filter(gene_id != "DELETED")

setdiff(kemmeren_data$expr$gene_id, yeast_gff$ID[!is.na(yeast_gff$ID)])

kemmeren_granges = yeast_gff[yeast_gff$ID %in% kemmeren_data$expr$gene_id]

kemmeren_granges = kemmeren_granges[order(match(kemmeren_granges$ID,kemmeren_data$expr$gene_id))]

stopifnot(identical(kemmeren_data$expr$gene_id, kemmeren_granges$ID))

add gene-wise data to the granges object

kemmeren_granges[which(is.na(kemmeren_granges$gene))]$gene =
  kemmeren_data$expr[which(is.na(kemmeren_granges$gene)), 'commonName']

kemmeren_granges$regulator = ifelse(kemmeren_granges$ID %in% regulators_df$initial_alias,
                                    TRUE,
                                    FALSE)

uniprot_cols_for_granges = as_tibble(kemmeren_granges) %>%
  left_join(uniprot)

elementMetadata(kemmeren_granges)[colnames(uniprot)[3:ncol(uniprot)]] =
  uniprot_cols_for_granges[,colnames(uniprot)[3:ncol(uniprot)]]

kem_tfs = kemmeren_granges[!is.na(kemmeren_granges$dna_binding_domain)]

expr = as.matrix(select(kemmeren_data$expr, -gene_id, -commonName))
rownames(expr) = kemmeren_data$expr$gene_id

del_loci_no_gene = setdiff(kemmeren_data$coldata$deleted_locus,
                           kemmeren_granges[kemmeren_granges$ID %in% kemmeren_data$coldata$deleted_locus]$ID)

fltr_coldata = kemmeren_data$coldata[!kemmeren_data$coldata$deleted_locus %in% del_loci_no_gene, ]

fltr_expr = expr[,fltr_coldata$colnames]

create the regulation matrix

regulators = read_tsv("data/YEAST_DATA/YEAST/OUTPUT/regulators", col_names = FALSE)$X1

regulators = regulators[regulators %in% fltr_coldata$deleted_locus]

tf_fltr = kemmeren_granges$ID %in% regulators

regulation_matrix = matrix(TRUE, nrow(fltr_expr), nrow(fltr_expr),
                           dimnames = list(rownames(fltr_expr), rownames(fltr_expr)))
regulation_matrix = regulation_matrix[,rownames(fltr_expr) %in% regulators]

self_reg = intersect(rownames(regulation_matrix), colnames(regulation_matrix))

regulation_matrix[cbind(self_reg,self_reg)] = FALSE

create the NetProphetDataSet

kem_np = NetProphetDataSet(
  expr        = fltr_expr,
  regMatrix   = regulation_matrix,
  rowRanges   = kemmeren_granges,
  colData     = fltr_coldata
)

Result

cmatKhan/bartNP documentation built on Dec. 19, 2021, 5:16 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cmatKhan/bartNP
Execute BART on TF perturbation data

In cmatKhan/bartNP: Execute BART on TF perturbation data

Setup

Raw Data

Parsing the kemmeren data

Read in s. cerevisiae annotations

Read in the list of regulators from the NP2 paper

Parse Uniprot data

Read in the list of regulators used in the NP2 paper

create kemmeren granges

add gene-wise data to the granges object

create the regulation matrix

create the NetProphetDataSet

Result

R Package Documentation

Browse R Packages

We want your feedback!

cmatKhan/bartNP Execute BART on TF perturbation data

In cmatKhan/bartNP: Execute BART on TF perturbation data

Setup

Raw Data

Parsing the kemmeren data

Read in s. cerevisiae annotations

Read in the list of regulators from the NP2 paper

Parse Uniprot data

Read in the list of regulators used in the NP2 paper

create kemmeren granges

add gene-wise data to the granges object

create the regulation matrix

create the NetProphetDataSet

Result

R Package Documentation

Browse R Packages

We want your feedback!

cmatKhan/bartNP
Execute BART on TF perturbation data