# File: R/create_strain_ids.R

library(tidyverse)
# This script reads the SGA_data_combined data frame, which contains the
# Costanzo 2016 data, and creates a data frame containing the SGD systematic
# gene names and the allele names used in the experiments.
# It also removes suppressor mutation data, since we were not interested in those.


# Load the combined SGA data frame produced earlier in the analysis.
net.df <- readRDS("analysis/data/derived_data/SGA_data_combined.rds.gz")

# Both halves of the table are given the same two column names so they can be
# stacked into a single lookup table.
id_cols <- c("Systematic gene name", "Allele Gene name")

# Columns are addressed by position: 1-2 are taken as the query strain
# identifiers and 3-4 as the array strain identifiers.
# NOTE(review): this relies on the column order of SGA_data_combined; confirm
# it against the script that builds that file.
q.data <- net.df[, c(1, 2)] %>% distinct()
a.data <- net.df[, c(3, 4)] %>% distinct()
colnames(q.data) <- id_cols
colnames(a.data) <- id_cols

# Stack query and array identifiers, de-duplicate, and drop every row whose
# allele name contains "supp" (the suppressor mutation entries).
# `!grepl()` is used instead of `grepl() == F`: `F` is an ordinary variable
# that can be reassigned, and negation expresses the intent directly.
# grepl() returns FALSE for NA input, so rows with a missing allele name are kept.
strain_ids <- bind_rows(q.data, a.data) %>%
  distinct() %>%
  filter(!grepl("supp", `Allele Gene name`))

# Save the strain identifier table as CSV.
write_csv(strain_ids, "analysis/data/derived_data/strain_ids.csv")
# oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.