R/create_strain_ids_with_experiment_count.R

Defines functions exp.type

library(tidyverse)
# load inputs -------------
# strain_ids: table read from strain_ids.csv; net.df: object stored in
# SGA_data_combined.rds.gz (one row per tested query/array allele pair is
# assumed -- TODO confirm).
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
net.df <- readRDS("analysis/data/derived_data/SGA_data_combined.rds.gz")

# collect allele names according to the experiment they appear in ---------------
# Unique query alleles of the NxN data source.
nones.query.names <- net.df %>%
  filter(data_source == "NxN") %>%
  pull(`Query allele name`) %>%
  unique()

# Unique array alleles of the NxN data source.
nones.array.names <- net.df %>%
  filter(data_source == "NxN") %>%
  pull(`Array allele name`) %>%
  unique()


# Unique query alleles of ExN_NxE rows screened against the TSA26/TSA30 arrays.
nxes.query.names <- net.df %>%
  filter(
    data_source == "ExN_NxE",
    `Arraytype/Temp` %in% c("TSA26", "TSA30")
  ) %>%
  pull(`Query allele name`) %>%
  unique()

# Query and array alleles of the ExE data source (duplicates are removed
# further down, when the vectors are merged).
essetial.query <- net.df %>%
  filter(data_source == "ExE") %>%
  pull(`Query allele name`)
essetial.array <- net.df %>%
  filter(data_source == "ExE") %>%
  pull(`Array allele name`)
# Unique query alleles of ExN_NxE rows that were NOT screened against the
# TSA26/TSA30 arrays; rows with a missing array type are dropped by filter().
exn.query <- net.df %>%
  filter(
    data_source == "ExN_NxE",
    `Arraytype/Temp` != "TSA26",
    `Arraytype/Temp` != "TSA30"
  ) %>%
  pull(`Query allele name`) %>%
  unique()
# Every allele treated as essential: ExE queries, ExE arrays and the
# non-TSA ExN_NxE queries, de-duplicated.
essential.alleles <- unique(
  c(
    essetial.query,
    essetial.array,
    exn.query
  )
)

# short aliases for the three allele sets
# na: NxN array alleles, nq: NxN query alleles, nxes: TSA-screened query alleles
nxes <- nxes.query.names
nq <- nones.query.names
na <- nones.array.names


# split the alleles into seven mutually exclusive groups, one per combination
# of roles (array, query, crossed against TSA arrays) in which they were tested
# -- present in all three sets
na.nq.nxes <- na %>%
  intersect(nq) %>%
  intersect(nxes)
# -- present in exactly two sets
na.nq <- na %>%
  intersect(nq) %>%
  setdiff(nxes)
na.nxes <- na %>%
  intersect(nxes) %>%
  setdiff(nq)
nq.nxes <- nq %>%
  intersect(nxes) %>%
  setdiff(na)
# -- present in exactly one set
na.only <- na %>%
  setdiff(nq) %>%
  setdiff(nxes)
nq.only <- nq %>%
  setdiff(na) %>%
  setdiff(nxes)
nxes.only <- nxes %>%
  setdiff(nq) %>%
  setdiff(na)

# calculate number of experiments for a gene -----------
library(igraph)
# Name the experiment-membership set that an allele belongs to.
#
# Arguments:
#   x           allele name (or vector of names) to look up.
#   na.nq, na.nxes, nq.nxes, na.nq.nxes, na.only, nq.only, nxes.only
#               the seven candidate sets, checked in exactly this order.
#
# Returns the name of the first set containing any element of `x`
# ("na.nq", "na.nxes", "nq.nxes", "na.nq.nxes", "na.only", "nq.only" or
# "nxes.only"), or NA when no set contains it.
#
# The sets are expected to be mutually exclusive; if they overlap, the first
# matching set (in argument order) wins rather than raising an error.
exp.type <- function(x, na.nq,
                     na.nxes,
                     nq.nxes,
                     na.nq.nxes,
                     na.only,
                     nq.only,
                     nxes.only) {
  sets <- list(
    na.nq = na.nq,
    na.nxes = na.nxes,
    nq.nxes = nq.nxes,
    na.nq.nxes = na.nq.nxes,
    na.only = na.only,
    nq.only = nq.only,
    nxes.only = nxes.only
  )
  # vapply guarantees one logical flag per set, whatever the inputs look like.
  is.member <- vapply(sets, function(members) any(x %in% members), logical(1))
  if (!any(is.member)) {
    # Plain (logical) NA is kept as the "no category" marker for callers.
    return(NA)
  }
  names(sets)[which(is.member)[1]]
}

# Build an undirected network from every tested allele pair; the degree of a
# node is then the number of times that allele was tested.
# NOTE(review): columns 2 and 4 of net.df are presumably the query and array
# allele names -- confirm; selecting them by name would be more robust.
all.nw <- graph_from_edgelist(as.matrix(net.df[, c(2, 4)]), directed = FALSE)
deg.all.nw <- degree(all.nw)

# Annotate every strain with its test count (num), experiment category (cat)
# and essential/nonessential label (maincat). Everything is computed on whole
# columns, so an empty strain table is handled without special cases.
exp.number.data <- strain_ids
allele.names <- exp.number.data$`Allele Gene name`

# num: degree of the allele in the network, 0 when the allele never occurs in
# it, NA when the strain has no allele name.
deg.idx <- match(allele.names, names(deg.all.nw))
test.counts <- unname(deg.all.nw[deg.idx])
test.counts[is.na(deg.idx)] <- 0
test.counts[is.na(allele.names)] <- NA
exp.number.data$num <- test.counts

# cat: name of the experiment-membership set of the allele, NA when it is in
# none of them. as.character() keeps the column type stable because
# exp.type() signals "no category" with a plain NA.
exp.number.data$cat <- vapply(
  allele.names,
  function(allele) {
    as.character(
      exp.type(
        allele, na.nq, na.nxes, nq.nxes, na.nq.nxes,
        na.only, nq.only, nxes.only
      )
    )
  },
  character(1),
  USE.NAMES = FALSE
)

exp.number.data$maincat <- ifelse(
  allele.names %in% essential.alleles,
  "essential",
  "nonessential"
)

# bin: index of the 100-wide interval (0,100], (100,200], ... that holds num.
# NOTE(review): with these breaks a count of exactly 0 (and anything above
# 12000) gets an NA bin -- confirm that this is intended.
exp.number.data$bin <- cut(
  exp.number.data$num,
  breaks = seq(0, 12000, 100),
  labels = FALSE
)


# Strains that fall into one of the experiment categories.
nones.exp.data <- exp.number.data[!is.na(exp.number.data$cat), ]


# save files ---------------
write_csv(nones.exp.data, "analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv")
write_csv(exp.number.data, "analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.