R/interactionNetworkAnalysis.R

Defines functions give.n

set.seed(1)
## Inputs -------------
library(tidyverse)
library(igraph)
library(ggsignif)
library(cowplot)
source("R/pgs.subnetwork.R")
source("R/add.degree.R")
source("R/sample.nones.R")
source("R/plot_data.R")
#source('R/dist.sample.R')

#pgs <- read_csv("~/Main/pgsNetwork/analysis/data/derived_data/filtering/costanzoplusmypcc_0815.csv")
#pgs <- read_csv("~/Downloads/omersyntenyfilter.csv")
overlappingorfs <- read_csv("analysis/data/raw_data/overlappingorfs.csv")
net.df <- readRDS("analysis/data/derived_data/SGA_data_combined.rds.gz")
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
net_df_significant <- filter(net.df, `P-value` <= 0.05 & `Query Strain ID` %in% overlappingorfs$orf_name == FALSE & `Array Strain ID` %in% overlappingorfs$orf_name == FALSE)

net_df_significant_sl <- filter(net_df_significant, `Genetic interaction score (ε)` <= -0.2)#& `Double mutant fitness standard deviation`<=0.1)

net_df_significant_sl_035 <- filter(net.df, `P-value` <= 0.05 &`Genetic interaction score (ε)` <= -0.35)#& `Double mutant fitness standard deviation`<=0.1)

allele.frame <- readRDS("analysis/data/derived_data/df_different_alleles.rds")
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv")
exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")%>% mutate(group=ifelse(`Systematic gene name`%in%pgs$orf_name,'proto-gene',maincat))

try <- allele.frame # [allele.frame$orf_name%in%summary$orf_name,]
lengths <- lapply(try$different.alleles, length)
try.multiple <- try[lengths > 1, ]
try.single <- try[lengths == 1, ]
ess.names <- exp.data$`Systematic gene name`[exp.data$maincat=='essential']


## r non-essential proto-gene percentage
strong_orf_list <- net.df %>% filter(`P-value`<=0.05,`Genetic interaction score (ε)` <= -0.2) %>% select(`Query Strain ID`, `Array Strain ID`) %>% as.list() %>% unlist() %>% unique()
  #unique(c(net_df_significant_sl$`Query Strain ID`,net_df_significant_sl$`Array Strain ID`))
lethal_orf_list <- net.df %>% filter(`P-value`<=0.05,`Genetic interaction score (ε)` <= -0.35) %>% select(`Query Strain ID`, `Array Strain ID`) %>% as.list() %>% unlist() %>% unique()
exp.data <- exp.data %>% mutate(strong_interaction=ifelse(`Systematic gene name`%in%strong_orf_list,TRUE,FALSE),
                          lethal_interaction=ifelse(`Systematic gene name`%in%lethal_orf_list,TRUE,FALSE))

exp_data_nones=exp.data %>% filter(group!='essential') #%>% group_by(group,strong_interaction) %>% summarize(n=n())
table(exp_data_nones$group,exp_data_nones$strong_interaction) %>% fisher.test()
table(exp_data_nones$group,exp_data_nones$lethal_interaction) %>% fisher.test()

percent_plot <-
  exp_data_nones %>%
  select(group,strong_interaction) %>%
  group_by(group,strong_interaction) %>%
  summarise(str_count=n()) %>%
  mutate(freq = str_count / sum(str_count)) %>%
  filter(strong_interaction==TRUE) %>% mutate(cat='strong') %>% #%>% gather()
  bind_rows(
  exp_data_nones %>% select(group,lethal_interaction) %>% group_by(group,lethal_interaction) %>% summarise(leth_count=n()) %>% mutate(freq = leth_count / sum(leth_count)) %>% filter(lethal_interaction==TRUE) %>% mutate(cat='lethal')
  ) %>% mutate(cat=factor(cat,levels=c('strong','lethal'))) %>% ggplot(aes(x=cat,y=freq,fill=group))+
  geom_bar(stat='identity',position = position_dodge(width=0.95))+theme_me_bar+
  scale_fill_manual(labels=c('Proto-genes','Nonessential\nGenes'),values = c( "#1CBDC2", "#EF3D23"))+
  scale_x_discrete(labels=c(expression(epsilon*'<-0.35'),expression(epsilon*'<-0.2')))+
  ylab('Ratio with at least one\ninteraction at given threshold')+scale_y_continuous(labels=scales::percent)+xlab('')

ggsave(plot = percent_plot,filename = 'figures/Figure2B.pdf',width = 4,height = 4)
#%>% gather(leth_freq,str_freq,-group)
#exp.data %>% group_by(group,lethal_interaction) %>% summarize(n=n()) %>% filter(group!='essential') %>% spread(key=group,value=n) %>% fisher.test()


## connectivity -----------

rand_giant_nodes <- c()
rand_giant_edges <- c()
rand_giant_nodes_es <- c()
rand_giant_edges_es <- c()
giant_nodes <- c()
giant_edges <- c()
colname <- colnames(nones.exp.data)[1]
count <- 0
for (i in 1:20) {
  allnw.list <- c(unlist(try.single$different.alleles), unlist(lapply(try.multiple$different.alleles, sample, 1)))
  net.df.randomallele <- filter(net_df_significant_sl, `Query allele name` %in% allnw.list & `Array allele name` %in% allnw.list)

  net_edgelist <- as.matrix(net.df.randomallele[, c(1, 3)])

  pgs_allele <- strain_ids %>%
    filter(`Systematic gene name` %in% pgs) %>%
    select(`Allele Gene name`) %>%
    pull()
  allnet <- graph_from_edgelist(net_edgelist, directed = F)
  allnet <- simplify(allnet)
  nones.exp.data <- nones.exp.data %>%
    add.degree(allnet, colname) %>%
    dplyr::mutate(int_density = deg / num)

  exp.number.data.filtered <- exp.data %>%
    add.degree(allnet, colname) %>%
    mutate(int_density = deg / num) %>%
    filter(`Allele Gene name` %in% allnw.list)


  allnet.deg <- degree(allnet)
  allnet.btw <- betweenness(allnet, normalized = T)
  allnet.eig <- eigen_centrality(allnet)$vector


  nn <- pgs.subnetwork(allnet, pgs$orf_name)
  giant_comp_nn <- induced_subgraph(nn, V(nn)[components(nn)$membership == which.max(components(nn)$csize)])
  # giant_nodes <- append(giant_nodes,vcount(giant_comp_nn))
  # giant_edges <- append(giant_edges,ecount(giant_comp_nn))
  if(vcount(nn)==vcount(giant_comp_nn)){
    count <- count+1
  }
  for (t in 1:100) {
    giant_nodes <- append(giant_nodes, vcount(giant_comp_nn))
    giant_edges <- append(giant_edges, ecount(giant_comp_nn))
    s <- sample.nones(pgs$orf_name[pgs$orf_name %in% V(allnet)$name], nones.exp.data %>% filter(!!sym(colname)%in%V(allnet)$name), colname,dist=T)
    nn_re <- pgs.subnetwork(allnet, s)
    giant_comp_nn_re <- induced_subgraph(nn_re, V(nn_re)[components(nn_re)$membership == which.max(components(nn_re)$csize)])

    rand_giant_nodes <- append(rand_giant_nodes, vcount(giant_comp_nn_re))
    rand_giant_edges <- append(rand_giant_edges, ecount(giant_comp_nn_re))
    ess.s <-  sample.nones(pgs$orf_name[pgs$orf_name %in% V(allnet)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(allnet)$name& (!!sym(colname)%in%ess.names)|(!!sym(colname)%in%pgs$orf_name[pgs$orf_name %in% V(allnet)$name])),colname,array=FALSE,dist=T)

    #sample(V(allnet)$name[V(allnet)$name %in% ess.names$X1], length(pgs$orf_name[pgs$orf_name %in% V(allnet)$name]))
    nn_es <- pgs.subnetwork(allnet, ess.s)
    giant_comp_nn_es <- induced_subgraph(nn_es, V(nn_es)[components(nn_es)$membership == which.max(components(nn_es)$csize)])
    rand_giant_nodes_es <- append(rand_giant_nodes_es, vcount(giant_comp_nn_es))
    rand_giant_edges_es <- append(rand_giant_edges_es, ecount(giant_comp_nn_es))
  }
}

# boxplot(giant_nodes,rand_giant_nodes,rand_giant_nodes_es)
# boxplot(giant_edges,rand_giant_edges,rand_giant_edges_es)
giant_comp_df <- bind_rows(data.frame(data = giant_nodes, group = "proto-gene"), data.frame(data = rand_giant_nodes, group = "nonessential"), data.frame(data = rand_giant_nodes_es, group = "essential"))
giant_comp_df <- bind_rows(data.frame(data = giant_edges, group = "proto-gene"), data.frame(data = rand_giant_edges, group = "nonessential"), data.frame(data = rand_giant_edges_es, group = "essential"))
giant_comp_df$group <- factor(giant_comp_df$group, levels = c("proto-gene", "nonessential", "essential"))
# ggplot(giant_comp_df,aes(x=group,y=data,fill=group))+geom_boxplot()
p12 <- 2 * sum(abs(giant_comp_df$data[giant_comp_df$group == "proto-gene"]) - abs(giant_comp_df$data[giant_comp_df$group == "nonessential"]) >= 0) / length(giant_comp_df$data[giant_comp_df$group == "proto-gene"])#0.026
p13 <- 2 * sum(abs(giant_comp_df$data[giant_comp_df$group == "proto-gene"]) - abs(giant_comp_df$data[giant_comp_df$group == "essential"]) >= 0) / length(giant_comp_df$data[giant_comp_df$group == "proto-gene"])#0
p23 <- 2 * sum(abs(giant_comp_df$data[giant_comp_df$group == "nonessential"]) - abs(giant_comp_df$data[giant_comp_df$group == "essential"]) >= 0) / length(giant_comp_df$data[giant_comp_df$group == "proto-gene"])#0.157

# function for number of observations
give.n <- function(x){
  return(c(y = median(x)*1.05, label = length(x)))
  # experiment with the multiplier to find the perfect position
}

ggplot(giant_comp_df, aes(x = group, y = data, group = group, fill = group)) + geom_boxplot(outlier.shape = NA) + scale_x_discrete(labels = c("Proto-genes", "Non-essential Genes", "Essential Genes")) +
  ylab("Connectivity") + xlab("") + theme_me_bar+ scale_fill_manual(values = c("#1CBDC2", "#EF3D23", "#FAA51A"))+theme(legend.position='none')# +
ggsave('figures/Figure3B.pdf',width = 3.5,height = 3.5)

## r random node selection degree-betweenness-trans-------------


sim_n=10
essentials <- c(net_df_significant$`Query Strain ID`[net_df_significant$data_source=='ExE'],net_df_significant$`Array Strain ID`[net_df_significant$data_source=='ExE']) %>% unique()#net.df%>%filter(data_source=='ExE')
rand_degree <- vector("list",sim_n)
rand_betweenness <- vector("list",sim_n)
rand_trans <- vector("list",sim_n)
rand_ess_degree <- vector("list",sim_n)
rand_ess_betweenness <- vector("list",sim_n)
rand_ess_trans <- vector("list",sim_n)
pgs_degree <- vector("list",sim_n)
pgs_betweenness <- vector("list",sim_n)
pgs_trans <- vector("list",sim_n)
colname <- colnames(nones.exp.data)[1]
pgs_ <- pgs#[pgs$orf_name%in%nones.exp.data$`Systematic gene name`[nones.exp.data$cat=='nxes.only'],]
for (i in 1:sim_n) {
  allnw.list <- c(unlist(try.single$different.alleles), unlist(lapply(try.multiple$different.alleles, sample, 1)))
  net.df.randomallele <- filter(net_df_significant_sl, `Query allele name` %in% allnw.list & `Array allele name` %in% allnw.list)

  net_edgelist <- as.matrix(net.df.randomallele[, c(1, 3)])

  pgs_allele <- strain_ids %>%
    filter(`Systematic gene name` %in% pgs) %>%
    select(`Allele Gene name`) %>%
    pull()
  allnet <- graph_from_edgelist(net_edgelist, directed = F)
  allnet <- simplify(allnet)
  nones.exp.data <- nones.exp.data %>%
    add.degree(allnet, colname) %>%
    dplyr::mutate(int_density = deg / num)

  exp.number.data.filtered <- exp.data %>%
    add.degree(allnet, colname) %>%
    mutate(int_density = deg / num) %>%
    filter(`Allele Gene name` %in% allnw.list)


  allnet.deg <- degree(allnet)
  allnet.btw <- betweenness(allnet, normalized = T,weights = NA)
  # allnet.eig <- eigen_centrality(allnet)$vector
  allnet.trans <- transitivity(allnet,'local')
  names(allnet.trans) <- names(allnet.deg)
  # nn<-pgs.subnetwork(allnet, pgs$orf_name)
  # giant_comp_nn <- induced_subgraph(nn,V(nn)[components(nn)$membership==which.max(components(nn)$csize)])
  # giant_nodes <- append(giant_nodes,vcount(giant_comp_nn))
  # giant_edges <- append(giant_edges,ecount(giant_comp_nn))
  for (t in 1:100) {
    #cat=='nxes.only'&
    s <- sample.nones(pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name], nones.exp.data %>% filter(!!sym(colname)%in%V(allnet)$name), colname,dist=F,array=T)
    s.dist <- sample.nones(pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name], nones.exp.data %>% filter(!!sym(colname)%in%V(allnet)$name), colname,dist=T)
    ess.s <-  sample.nones(pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(allnet)$name& (!!sym(colname)%in%ess.names)|(!!sym(colname)%in%pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name])),colname,array=FALSE,dist=F)
    ess.s.dist <- sample.nones(pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(allnet)$name& (!!sym(colname)%in%ess.names)|(!!sym(colname)%in%pgs_$orf_name[pgs_$orf_name %in% V(allnet)$name])),colname,array=FALSE,dist=T)

    rand_degree[[i]][t] <- list(allnet.deg[names(allnet.deg)%in%s])
    rand_betweenness[[i]][t] <- list(allnet.btw[names(allnet.btw)%in%s.dist])
    rand_trans[[i]][t] <- list(allnet.trans[names(allnet.trans)%in%s.dist])
    pgs_degree[[i]][t] <- list(allnet.deg[names(allnet.deg)%in%pgs_$orf_name])
    pgs_betweenness[[i]][t] <- list(allnet.btw[names(allnet.btw)%in%pgs_$orf_name])
    pgs_trans[[i]][t] <- list(allnet.trans[names(allnet.trans)%in%pgs_$orf_name])
    rand_ess_degree[[i]][t] <- list(allnet.deg[names(allnet.deg)%in%ess.s])
    rand_ess_betweenness[[i]][t] <- list(allnet.btw[names(allnet.btw)%in%ess.s.dist])
    rand_ess_trans[[i]][t] <- list(allnet.trans[names(allnet.trans)%in%ess.s.dist])

  }

}

rand_degree_mean <- map(rand_degree,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_degree <- unlist(rand_degree)

rand_betweenness_mean <- map(rand_betweenness,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_betweenness <- unlist(rand_betweenness)

rand_trans_mean <- map(rand_trans,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_trans <- unlist(rand_trans)

rand_ess_degree_mean <- map(rand_ess_degree,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_ess_degree <- unlist(rand_ess_degree)

rand_ess_betweenness_mean <- map(rand_ess_betweenness,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_ess_betweenness <- unlist(rand_ess_betweenness)

rand_ess_trans_mean <- map(rand_ess_trans,function(x) map(x,mean,na.rm=T)) %>% unlist()
rand_ess_trans <- unlist(rand_ess_trans)

pgs_degree_mean <- map(pgs_degree,function(x) map(x,mean, na.rm=T)) %>% unlist()
pgs_degree <- unlist(pgs_degree)

pgs_betweenness_mean <- map(pgs_betweenness,function(x) map(x,mean, na.rm=T)) %>% unlist()
pgs_betweenness <- unlist(pgs_betweenness)

pgs_trans_mean <- map(pgs_trans,function(x) map(x,mean, na.rm=T)) %>% unlist()
pgs_trans <- unlist(pgs_trans)



## r degree-betweenness-trans plots------------
#degree p=0.003
#btw p=0.339
#trans p=0.106
# d1=plot_data(pgs_degree_mean,rand_degree_mean,rand_ess_degree_mean,pgs_degree,rand_degree,rand_ess_degree,title = 'degree')[[1]]
# d2=plot_data(pgs_betweenness_mean,rand_betweenness_mean,rand_ess_betweenness_mean,pgs_betweenness,rand_betweenness,rand_ess_betweenness,binwidth = 0.0001,xlim=0,title = 'betweenness')[[1]]
# d3=plot_data(pgs_trans_mean,rand_trans_mean,rand_ess_trans_mean,pgs_trans,rand_trans,rand_ess_trans,binwidth = 0.1,xlim=0,title = 'transitivity')[[1]]
#
# plot_grid(d1,d2,d3,ncol=3)


d1=plot_data_ecdf(pgs_degree,rand_degree,rand_ess_degree,title = 'Degree',binwidth=2,logscale = T)
d2=plot_data_ecdf(pgs_betweenness,rand_betweenness,rand_ess_betweenness,binwidth = 0.001,xlim=0,title = 'Betweenness',logscale = T)
d3=plot_data_ecdf(pgs_trans,rand_trans,rand_ess_trans,binwidth = 0.1,xlim=0,title = 'Transitivity',logscale = T)

legend = get_legend(d1+theme(legend.box = 'horizontal',legend.justification = 'center'))
prow = plot_grid(d1+theme(legend.position = 'none'),
                 d2+theme(legend.position = 'none'),
                 d3+theme(legend.position = 'none'),
                 align = 'vh',
                 hjust = -1,
                 nrow = 1)

pdf('figures/Figure4A_C.pdf',width = 7,height = 3)
plot_grid(prow,legend,ncol=1,rel_heights = c(1,.1))
dev.off()
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.