R/enm_analysis.R

set.seed(1)
## Read Inputs files----------------
library(tidyverse)
library(igraph)
library(ggsignif)
library(cowplot)
library(data.table)
source("R/pgs.subnetwork.R")
source("R/add.degree.R")
source("R/sample.nones.R")
source("R/plot_data.R")
getwd()
#set.seed(1)
#pgs <- read_csv("~/Main/pgsNetwork/analysis/data/derived_data/filtering/costanzoplusmypcc_0815.csv")
#read_csv("~/Main/pgsNetwork/data/0701_proto-genes_list")
#pgs <- read_csv("~/Downloads/omersyntenyfilter.csv")
overlappingorfs <- read_csv("analysis/data/raw_data/overlappingorfs.csv")
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
costanzo=T
if(costanzo==F){
  #read my pcc values

  ndund1='analysis/data/derived_data/pcc_calculations_rcpp/nodelist_und1'
  und1pcc <- 'analysis/data/derived_data/pcc_calculations_rcpp/und12avgpcc'
  ndavgnorm <- 'analysis/data/derived_data/pcc_calculations_rcpp/ndlistavgpcc_calc_normalized_matrix.csv'
  avgnorm <- 'analysis/data/derived_data/pcc_calculations_rcpp/avgpcc_calc_normalized_matrix.csv'
  pcc0703 <- 'analysis/data/derived_data/pcc_calculations_rcpp/pcc_calc_0703'
  ndlist0703 <- 'analysis/data/derived_data/pcc_calculations_rcpp/nodelist_0703'
  pccnorm <- 'analysis/data/derived_data/pcc_calculations/pcc_calc_normalized_matrix.csv'
  ndlistpccnorm <- 'analysis/data/derived_data/pcc_calculations/ndlistpcc_calc_normalized_matrix.csv'
  # ndlist <- fread(ndlist_fname,header=F)
  # adj_res <- fread(adj_res_fname,header=F)
  adj_res <- fread(pcc0703,header=F)
  ndlist <- fread(ndlist0703,header=F)

  adj_res <- as.matrix(adj_res)
  colnames(adj_res) <- ndlist$V1
  rownames(adj_res) <- ndlist$V1

  adj_res_graph <- graph_from_adjacency_matrix(adj_res,weighted=TRUE,mode='undirected',diag=F)
  g <- delete.vertices(adj_res_graph,which(str_detect(V(adj_res_graph)$name,'supp')|V(adj_res_graph)$name%in%strain_ids$`Allele Gene name`[strain_ids$`Systematic gene name`%in%overlappingorfs$orf_name]))


  g_final=delete.edges(g, which(E(g)$weight <.2|is.na(E(g)$weight)|is.nan(E(g)$weight)))

  g_final <- delete.vertices(g_final,which(degree(g_final)<1 ))
  pcc.net <- g_final#graph_from_data_frame(df.pcc,F)
  #E(pcc.net)$weight <- df.pcc$pcc
  nw.del<-igraph::simplify(pcc.net,remove.loops = T)
}else{
  costanzo_matrix_data <- fread('analysis/data/raw_data//pcc_ALL.txt',header=T)
  costanzo_matrix <- costanzo_matrix_data[-1,-c(1,2)]%>%as.matrix()
  rownames(costanzo_matrix) <- colnames(costanzo_matrix)
  class(costanzo_matrix) <- "numeric"
  adj_res_graph <- graph_from_adjacency_matrix(costanzo_matrix,weighted=TRUE,mode='undirected',diag=F)
  g <- delete.vertices(adj_res_graph,which(str_detect(V(adj_res_graph)$name,'supp')|V(adj_res_graph)$name%in%strain_ids$`Allele Gene name`[strain_ids$`Systematic gene name`%in%overlappingorfs$orf_name]))#
  # load('~/Main/anne/network_analysis/df.pcc.Rdata')
  # df.pcc <- df.pcc %>% filter((pcc)>=0.2) %>% drop_na()

  g_final=delete.edges(g, which(E(g)$weight <.2|is.na(E(g)$weight)|is.nan(E(g)$weight)))

  g_final <- delete.vertices(g_final,which(degree(g_final)<1 ))
  pcc.net <- g_final#graph_from_data_frame(df.pcc,F)
  # E(pcc.net)$weight <- df.pcc$pcc
  nw.del<-igraph::simplify(pcc.net,remove.loops = T)
}



allele.frame <- readRDS("analysis/data/derived_data/df_different_alleles.rds")
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv")
exp.number.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")%>% mutate(group=ifelse(`Systematic gene name`%in%pgs$orf_name,'proto-gene',maincat))
try <- allele.frame # [allele.frame$orf_name%in%summary$orf_name,]
lengths <- lapply(try$different.alleles, length)
try.multiple <- try[lengths > 1, ]
try.single <- try[lengths == 1, ]
ess.names <- read_csv("../anne/network_analysis/0530essential/ess_names.csv", col_names = F)
essentials <-  exp.number.data$`Allele Gene name`[exp.number.data$maincat=='essential']
#read_csv("../anne/network_analysis/0530essential/ess_alleles.csv", col_names = F) %>% dplyr::select(X1) %>% pull()

# Simulation -----------
enm_res <- read_tsv('analysis/data/derived_data/enm/sqfs_500modes.tsv',col_names = c('gene1','sqfs'))
sim_n=10
rand_enm <- vector("list",sim_n)
rand_ess_enm <- vector("list",sim_n)
pgs_enm <- vector("list",sim_n)
colname = colnames(nones.exp.data)[2]
for (i in 1:sim_n) {
  allnw.list <- c(unlist(try.single$different.alleles), unlist(lapply(try.multiple$different.alleles, sample, 1)))

  pgs_allele <- strain_ids %>%
    filter(`Systematic gene name` %in% pgs$orf_name) %>%
    select(`Allele Gene name`) %>%
    pull()
  allnet <- induced.subgraph(nw.del, V(nw.del)[V(nw.del)$name%in%allnw.list]) %>%
    delete.vertices(which(degree(.)==0))

  nones.exp.data <- nones.exp.data %>%
    add.degree(allnet, colname) %>%
    dplyr::mutate(int_density = deg / num)

  exp.number.data.filtered <- exp.number.data %>%
    add.degree(allnet, colname) %>%
    mutate(int_density = deg / num) %>%
    filter(`Allele Gene name` %in% allnw.list)


  allnet.deg <- degree(allnet)
  allnet.btw <- betweenness(allnet, weights = NA,normalized = T)
  # allnet.eig <- eigen_centrality(allnet)$vector
  allnet.trans <- transitivity(allnet,'local')
  names(allnet.trans) <- names(allnet.deg)
  # nn<-pgs.subnetwork(allnet, pgs$orf_name)
  # giant_comp_nn <- induced_subgraph(nn,V(nn)[components(nn)$membership==which.max(components(nn)$csize)])
  # giant_nodes <- append(giant_nodes,vcount(giant_comp_nn))
  # giant_edges <- append(giant_edges,ecount(giant_comp_nn))
  for (t in 1:100) {
    #s <- sample.nones(pgs_allele[pgs_allele %in% V(allnet)$name], nones.exp.data %>% filter(!!sym(colname)%in%V(allnet)$name), colname,dist=F)
    s.dist <- sample.nones(pgs_allele[pgs_allele %in% V(allnet)$name], nones.exp.data %>% filter(!!sym(colname)%in%V(allnet)$name), colname,dist=T)
    #ess.s <-  sample.nones(pgs_allele[pgs_allele %in% V(allnet)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(allnet)$name& (!!sym(colname)%in%essentials)|(!!sym(colname)%in%pgs_allele[pgs_allele%in% V(allnet)$name])),colname,array=FALSE,dist=F)
    ess.s.dist <- sample.nones(pgs_allele[pgs_allele %in% V(allnet)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(allnet)$name& (!!sym(colname)%in%essentials)|(!!sym(colname)%in%pgs_allele[pgs_allele %in% V(allnet)$name])),colname,array=FALSE,dist=T)

    rand_enm[[i]][t] <- list(enm_res %>% filter(gene1%in%s.dist) %>% select(sqfs) %>% pull())
    pgs_enm[[i]][t] <- list(enm_res %>% filter(gene1%in%pgs_allele) %>% select(sqfs) %>% pull())
    rand_ess_enm[[i]][t] <- list(enm_res %>% filter(gene1%in%ess.s.dist) %>% select(sqfs) %>% pull())

  }
  print(i)
}

rand_enm_mean <- map(rand_enm,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_enm <- unlist(rand_enm)


rand_ess_enm_mean <- map(rand_ess_enm,function(x) map(x,mean, na.rm=T)) %>% unlist()
rand_ess_enm <- unlist(rand_ess_enm)


pgs_enm_mean <- map(pgs_enm,function(x) map(x,mean, na.rm=T)) %>% unlist()
pgs_enm <- unlist(pgs_enm)



## r enm-betweenness-trans plots------------

d1=plot_data(pgs_enm_mean,rand_enm_mean,rand_ess_enm_mean,pgs_enm,rand_enm,rand_ess_enm,title = 'enm',binwidth=0.1)[[1]]
d1
#plot_grid(d1,d2,d3,ncol=3)


## r enm-betweenness-trans plots ecdf-----------
d1=plot_data_ecdf(pgs_enm,rand_enm,rand_ess_enm,title = 'enm',binwidth=0.1,logscale = T)
strain_ids <- read_csv('analysis/data/derived_data/strain_ids_with_experiment_count_all.csv') %>% mutate(group  = ifelse(`Systematic gene name`%in%pgs$orf_name,'proto-gene',maincat))
enm_cat <- enm_res %>% filter(gene1%in%strain_ids$`Allele Gene name`) %>% left_join(strain_ids,by=c('gene1'='Allele Gene name'))
ggplot(enm_cat,aes(x=group,y=sqfs))+stat_summary(fun.y='mean',geom='bar')+stat_summary(fun.data='mean_se',geom='errorbar')
#ggplot(enm_cat,aes(x=sqfs,y=..density..,color=group))+geom_freqpoly(binwidth=0.5)
ggplot(enm_cat,aes(x=sqfs))+geom_bar(stat="density" )
p_ess_pgs = coin::independence_test(group~sqfs,enm_cat%>%filter(group!='nonessential')%>%mutate(group=factor(group)))
p_nones_pgs = coin::independence_test(group~sqfs,enm_cat%>%filter(group!='essential'&gene1!='yil086c'&gene1!='ylr412c-a')%>%mutate(group=factor(group)))
p_ess_nones = coin::independence_test(group~sqfs,enm_cat%>%filter(group!='proto-gene')%>%mutate(group=factor(group)))


## compare node by node
net = nw.del
all_deg = degree(net)
pgs_data_list = vector('numeric', length(pgs_allele))
compared_mean_list = vector('numeric', length(pgs_allele))
pgs_deg = vector('numeric',length(pgs_allele))
se_list = vector('numeric',length(pgs_allele))
for(i in seq_along(pgs_allele)){
  orf_name = pgs_allele[i]
  if(orf_name%in%V(net)$name==F) next
  orf_deg = all_deg[names(all_deg)==orf_name]
  same_deg_list = names(all_deg)[all_deg==orf_deg&names(all_deg)!=orf_name&names(all_deg)%in%strain_ids$`Allele Gene name`[strain_ids$group=='nonessential']]
  orf_sqfs = enm_res %>% filter(gene1 == orf_name) %>% select(sqfs) %>% pull()
  same_deg_sqfs =same_deg_mean_sqfs = enm_res %>% filter(gene1%in%same_deg_list) %>% select(sqfs) %>% pull()
  same_deg_mean_sqfs = same_deg_sqfs %>%  mean(na.rm=T)
  se = sd(same_deg_sqfs)/length(same_deg_sqfs)
  pgs_data_list[i]=orf_sqfs
  compared_mean_list[i]=same_deg_mean_sqfs
  pgs_deg[i] = orf_deg
  se_list[i] = se
}

df = tibble(orf_name = pgs_allele, sqfs = pgs_data_list, mean = compared_mean_list, se = se_list, degree = pgs_deg)
ggplot(df %>% filter(degree>1),aes(x=orf_name))+
  geom_point(aes(y=sqfs),stat='identity',color='red')+
  geom_point(aes(y=mean), stat="identity", color="black",
           position=position_dodge(.9)) +
  geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.2,
                position=position_dodge(.9))# +ylim(NA,2)
ggsave('analysis/figures/0904_enm/enm_compare_meanse.pdf',width = 20,height = 12)
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.