R/linkcomanalysis.R

Defines functions plotBoxSign

library(plyr)
library(igraph)
library(tidyverse)

## Funcs-------------
library(ggsignif)
# saveRDS(cl_mean_df_list, "../anne/mcl/cl_mean_df_list.rds")
# saveRDS(clusterdf, "../anne/mcl/clusterdf.rds")
# rand_linkcom_df <- readRDS('../anne/linkcom/rand_linkcom_withdegree_df.rds')

plotBoxSign <- function(data, itr, signf = F) {
  data$group <- factor(data$group, levels = c("proto-gene", "nonessential", "essential"))
  data <- data %>% filter(i == itr)
  p12 <- 2 * sum(data$data[data$group == "proto-gene"] < data$data[data$group == "nonessential"]) / length(data$data[data$group == "proto-gene"])
  p13 <- 2 * sum(data$data[data$group == "proto-gene"] < data$data[data$group == "essential"]) / length(data$data[data$group == "proto-gene"])
  p23 <- 2 * sum(data$data[data$group == "nonessential"] < data$data[data$group == "essential"]) / length(data$data[data$group == "proto-gene"])
  if (signf) {
    ggplot(data, aes(x = group, y = data, group = group, fill = group)) + geom_boxplot() + scale_x_discrete(labels = c("Proto-gene", "Non-essential Genes", "Essential Genes")) + theme_bw() +
      ylab("Mean") + xlab("") + theme_me_bar + scale_fill_manual(name = "", values = c("#1CBDC2", "#EF3D23", "#FAA51A")) + xlab("") + ylab("Mean number of different clusters") +
      geom_signif(
        annotations = c(formatC(p12, digits = 3), formatC(p13, digits = 3), formatC(p23, digits = 3)),
        y_position = c(max(data$data) + 1, max(data$data) + 2, max(data$data) + 3), xmin = c(1, 1, 2), xmax = c(2, 3, 3)
      ) + ggtitle(itr)
  } else {
    ggplot(data, aes(x = group, y = data, group = group, fill = group)) + geom_boxplot() + scale_x_discrete(labels = c("Proto-gene", "Non-essential Genes", "Essential Genes")) + theme_bw() +
      ylab("Mean") + xlab("") + theme_me_bar + scale_fill_manual(name = "", labels = element_blank(), values = c("#1CBDC2", "#EF3D23", "#FAA51A")) + xlab("") + ylab("Mean number of different clusters") +
      ggtitle(itr)
  }
}


## Input------------

#pgs <- read_csv("~/Main/pgsNetwork/analysis/data/derived_data/filtering/costanzoplusmypcc_0815.csv")
allele.frame <- readRDS("analysis/data/derived_data/df_different_alleles.rds")
source("R/dist.sample.R")
source("R/add.degree.R")
source("R/sample.nones.R")
set.seed(1)
# allele.frame <- readRDS('analysis/data/derived_data/df_different_alleles.rds')
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv") # %>% add.degree(pcc_net, colnames(.)[2])
exp.number.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
essentials <- read_csv("../anne/network_analysis/0530essential/ess_alleles.csv", col_names = F) %>%
  dplyr::select(X1) %>%
  pull()
# deg.pcc <- degree(pcc_net)
linkcom_df <- data.frame()
rand_linkcom_df <- data.frame()
colname <- colnames(nones.exp.data)[2]
pgs.alleles <- strain_ids %>%
  filter(`Systematic gene name` %in% pgs$orf_name) %>%
  select(`Allele Gene name`) %>%
  pull()


## Analysis-------------

for (itr in seq(0.1, 0.9, 0.1)) {
  # print(itr)
  # fname <- paste0("../anne/linkcom/mypcc/network.tsv_", itr, ".el")
  fname <- paste0("../anne/linkcom/linkcom_res/pcc_elist_", itr, ".el")

  pcc <- suppressMessages(read_delim(fname, delim = " ", col_names = F))
  largeclusters <- pcc %>%
    group_by(X3) %>%
    dplyr::summarise(n = dplyr::n()) %>%
    filter(n > 2) %>%
    dplyr::select(X3)

  pcc_clfiltered <- pcc # %>% filter(X3 %in% largeclusters$X3)
  pcc_net <- graph_from_edgelist(as.matrix(pcc_clfiltered[c(1, 2)]), directed = F)
  E(pcc_net)$linkcom <- pcc_clfiltered[, 3] %>% pull()
  nums <- pcc %>%
    group_by(X3) %>%
    dplyr::summarise(n = dplyr::n())
  # pgs.alleles <- unlist(allele.frame$different.alleles[allele.frame$orf_name %in% pgs$orf_name])
  linkcom_df <- bind_rows(linkcom_df, data.frame(
    i = itr,
    numberofclusters = nrow(nums),
    cl1 = sum(nums$n == 1),
    cl2 = sum(nums$n == 2),
    cl3 = sum(nums$n == 3),
    cl4 = sum(nums$n == 4),
    cl5 = sum(nums$n > 4)
  ))
  # pcc_net_neigh <- neighborhood(pcc_net, order = 1,nodes = V(pcc_net)[V(pcc_net)$name %in% pgs.alleles]) %>% map(function(x) induced_subgraph(pcc_net, x))
  nones.exp.data <- nones.exp.data %>% add.degree(pcc_net, colnames(.)[2])
  exp.number.data.filtered <- exp.number.data %>%
    add.degree(pcc_net, colname) %>%
    mutate(int_density = deg / num)
  different_clusters <- map(V(pcc_net)$name, function(x) length(unique(incident_edges(pcc_net, V(pcc_net)[V(pcc_net)$name == x])[[1]]$linkcom)))
  # numberofedges=map(V(pcc_net)$name, function(x) length(incident_edges(pcc_net,V(pcc_net)[V(pcc_net)$name==x])[[1]]$linkcom))
  names(different_clusters) <- V(pcc_net)$name



  # deg_2 <- deg.pcc[deg.pcc>1]
  rand.s <- c()
  ess.rand.s <- c()
  pgs.s <- c()
  # pgs.s <- different_clusters[names(different_clusters)%in%pgs.alleles] %>% unlist()#[pgs.alleles%in%names(deg_2)]
  for (i in 1:100) {
    pgs.s <- append(pgs.s, mean(different_clusters[names(different_clusters) %in% pgs.alleles] %>% unlist())) # [pgs.alleles%in%names(deg_2)]

    s <- sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name], nones.exp.data %>% filter(`Allele Gene name` %in% V(pcc_net)$name), colname, dist = T, T, T)
    rand.s <- append(rand.s, mean(different_clusters[names(different_clusters) %in% s] %>% unlist())) # [s%in%names(deg_2)]
    # ess.rand.s <- append(ess.rand.s, mean(different_clusters[names(different_clusters) %in% (ess$X1[ess$X1 %in% V(pcc_net)$name] %>% sample(length(pgs.alleles[pgs.alleles %in% V(pcc_net)$name])))] %>% unlist()))
    # hst <- deg.pcc[names(deg.pcc)%in%pgs.alleles] %>% hist(plot=F,breaks=-1:max(.))
    ess.s <- sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name], exp.number.data.filtered %>% filter(!!sym(colname) %in% V(pcc_net)$name & (!!sym(colname) %in% essentials) | (!!sym(colname) %in% pgs.alleles[pgs.alleles %in% V(pcc_net)$name])), colname, array = FALSE, dist = T)

    ess.rand.s <- append(ess.rand.s, different_clusters[names(different_clusters) %in% ess.s] %>% unlist() %>% mean(na.rm = T))
  }
  smdf <- bind_rows(
    data.frame(group = "proto-gene", data = pgs.s, i = itr),
    data.frame(group = "nonessential", data = rand.s, i = itr),
    data.frame(group = "essential", data = ess.rand.s, i = itr)
  )
  # print(smdf)
  rand_linkcom_df <- bind_rows(
    rand_linkcom_df, smdf
  )
}

rand_linkcom_df$i <- as.factor(rand_linkcom_df$i)
## Plot------------


plotBoxSign(rand_linkcom_df, 0.5) +
ylab("Mean number of \ndifferent clusters connected") +
  ggtitle("") + scale_x_discrete(labels = c("Proto-gene", "Non-essential\nGenes", "Essential\nGenes"))+theme(legend.position = 'none')
ggsave("analysis/figures/0821_paper/fig3i.pdf", height = 4, width = 4, unit = "in")
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.