R/clusteringAnalysis_combined.R

Defines functions plotBoxSign

#clustering analysis
library(plyr)
library(igraph)
library(tidyverse)

## Funcs-------------
library(ggsignif)
source("R/add.degree.R")
source("R/sample.nones.R")
# saveRDS(cl_mean_df_list, "../anne/mcl/cl_mean_df_list.rds")
# saveRDS(clusterdf, "../anne/mcl/clusterdf.rds")
# rand_linkcom_df <- readRDS('../anne/linkcom/rand_linkcom_withdegree_df.rds')

plotBoxSign <- function(data, itr, signf = F) {
  data$group <- factor(data$group, levels = c("proto-gene", "nonessential", "essential"))
  data <- data %>% filter(i == itr)
  p12 <- 2 * sum(data$data[data$group == "proto-gene"] < data$data[data$group == "nonessential"]) / length(data$data[data$group == "proto-gene"])
  p13 <- 2 * sum(data$data[data$group == "proto-gene"] < data$data[data$group == "essential"]) / length(data$data[data$group == "proto-gene"])
  p23 <- 2 * sum(data$data[data$group == "nonessential"] < data$data[data$group == "essential"]) / length(data$data[data$group == "proto-gene"])
  if (signf) {
    ggplot(data, aes(x = group, y = data, group = group, fill = group)) + geom_boxplot() + scale_x_discrete(labels = c("Proto-gene", "Non-essential Genes", "Essential Genes")) + theme_bw() +
      ylab("Mean") + xlab("") + theme_me_bar + scale_fill_manual(name = "", values = c("#1CBDC2", "#EF3D23", "#FAA51A")) + xlab("") + ylab("Mean number of different clusters") +
      geom_signif(
        annotations = c(formatC(p12, digits = 3), formatC(p13, digits = 3), formatC(p23, digits = 3)),
        y_position = c(max(data$data) + 1, max(data$data) + 2, max(data$data) + 3), xmin = c(1, 1, 2), xmax = c(2, 3, 3)
      ) + ggtitle(itr)
  } else {
    ggplot(data, aes(x = group, y = data, group = group, fill = group)) + geom_boxplot() + scale_x_discrete(labels = c("Proto-gene", "Non-essential Genes", "Essential Genes")) + theme_bw() +
      ylab("Mean") + xlab("") + theme_me_bar + scale_fill_manual(name = "", labels = element_blank(), values = c("#1CBDC2", "#EF3D23", "#FAA51A")) + xlab("") + ylab("Mean number of different clusters") +
      ggtitle(itr)
  }
}

#Input ------
allele.frame <- readRDS("analysis/data/derived_data/df_different_alleles.rds")

set.seed(1)
# allele.frame <- readRDS('analysis/data/derived_data/df_different_alleles.rds')
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv") # %>% add.degree(pcc_net, colnames(.)[2])
exp.number.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
essentials <-  exp.number.data$`Allele Gene name`[exp.number.data$maincat=='essential']

# deg.pcc <- degree(pcc_net)
linkcom_df <- data.frame()
rand_linkcom_df <- data.frame()
colname <- colnames(nones.exp.data)[2]
pgs.alleles <- strain_ids %>%
  filter(`Systematic gene name` %in% pgs$orf_name) %>%
  select(`Allele Gene name`) %>%
  pull()

#linkcom analysis------------

itr = 0.5
  # print(itr)
  # fname <- paste0("../anne/linkcom/mypcc/network.tsv_", itr, ".el")
fname <- paste0("analysis/data/derived_data/linkcom_res_", itr, ".el")

pcc <- suppressMessages(read_delim(fname, delim = " ", col_names = F))
largeclusters <- pcc %>%
  group_by(X3) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  filter(n > 2) %>%
  dplyr::select(X3)

pcc_clfiltered <- pcc # %>% filter(X3 %in% largeclusters$X3)
pcc_net <- graph_from_edgelist(as.matrix(pcc_clfiltered[c(1, 2)]), directed = F)
E(pcc_net)$linkcom <- pcc_clfiltered[, 3] %>% pull()
nums <- pcc %>%
  group_by(X3) %>%
  dplyr::summarise(n = dplyr::n())
# pgs.alleles <- unlist(allele.frame$different.alleles[allele.frame$orf_name %in% pgs$orf_name])
linkcom_df <- bind_rows(linkcom_df, data.frame(
  i = itr,
  numberofclusters = nrow(nums),
  cl1 = sum(nums$n == 1),
  cl2 = sum(nums$n == 2),
  cl3 = sum(nums$n == 3),
  cl4 = sum(nums$n == 4),
  cl5 = sum(nums$n > 4)
))
# pcc_net_neigh <- neighborhood(pcc_net, order = 1,nodes = V(pcc_net)[V(pcc_net)$name %in% pgs.alleles]) %>% map(function(x) induced_subgraph(pcc_net, x))
nones.exp.data <- nones.exp.data %>% add.degree(pcc_net, colnames(.)[2])
exp.number.data.filtered <- exp.number.data %>%
  add.degree(pcc_net, colname) %>%
  mutate(int_density = deg / num)
different_clusters <- map(V(pcc_net)$name, function(x) length(unique(incident_edges(pcc_net, V(pcc_net)[V(pcc_net)$name == x])[[1]]$linkcom)))
# numberofedges=map(V(pcc_net)$name, function(x) length(incident_edges(pcc_net,V(pcc_net)[V(pcc_net)$name==x])[[1]]$linkcom))
names(different_clusters) <- V(pcc_net)$name



# deg_2 <- deg.pcc[deg.pcc>1]
rand.s <- c()
ess.rand.s <- c()
pgs.s <- c()
# pgs.s <- different_clusters[names(different_clusters)%in%pgs.alleles] %>% unlist()#[pgs.alleles%in%names(deg_2)]
for (i in 1:100) {
  pgs.s <- append(pgs.s, mean(different_clusters[names(different_clusters) %in% pgs.alleles] %>% unlist())) # [pgs.alleles%in%names(deg_2)]

  s <- sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name], nones.exp.data %>% filter(`Allele Gene name` %in% V(pcc_net)$name), colname, dist = T, T, T)
  rand.s <- append(rand.s, mean(different_clusters[names(different_clusters) %in% s] %>% unlist())) # [s%in%names(deg_2)]
  # ess.rand.s <- append(ess.rand.s, mean(different_clusters[names(different_clusters) %in% (ess$X1[ess$X1 %in% V(pcc_net)$name] %>% sample(length(pgs.alleles[pgs.alleles %in% V(pcc_net)$name])))] %>% unlist()))
  # hst <- deg.pcc[names(deg.pcc)%in%pgs.alleles] %>% hist(plot=F,breaks=-1:max(.))
  ess.s <- sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name], exp.number.data.filtered %>% filter(!!sym(colname) %in% V(pcc_net)$name & (!!sym(colname) %in% essentials) | (!!sym(colname) %in% pgs.alleles[pgs.alleles %in% V(pcc_net)$name])), colname, array = FALSE, dist = T)

  ess.rand.s <- append(ess.rand.s, different_clusters[names(different_clusters) %in% ess.s] %>% unlist() %>% mean(na.rm = T))
}
smdf <- bind_rows(
  data.frame(group = "proto-gene", data = pgs.s, i = itr),
  data.frame(group = "nonessential", data = rand.s, i = itr),
  data.frame(group = "essential", data = ess.rand.s, i = itr)
)
# print(smdf)
rand_linkcom_df <- bind_rows(
  rand_linkcom_df, smdf
)


rand_linkcom_df$i <- as.factor(rand_linkcom_df$i)



#mcl--------
# induced.subgraph(pcc_net,mclres[str_detect(mclres,'ybr196c-a')] %>% str_split('\t') %>% unlist) %>% plot()
# mclres[str_detect(mclres,'ybr196c-a')] %>% str_split('\t') %>% unlist %>% write
# name <- 'ybr196c-a'
neighborclusters <- function(pcc_net, name, clusterdata) {
  neigh <- neighbors(pcc_net, name)$name
  map(neigh, function(x) which(str_detect(clusterdata, x))) %>%
    unlist() %>%
    unique()
}
neighborclusters <- function(pcc_net, name, clusterdata) {
  neigh <- neighbors(pcc_net, name)$name
  V(pcc_net)[neigh]$memb %>% unique()
  # map(neigh, function(x) which(str_detect(clusterdata, x))) %>%
  #   unlist() %>%
  #   unique()
}
colname <- colnames(nones.exp.data)[2]
cl_mean_df_list <- data.frame()
clusterdf <- data.frame() # i=seq(14, 30, 2),numberofclusters=NA,cl1=NA,cl2=NA,cl3=NA,cl4=NA,cl5=NA)
vv=as.character(seq(1.2,5.0,0.2))
vv[c(5,10,15,20)] <- c("2.0","3.0","4.0","5.0")
#for (itr in vv) {


#for(itr in seq(12,26,2)){
  #clusterdata <- readLines(paste0("~/Main/anne/mcl/out.data1.mci.I12.labels.out", as.character(itr)))
itr=26
clusterdata <- readLines(paste0("analysis/data/derived_data/mcl_res_I",itr,".labels.out"))
ll <- map(clusterdata, function(x) length(str_split(x, "	") %>% unlist()))
clusterdf <- data.frame(
  i = itr,
  numberofclusters = length(clusterdata),
  cl1 = sum(ll == 1),
  cl2 = sum(ll == 2),
  cl3 = sum(ll == 3),
  cl4 = sum(ll == 4),
  cl5 = sum(ll > 4)
) %>% bind_rows(clusterdf)
#clusterdata <- clusterdata[ll>2]
cldata <- map(clusterdata,function(x) unlist(str_split(x,'\t')))
V(pcc_net)$memb <- V(pcc_net)$name %>% map(function(x) map(1:length(cldata), function(i) any(cldata[[i]]==x)) %>% unlist() %>% which() ) %>% unlist()
pgs_cluster_means <- c()
rand_cluster_means <- c()
rand_ess_cl_means <- c()
deg.pcc <- degree(pcc_net)
nones.exp.data <- nones.exp.data %>%
  add.degree(pcc_net, colname) %>%
  mutate(int_density = deg / num)
exp.number.data.filtered <- exp.number.data %>%
  add.degree(pcc_net, colname) %>%
  mutate(int_density = deg / num)

for (i in 1:100) {
  pgs_cluster_means <- append(pgs_cluster_means, pgs.alleles[pgs.alleles %in% V(pcc_net)$name] %>% map(function(x) length(neighborclusters(pcc_net, x, clusterdata))) %>% unlist() %>% mean())

  rand_cluster_means <- append(rand_cluster_means, sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name], nones.exp.data %>% filter(`Allele Gene name` %in% V(pcc_net)$name), colnames(nones.exp.data)[2], dist = T) %>% map(function(x) length(neighborclusters(pcc_net, x, clusterdata))) %>% unlist() %>% mean())

  #hst <- deg.pcc[names(deg.pcc)%in%pgs.alleles] %>% hist(plot=F,breaks=-1:max(.))
  ess.s <-  ess.s <- sample.nones(pgs.alleles[pgs.alleles %in% V(pcc_net)$name],exp.number.data.filtered %>% filter(!!sym(colname)%in%V(pcc_net)$name& (!!sym(colname)%in%essentials)|(!!sym(colname)%in%pgs.alleles[pgs.alleles %in% V(pcc_net)$name])),colname,array=FALSE,dist=T)
  rand_ess_cl_means <- append(rand_ess_cl_means, ess.s %>% map(function(x) length(neighborclusters(pcc_net, x, clusterdata))) %>% unlist() %>% mean())
}


cl_mean_df <- bind_rows(
  data.frame(group = "proto-gene", data = pgs_cluster_means, i = itr),
  data.frame(group = "nonessential", data = rand_cluster_means, i = itr),
  data.frame(group = "essential", data = rand_ess_cl_means, i = itr)
)
cl_mean_df_list <- bind_rows(cl_mean_df_list, cl_mean_df)
# boxplot(pgs_cluster_means, rand_cluster_means, rand_ess_cl_means)
# title(itr)
# sum(pgs_cluster_means < rand_cluster_means)
#}
## Plot------------


plotBoxSign(rand_linkcom_df, 0.5) +
  ylab("Mean number of \ndifferent clusters connected") +
  ggtitle("") + scale_x_discrete(labels = c("Proto-gene", "Non-essential\nGenes", "Essential\nGenes"))+theme(legend.position = 'none')
ggsave("figures/Figure4H.pdf", height = 4, width = 4, unit = "in")

plotBoxSign(cl_mean_df_list, 26,F)+ylab('Mean number of \ndifferent clusters connected')+ggtitle('')+scale_x_discrete(labels =c("Proto-gene","Non-essential\nGenes","Essential\nGenes"))+theme(legend.position = 'none')
ggsave('figures/Figure4I.pdf',height = 4,width = 4,unit='in')
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.