R/neighborDistanceAnalysis.R

library(tidyverse)
library(igraph)
library(data.table)
library(ggsignif)
library(coin)

source("R/calculate_node_distances.R")
source("R/add.degree.R")
#inputs ------------
costanzo_matrix_data <- fread('analysis/data/raw_data//pcc_ALL.txt',header=T)
costanzo_matrix <- costanzo_matrix_data[-1,-c(1,2)]%>%as.matrix()
rownames(costanzo_matrix) <- colnames(costanzo_matrix)
class(costanzo_matrix) <- "numeric"
adj_res_graph <- graph_from_adjacency_matrix(costanzo_matrix,weighted=TRUE,mode='undirected',diag=F)
g <- delete.vertices(adj_res_graph,which(str_detect(V(adj_res_graph)$name,'supp')|V(adj_res_graph)$name=='pbr1-5013'))#pbr1-5013 is removed bc while it is in the pcc network, it wasn't in the interaction network dataset provided by thecellmap.org
#|V(adj_res_graph)$name%in%strain_ids$`Allele Gene name`[strain_ids$`Systematic gene name`%in%overlappingorfs$orf_name]))#

genomic_neighbors <- read_delim("analysis/data/derived_data/neighbor_genes", delim = ",")
colnames(genomic_neighbors) <- c("orf_name", "up", "down")

exp.data <- read_csv('analysis/data/derived_data/strain_ids_with_experiment_count_all.csv')
ess.names <- exp.data %>% filter(maincat=='essential') %>% select(`Systematic gene name`) %>% pull() %>% unique()
ess.alleles <- exp.data %>% filter(maincat=='essential') %>% select(`Allele Gene name`) %>% pull() %>% unique()
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")


#pccs with neighbors----------
pcc_net <- g
pcc_net_simpl <- pcc_net
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv") %>% add.degree(pcc_net, colnames(.)[2])

V(pcc_net_simpl)$sgd <- map(V(pcc_net_simpl)$name, function(x) strain_ids$`Systematic gene name`[strain_ids$`Allele Gene name` == x])

i <- 1
genomic_neighbors <- genomic_neighbors %>% mutate(updata = NA, downdata = NA, upess = NA, downess = NA, geneess = NA)


s <- map(1:nrow(genomic_neighbors), function(i) ecount(induced_subgraph(pcc_net_simpl, V(pcc_net_simpl)[V(pcc_net_simpl)$sgd %in% (genomic_neighbors[i, ] %>% as.character())])))

for (i in 1:nrow(genomic_neighbors)) {
  # print(i)
  gallele <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd == genomic_neighbors$orf_name[i]]
  allelnamesup <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 2]]
  allelnamesdown <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 3]]
  c <- tidyr::crossing(allelnamesup, gallele) # %>% filter(allelnamesup %in% gallele & allelnamesup1 %in% gallele == F)
  if (nrow(c) == 0 | length(gallele) == 0 | length(allelnamesup) == 0 | length(allelnamesdown) == 0) next

    v <- c()
    for (t in 1:nrow(c)) {
      v <- append(v, as.character(c[t, ]))
    }
    # [1,] %>% as.character
    #E(pcc_net_simpl)[get.edge.ids(pcc_net_simpl, v)]
    genomic_neighbors[i,]$updata <- mean(edge_attr(pcc_net_simpl, "weight", get.edge.ids(pcc_net_simpl, v)))

  c <- tidyr::crossing(allelnamesdown, gallele) # %>% filter(allelnamesdown %in% gallele & allelnamesdown1 %in% gallele == F)
  if (nrow(c) == 0 | length(gallele) == 0 | length(allelnamesdown) == 0) next

    v <- c()
    for (t in 1:nrow(c)) {
      v <- append(v, as.character(c[t, ]))
    }
    # [1,] %>% as.character
    #E(pcc_net_simpl)[get.edge.ids(pcc_net_simpl, v)]
    genomic_neighbors[i,]$downdata <- mean(edge_attr(pcc_net_simpl, "weight", get.edge.ids(pcc_net_simpl, v)))
  genomic_neighbors[i, ]$upess <- ifelse(any(allelnamesup %in% ess.alleles), T, F)
  genomic_neighbors[i, ]$downess <- ifelse(any(allelnamesdown %in% ess.alleles), T, F)
  genomic_neighbors[i, ]$geneess <- ifelse(any(gallele %in% ess.alleles), T, F)
}

#genomic_neighbors[s > 1, ] %>% filter((updata>=0.1&downdata>=0.1))
neighboreffects=genomic_neighbors%>% filter((updata>=0.1|downdata>=0.1)&orf_name%in%strain_ids$`Systematic gene name`)

#distances between genes---------
V(g)$sgd <- map_chr(V(g)$name, function(x) strain_ids$`Systematic gene name`[strain_ids$`Allele Gene name` == x])
g_final_noneigh <- delete.vertices(g,V(g)$sgd%in%neighboreffects$orf_name)

#remove edges with pcc<0.2 and NA/NAN weights
g_final_noneigh=delete.edges(g_final_noneigh, which(E(g_final_noneigh)$weight <.2|is.nan(E(g_final_noneigh)$weight)|is.na(E(g_final_noneigh)$weight)))

g_final_noneigh <- delete.vertices(g_final_noneigh,which(degree(g_final_noneigh)<1 ))


pcc_net <- g_final_noneigh#graph_from_data_frame(df_pcc_filtered, F)
pcc_net_simpl <- pcc_net #%>% igraph::simplify(edge.attr.comb = "mean")

V(pcc_net_simpl)$sgd <- map_chr(V(pcc_net_simpl)$name, function(x) strain_ids$`Systematic gene name`[strain_ids$`Allele Gene name` == x])

E(pcc_net_simpl)$dist <- calculate_node_distances(pcc_net_simpl)

genomic_neighbors <- genomic_neighbors %>% mutate(nw_dist_down = NA, nw_dist_up = NA)
w <-E(pcc_net_simpl)$dist
for (i in 1:nrow(genomic_neighbors)) {

  gallele <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd == genomic_neighbors$orf_name[i]]
  allelnamesup <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 2]]
  allelnamesdown <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 3]]
  c <- tidyr::crossing(allelnamesup, gallele) %>% as.data.frame() # %>% filter(allelnamesup %in% gallele & allelnamesup1 %in% gallele == F)
  if (nrow(c) == 0 | length(gallele) == 0 | length(allelnamesup) == 0 | length(allelnamesdown) == 0) next

  v <- c()
  for (t in 1:nrow(c)) {
    v <- append(v, distances(pcc_net_simpl, c[t, 1], c[t, 2], weights = w))
  }

  genomic_neighbors[i, ]$nw_dist_up <- min(v, na.rm = T)

  c <- tidyr::crossing(allelnamesdown, gallele) %>% as.data.frame() # %>% filter(allelnamesdown %in% gallele & allelnamesdown1 %in% gallele == F)
  if (nrow(c) == 0 | length(gallele) == 0 | length(allelnamesdown) == 0) next

  v <- c()
  for (t in 1:nrow(c)) {
    v <- append(v, distances(pcc_net_simpl, c[t, 1], c[t, 2], weights = w))
  }

  genomic_neighbors[i, ]$nw_dist_down <- min(v, na.rm = T)
}
pgs_in_network <- as.character(pgs$orf_name[pgs$orf_name %in% V(pcc_net_simpl)$sgd])
pgs_nw_dist <- genomic_neighbors %>%
  filter(orf_name %in% pgs_in_network) %>%
  select(nw_dist_down, nw_dist_up) %>%
  drop_na()
pgs_nw_dist <- c(pgs_nw_dist$nw_dist_down, pgs_nw_dist$nw_dist_up)
nones_nw_dist <- genomic_neighbors %>%
  filter(orf_name %in% nones.exp.data$`Systematic gene name` & orf_name %in% pgs_in_network == F) %>%
  select(nw_dist_down, nw_dist_up) %>%
  drop_na()
nones_nw_dist <- c(nones_nw_dist$nw_dist_down, nones_nw_dist$nw_dist_up)

ess_nw_dist <- genomic_neighbors %>%
  filter(orf_name %in% ess.names) %>%
  select(nw_dist_down, nw_dist_up) %>%
  drop_na()
ess_nw_dist <- c(ess_nw_dist$nw_dist_down, ess_nw_dist$nw_dist_up)

  #nw_dist_data <- suppressWarnings(bind_rows(nw_dist_data, data.frame(data = (pgs_nw_dist[is.infinite(pgs_nw_dist) == F]), group = "proto-genes"), data.frame(data = (nones_nw_dist[is.infinite(nones_nw_dist) == F]), group = "nonessential"), data.frame(data = (ess_nw_dist[is.infinite(ess_nw_dist) == F]), group = "essential")))

#nw_dist_data$group <- factor(nw_dist_data$group, levels = c("proto-genes", "nonessential", "essential"))

#genomic_neighbors %>% saveRDS('analysis/data/derived_data/genomic_neighbors_withmypccall.rds')

all_ds <- distances(pcc_net_simpl,weights = w)

for (i in 1:nrow(genomic_neighbors)) {
  gallele <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd == genomic_neighbors$orf_name[i]]
  allelnamesup <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 2]]
  allelnamesdown <- V(pcc_net_simpl)$name[V(pcc_net_simpl)$sgd %in% genomic_neighbors[i, 3]]
  all_ds[rownames(all_ds)%in%gallele, colnames(all_ds)%in%allelnamesdown] <- NA
  all_ds[rownames(all_ds)%in%gallele, colnames(all_ds)%in%allelnamesup] <- NA
  #genomic_neighbors[i, ]$nw_dist_down <- min(v, na.rm = T)
}
upper <- upper.tri(all_ds)

group_name <- as.character(pgs$orf_name)
dd <- pgs_nw_dist
other <- all_ds[upper&rownames(all_ds)%in%strain_ids$`Allele Gene name`[strain_ids$`Systematic gene name`%in%group_name]]
nw_dist_data_comparewneigh_pgs <- bind_rows(data.frame(group='neighbors',data=dd[is.na(dd)==F&is.infinite(dd)==F]),data.frame(group='everything',data=other[is.infinite(other)==F&is.na(other)==F])) %>% mutate(type='proto-gene')

dd <- nones_nw_dist
other <- all_ds[upper&rownames(all_ds)%in%nones.exp.data$`Allele Gene name` & rownames(all_ds) %in% strain_ids$`Allele Gene name`[strain_ids$`Systematic gene name`%in%pgs_in_network] == F]
nw_dist_data_comparewneigh_nones <- bind_rows(data.frame(group='neighbors',data=dd[is.na(dd)==F&is.infinite(dd)==F]),data.frame(group='everything',data=other[is.infinite(other)==F&is.na(other)==F])) %>% mutate(type='nonessential')

dd <- ess_nw_dist
other <- all_ds[upper&rownames(all_ds)%in%ess.alleles]
nw_dist_data_comparewneigh_ess <- bind_rows(data.frame(group='neighbors',data=dd[is.na(dd)==F&is.infinite(dd)==F]),data.frame(group='everything',data=other[is.infinite(other)==F&is.na(other)==F])) %>% mutate(type='essential')
#nw_dist_data_comparewneigh$group <- factor(nw_dist_data_comparewneigh$group,levels = c('neighbors','everything'))
#png('analysis/figures/boxtry.png')
comb_nw_data <- bind_rows(nw_dist_data_comparewneigh_pgs,nw_dist_data_comparewneigh_nones,nw_dist_data_comparewneigh_ess)
comb_nw_data$group <- factor(comb_nw_data$group,levels = c('neighbors','everything'))
comb_nw_data$type <- factor(comb_nw_data$type,levels=c('proto-gene','nonessential','essential'))

p_pgs_neigh <- pvalue(independence_test(group~data,data=comb_nw_data %>% filter(type=='proto-gene')))
p_nones_neigh <- pvalue(independence_test(group~data,data=comb_nw_data %>% filter(type=='nonessential')))
p_ess_neigh <- pvalue(independence_test(group~data,data=comb_nw_data %>% filter(type=='essential')))


#save.image('analysis/figures/0708pres/neighdistimage.RData')
## Plots -----------
plt <- ggplot(comb_nw_data, aes(x=type,y=data,fill=group,group = as.factor(group))) +
    stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1)) +
    stat_summary(fun.data = mean_se,width=0.3, geom = "errorbar",position = position_dodge(width=1))+#ggtitle('new_pcc_values')
    ylab('Network layout weighted distance')+xlab('')+
  scale_x_discrete(labels=c('Proto-genes','Nonessential\nGenes','Essential\nGenes'))+theme_me_bar
ggsave('figures/Figure2E.pdf',plot=plt,width = 2,height = 2)
#P values should be as following:
#> p_ess_neigh
#[1] 0.005665025
#> p_nones_neigh
#[1] 0.8516699
#> p_pgs_neigh
#[1] 0.4677611
#>
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.