R/regulatoryAnalysis.R

Defines functions find.overlap se expandRange convertBool

## ------------------------------------------------------------------------
library(rtracklayer)
library(GenomicRanges)
library(tidyverse)
library(NucleoATACR)
library(tidyverse)
source('R/add.degree.R')
library(igraph)
library(coin)
library(ggsignif)
##Functions--------------
find.overlap <- function(query.granges,data.ranges, colname='score',FUN){
  data.vector <- c()
  for(i in 1:length(query.granges)){
    data <- query.granges[i]
    data.range <- data.ranges[data.ranges%over%data]
    data.vector <- append(data.vector,FUN(mcols(data.range)[[colname]]))
    names(data.vector)[i] <- mcols(query.granges)$name[i]
  }
  data.vector
}


se <- function(x) sqrt(var(x)/length(x))



setMethod("$", "GRanges", function(x, name) { # {{{
  elementMetadata(x)[, name]
}) # }}}

setMethod("$<-", "GRanges", function(x, name, value) { # {{{
  elementMetadata(x)[[ name ]] <- value
  return(x)
}) # }}}



expandRange = function(x, upstream=0, downstream=200) {
  strand_is_minus = strand(x) == "-"
  on_plus = which(!strand_is_minus)
  on_minus = which(strand_is_minus)
  start(x)[on_plus] = end(x)[on_plus]+1#start(x)[on_plus] - upstream
  end(x)[on_plus] = end(x)[on_plus] + downstream
  end(x)[on_minus] = start(x)[on_minus]-1#end(x)[on_minus] + upstream
  start(x)[on_minus] = start(x)[on_minus] - downstream

  x
}

##Inputs------------
out_width=2 #in inches
out_height=2 #in inches
scer <- import(con='analysis/data/raw_data/ucscgenes.bed')
#setwd('~/Main/pgsNetwork/vignettes/')
#summ <- read.csv('~/Main/SUMMARY_2012.csv')
#summ.gene <- summ$orf_name[summ$category=='gene']
grl<-scer#[scer$name%in%summ.gene]

#pgs <- read_csv('analysis/data/derived_data/filtering/costanzoplusmypcc_0815.csv')
thr <- -0.2 #interaction network threshold

pgs <- pgs$orf_name

pgs.data <- scer[scer$name%in%pgs]

## distance to TSS analysis



grl<-scer
tif <- read_delim('analysis/data/raw_data/S1_TIFs.txt',delim=' ')
tif <- transform(tif, start = ifelse(strand == '-', t3, t5), end = ifelse(strand == '-', t5, t3)) %>%
  mutate(chr=recode(chr,`1`='chrI',`2`='chrII',`3`='chrIII',`4`='chrIV',`5`='chrV',`6`='chrVI',
                    `7`='chrVII',`8`='chrVIII',`9`='chrIX',`10`='chrX',`11`='chrXI',`12`='chrXII',`13`='chrXIII',
                    `14`='chrXIV',`15`='chrXV',`16`='chrXVI'))



tif_rs <- GRanges(seqnames = tif$chr,strand = tif$strand,ranges = IRanges(start=ifelse(tif$strand=='+',tif$start,tif$end),end = ifelse(tif$strand=='+',tif$start,tif$end)),DataFrame(score=tif$ypd,gal=tif$gal))
tif_re <- GRanges(seqnames = tif$chr,strand = tif$strand,ranges = IRanges(start=ifelse(tif$strand=='+',tif$end,tif$start),end =ifelse(tif$strand=='+',tif$end,tif$start)),DataFrame(score=tif$ypd,gal=tif$gal))
gene.prm <- promoters(grl,upstream=200,downstream=0)
gene.prm$tss <- NA
pranges <- GRanges()
for(i in 1:length(gene.prm)){
  p <- gene.prm[i]
  tss_ <- median(start(tif_rs[tif_rs%over%p]))
  gene.prm[i]$tss <- tss_
  if(is.na(tss_)==F){
    pranges <- append(pranges,GRanges(seqnames = seqnames(p),strand=strand(p),ranges=IRanges(start=tss_,end=tss_),DataFrame(name=p$name,score=p$score)))

  }

}
tss_tif <- pranges

pgs.data <- scer[scer$name%in%pgs]


tss <- tss_tif[tss_tif$name%in%setdiff(tss_tif$name,pgs.data$name)]
dist.list <- c()
dist.list.data <- data.frame(name=NA, dist=NA)
for(i in 1:length(pgs.data)){
  tryCatch({
    nn <- GRanges(seqnames = seqnames(pgs.data[i]),ranges = ranges(pgs.data[i]),strand = '-')
    np <- GRanges(seqnames = seqnames(pgs.data[i]),ranges = ranges(pgs.data[i]),strand = '+')
    o <- queryHits(findOverlaps(tss,pgs.data[i],ignore.strand=T))
    if(length(o)!=0){
      #u <- follow(nn,tss)
      #d <- nearest(pgs.data[i],tss,ignore.strand=T)
      #print(i)
      dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[o]))
      dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=0))

    }else{
      #d <- nearest(pgs.data[i],tss,ignore.strand=T)
      u <- precede(np,tss)
      d <- precede(nn,tss)
      du <- distance(tss[u],pgs.data[i],ignore.strand=T)
      dd <- distance(tss[d],pgs.data[i],ignore.strand=T)
      if(du<dd){
        dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
        dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=du))

      }else{
        dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
        dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=dd))
      }
      # dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
      # dist.list.data <- append(dist.list.data,distance(tss[d],pgs.data[i],ignore.strand=T))
      # names(dist.list.data)[i] <-mcols(pgs.data)$name[i]
    }
  }, error = function(e) NULL )
  #dist.list <- append(dist.list,mcols(distanceToNearest(pgs.data[i],tss_park,ignore.strand=T)))
}

dist.list.data <- dist.list.data[-1,]


pgs.df <- data.frame(orf_name=pgs,tss=NA)
for(i in 1:length(pgs)){
  pgs.df$tss[i] <- dist.list.data[dist.list.data$name==as.character(pgs.df$orf_name[i]),2]
}



## ----nucleosome data---------

set.seed(1)

nucs <- readNucs('analysis/data/raw_data/lin_all.nucpos.bed')%>%flank(74,both=T,ignore.strand=T)#"test.nucmap_combined.bed.gz")

## Network data----------

overlappingorfs <- read_csv("analysis/data/raw_data/overlappingorfs.csv")
net.df <- readRDS("analysis/data/derived_data/SGA_data_combined.rds.gz")
strain_ids <- read_csv("analysis/data/derived_data/strain_ids.csv")
nones.exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv")
exp.data <- read_csv("analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")%>% mutate(group=ifelse(`Systematic gene name`%in%pgs,'proto-gene',maincat))#ifelse(`Systematic gene name`%in%nones.exp.data$`Systematic gene name`==F,'essential','nonessential')))


net_df_significant <- filter(net.df, `P-value` <= 0.05 & `Query Strain ID` %in% overlappingorfs$orf_name == FALSE & `Array Strain ID` %in% overlappingorfs$orf_name == FALSE)#& str_detect(`Query allele name`,'supp')==F & str_detect(`Query allele name`,'supp')==F)
pgs_allele <- strain_ids %>%
  filter(`Systematic gene name` %in% pgs) %>%
  dplyr::select(`Allele Gene name`) %>%
  pull()

allnet <- graph_from_data_frame(net_df_significant[,c(2,4,6)],directed=FALSE)
adj <- as_adjacency_matrix(allnet,type='both',attr="Genetic interaction score (ε)") %>% as.matrix()
net_df_significant_sl <- filter(net_df_significant, `Genetic interaction score (ε)` <= thr)#& `Double mutant fitness standard deviation` <= 0.1)
allnet_sl <- graph_from_data_frame(net_df_significant_sl[,c(1,3)],directed=FALSE)#%>% igraph::simplify()
colname <- colnames(nones.exp.data)[1]
#adj_sl <- as_adjacency_matrix(allnet_sl,type='both',attr="Genetic interaction score (ε)") %>% as.matrix()
allnet_sl_deg <- degree(allnet_sl)
pgs_deg <- allnet_sl_deg[names(allnet_sl_deg) %in% pgs]
nones.exp.data <- nones.exp.data %>%
    add.degree(allnet_sl, colname) %>%
    dplyr::mutate(int_density = deg / num,group=ifelse(`Systematic gene name`%in%pgs,'Proto-gene','Nonessential'))
exp.data <- exp.data %>%
  add.degree(allnet_sl, colname) %>%
  dplyr::mutate(int_density = deg / num)

adj[adj==0] <- NA

pgs_min <- map(pgs_allele,function(x) {
  d=min(adj[,x],na.rm=T)
  d
} ) %>% unlist()
names(pgs_min) <- pgs_allele




## ----Graphs----------------------------------------------------------
for(i in 1:nrow(pgs.df)){
  name <- pgs.df$orf_name[i] %>% as.character()
  width <- width(pgs.data[pgs.data$name==name])
  deg <- pgs_deg[names(pgs_deg)==name]
  #                  strain_ids %>%
  # filter(`Systematic gene name` ==name) %>%
  # dplyr::select(`Allele Gene name`) %>%
  # pull()]
  min <- pgs_min[names(pgs_min)==strain_ids %>%
  filter(`Systematic gene name` ==name) %>%
  dplyr::select(`Allele Gene name`) %>%
  pull()]
  pgs.df$deg[i] <- ifelse(length(deg)==0,0,deg)
  pgs.df$min[i] <- ifelse(length(min)==0,NA,min)
  #tss.d <- tss.dist[tss.dist$orf_name==name,]
  pgs.df$tss.bool[i] <- ifelse(pgs.df$tss[i]>200,1,0)
  pgs.df$nucs[i] <- ifelse(countOverlaps(pgs.data[i],nucs)>0,1,0)#pgs.atac$data[pgs.atac$orf_name==name]
  pgs.df$int_dens[i] <- nones.exp.data$int_density[nones.exp.data$`Systematic gene name`==name]
}



convertBool <- function(x) ifelse(x>0,1,0)


p=pvalue(independence_test(nucs~min,data=pgs.df %>% mutate(nucs=as.factor(nucs))%>%unnest()))#p=0.9554
nucs_min <-
  ggplot(pgs.df %>% mutate(nucs=as.factor(nucs))%>%unnest(),aes(x=nucs,y=abs(min),fill=nucs))+
  stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1)) +
  stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+ylab('Strongest negative\n interaction')+xlab('Nucleosome occupied')+
  scale_fill_manual(values = c("#B2DDC9","#AEBDE1"))+scale_x_discrete(labels=c('No','Yes'))+
  theme_me_bar+theme(legend.position = 'none')+
  geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2,
              y_position=0.6,tip_length = 0.01,textsize = 5)

p=pvalue(independence_test(tss.bool~min,data=pgs.df %>% mutate(tss.bool=as.factor(tss.bool))%>%unnest()))#p=0.775
tss_min <- ggplot(pgs.df %>% mutate(tss.bool=as.factor(tss.bool))%>%unnest(),aes(x=tss.bool,y=abs(min),fill=tss.bool))+
  stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1)) +
 # stat_summary(fun.data = n_fun, geom = "text",position=position_dodge(width=1))+
  #stat_summary(aes(label=round(..y..,2)), fun.y=mean, geom="text", size=6,vjust = -0.5)+
  stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+ylab('Strongest negative\n interaction')+xlab('Distance to downstream TSS >200bp')+
  scale_fill_manual(values = c("#B2DDC9","#AEBDE1"))+scale_x_discrete(labels=c('No','Yes'))+
  theme_me_bar+theme(legend.position = 'none')+
  geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2,
               y_position=0.6,tip_length = 0.01,textsize = 5)

# nucs_grid <- cowplot::plot_grid(nucs_deg+xlab(''),nucs_min+xlab(''),ncol=2,align = 'h',axis = 'b')
# nucs_grid_s <- cowplot::add_sub(nucs_grid, "Nucleosome occupied", vpadding=grid::unit(0,"lines"),y=6, x=0.5, vjust=4.5,size=8)
#
# tss_grid <- cowplot::plot_grid(tss_deg+xlab(''),tss_min+xlab(''),ncol=2)
# tss_grid_s <- cowplot::add_sub(tss_grid, 'Distance to downstream TSS >200bp', vpadding=grid::unit(0,"lines"),y=6, x=0.5, vjust=4.5,size=8)

#nucs_tss <-  cowplot::plot_grid(nucs_min, tss_min)
#nucs_tss
#plt <- cowplot::plot_grid(nucs_deg,nucs_min,tss_deg,tss_min)
ggsave('figures/Figure2F.pdf',width=out_width,height=out_height,units = 'in',plot = nucs_min )
ggsave('figures/Figure2G.pdf',width=out_width,height=out_height,units = 'in',plot = tss_min )
#ggsave('analysis/figures/0922_figures/fig1f.pdf',width=out_width,height=out_height,units = 'in',plot = tss_grid_s )
oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.