knitr::opts_chunk$set(echo = FALSE,message = FALSE,warning = FALSE)
library(rtracklayer)
library(AnnotationHub)
library(Rsamtools)
library(GenomicRanges)
library(Gviz)
library(GenomicFeatures)
library(biomaRt)
library(openxlsx)
library(tidyverse)
source('~/Main/anne/epigenetics/nucleosome_analysis/functions.R')
scer <- import(con='~/Downloads/ucscgenes.bed')
setwd('~/Main/pgsNetwork/vignettes/')
summ <- read.csv('~/Main/SUMMARY_2012.csv')
summ.gene <- summ$orf_name[summ$category=='gene']
grl<-scer[scer$name%in%summ.gene]
pgs <- read_csv('../analysis/data/derived_data/filtering/costanzoplusmypcc_0815.csv')
pgs <- pgs$orf_name

pgs.data <- scer[scer$name%in%pgs]

tif <- read_delim('~/Main/anne/epigenetics/tif/S1_TIFs.txt',delim=' ')
#tif <- read_delim('../tif/S2_tcd_mTIFAnno.txt',delim=' ')
tif <- transform(tif, start = ifelse(strand == '-', t3, t5), end = ifelse(strand == '-', t5, t3)) %>%
  mutate(chr=recode(chr,`1`='chrI',`2`='chrII',`3`='chrIII',`4`='chrIV',`5`='chrV',`6`='chrVI',
                    `7`='chrVII',`8`='chrVIII',`9`='chrIX',`10`='chrX',`11`='chrXI',`12`='chrXII',`13`='chrXIII',
                    `14`='chrXIV',`15`='chrXV',`16`='chrXVI'))


tif_ <- read_delim('~/Main/anne/epigenetics/tif/S1_TIFs.txt',delim=' ',col_names = paste0("V",seq_len(12)))
colnames(tif_)[1:8] <- tif_[1,1:8]
colnames(tif_)[11] <- 'orf_name'
tif_ <- tif_[-1,] %>% filter(is.na(orf_name)==F)



pgs.upstream <- promoters(pgs.data,upstream=300,downstream=0)
pgs.downstream <- expandRange(pgs.data,upstream=0,downstream=200)
pgs.tifreads <- data.frame(orf_name=pgs.data$name,ypd=NA,gal=NA,count=NA)
for(i in 1:length(pgs.data)){

  pgs.tifreads$ypd[i] <- mean(as.numeric(c(tif_$ypd[tif_$orf_name==pgs.data$name[i]])))
  pgs.tifreads$gal[i] <- mean(as.numeric(c(tif_$gal[tif_$orf_name==pgs.data$name[i]])))
  pgs.tifreads$count[i] <- nrow(tif_[tif_$orf_name==pgs.data$name[i],])
}
pgs.tifreads
library(rtracklayer)
library(AnnotationHub)
library(Rsamtools)
library(GenomicRanges)
library(Gviz)
library(GenomicFeatures)
library(biomaRt)
library(openxlsx)


scer <- import(con='~/Downloads/ucscgenes.bed')

tss_park <- import(con='~/Main/anne/epigenetics/tif/park.bed',format = 'bed')

summ <- read.csv('~/Main/SUMMARY_2012.csv')
summ.gene <- summ$orf_name[summ$category=='gene']
grl<-scer[scer$name%in%summ.gene]
tif <- read_delim('~/Main/anne/epigenetics/tif/S1_TIFs.txt',delim=' ')
#tif <- read_delim('../tif/S2_tcd_mTIFAnno.txt',delim=' ')
tif <- transform(tif, start = ifelse(strand == '-', t3, t5), end = ifelse(strand == '-', t5, t3)) %>%
  mutate(chr=recode(chr,`1`='chrI',`2`='chrII',`3`='chrIII',`4`='chrIV',`5`='chrV',`6`='chrVI',
                    `7`='chrVII',`8`='chrVIII',`9`='chrIX',`10`='chrX',`11`='chrXI',`12`='chrXII',`13`='chrXIII',
                    `14`='chrXIV',`15`='chrXV',`16`='chrXVI'))



tif_rs <- GRanges(seqnames = tif$chr,strand = tif$strand,ranges = IRanges(start=ifelse(tif$strand=='+',tif$start,tif$end),end = ifelse(tif$strand=='+',tif$start,tif$end)),DataFrame(score=tif$ypd,gal=tif$gal))
tif_re <- GRanges(seqnames = tif$chr,strand = tif$strand,ranges = IRanges(start=ifelse(tif$strand=='+',tif$end,tif$start),end =ifelse(tif$strand=='+',tif$end,tif$start)),DataFrame(score=tif$ypd,gal=tif$gal))
gene.prm <- promoters(grl,upstream=200,downstream=0)
gene.prm$tss <- NA
pranges <- GRanges()
for(i in 1:length(gene.prm)){
  p <- gene.prm[i]
  tss_ <- median(start(tif_rs[tif_rs%over%p]))
  gene.prm[i]$tss <- tss_
  if(is.na(tss_)==F){
    pranges <- append(pranges,GRanges(seqnames = seqnames(p),strand=strand(p),ranges=IRanges(start=tss_,end=tss_),DataFrame(name=p$name,score=p$score)))

  }

}
tss_tif <- pranges

park.diff <- setdiff(tss_park$name,tss_tif$name)
pgs.data <- scer[scer$name%in%pgs]

tss_comb <- append(tss_tif,tss_park)

tss <- tss_tif[tss_tif$name%in%setdiff(tss_tif$name,pgs.data$name)]#tss_comb[tss_comb$name%in%setdiff(tss_comb$name,pgs.data$name)]
dist.list <- c()
dist.list.data <- data.frame(name=NA, dist=NA)
for(i in 1:length(pgs.data)){
  tryCatch({
    nn <- GRanges(seqnames = seqnames(pgs.data[i]),ranges = ranges(pgs.data[i]),strand = '-')
    np <- GRanges(seqnames = seqnames(pgs.data[i]),ranges = ranges(pgs.data[i]),strand = '+')
    o <- queryHits(findOverlaps(tss,pgs.data[i],ignore.strand=T))
    if(length(o)!=0){
      #u <- follow(nn,tss)
      #d <- nearest(pgs.data[i],tss,ignore.strand=T)
      print(i)
      dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[o]))
      dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=0))

    }else{
      #d <- nearest(pgs.data[i],tss,ignore.strand=T)
      u <- precede(np,tss)
      d <- precede(nn,tss)
      du <- distance(tss[u],pgs.data[i],ignore.strand=T)
      dd <- distance(tss[d],pgs.data[i],ignore.strand=T)
      if(du<dd){
        dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
        dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=du))

      }else{
        dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
        dist.list.data <- rbind(dist.list.data,data.frame(name=mcols(pgs.data)$name[i],dist=dd))
      }
      # dist.list[[ mcols(pgs.data[i])$name]] <- list(list(pgs.data[i],tss[d]))
      # dist.list.data <- append(dist.list.data,distance(tss[d],pgs.data[i],ignore.strand=T))
      # names(dist.list.data)[i] <-mcols(pgs.data)$name[i]
    }
  }, error = function(e) NULL )
  #dist.list <- append(dist.list,mcols(distanceToNearest(pgs.data[i],tss_park,ignore.strand=T)))
}

dist.list.data <- dist.list.data[-1,]
rhee.all <- Sys.glob('~/Main/anne/epigenetics/Rhee_2011/*.bed')

rhee.dat <- lapply(rhee.all, import) %>% unlist() %>% GenomicRangesList() %>% unlist() 
rhee.pgs <- c()
j=0
for( i in 1:length(pgs)){
  c <- countOverlaps(pgs.data[i],rhee.dat)
  if(c>0){
    #print(paste(c,mcols(pgs.data)$name[i],i))
    rhee.pgs <- append(rhee.pgs,pgs.data$name[i])
    j <- j+1
  }
}

pgs.df <- data.frame(orf_name=pgs,tss=NA)
for(i in 1:length(pgs)){
  pgs.df$tss[i] <- dist.list.data[dist.list.data$name==as.character(pgs.df$orf_name[i]),2]   
}

Proto-gene

library(openxlsx)
library(NucleoATACR)
library(tidyverse)
library(dplyr)
library(GenomicRanges)
library(BBmisc)
library(rtracklayer)
library(RColorBrewer)
source('~/Main/anne/epigenetics/nucleosome_analysis/functions.R')
set.seed(1)
scer <- import(con='~/Downloads/ucscgenes.bed')

pgs.data <- scer[scer$name%in%pgs]


nucs <- readNucs('~/Main/anne/epigenetics/nucleosome_occupancy/lin_all.nucpos.bed')%>%flank(74,both=T,ignore.strand=T)#"test.nucmap_combined.bed.gz")
#tif.data <- read_tsv('../../../../pgs/tif_count.tsv') #%>% arrange(order(orf_name))
#tif.data <- tif.data[order(tif.data$orf_name),]

#nfrs <- import('~/Main/anne/epigeneticsnucleosome_occupancy/lin_all.occ.bw')#"test.nfrpos.bed.gz")
source('../R/add.degree.R')

overlappingorfs <- read_csv("../analysis/data/raw_data/overlappingorfs.csv")
net.df <- readRDS("../analysis/data/derived_data/SGA_data_combined.rds.gz")
strain_ids <- read_csv("../analysis/data/derived_data/strain_ids.csv")
allele.frame <- readRDS("../analysis/data/derived_data/df_different_alleles.rds")
nones.exp.data <- read_csv("../analysis/data/derived_data/strain_ids_with_experiment_count_nonessential.csv")
exp.data <- read_csv("../analysis/data/derived_data/strain_ids_with_experiment_count_all.csv")%>% mutate(group=ifelse(`Systematic gene name`%in%pgs,'proto-gene',maincat))#ifelse(`Systematic gene name`%in%nones.exp.data$`Systematic gene name`==F,'essential','nonessential')))


net_df_significant <- filter(net.df, `P-value` <= 0.05 & `Query Strain ID` %in% overlappingorfs$orf_name == FALSE & `Array Strain ID` %in% overlappingorfs$orf_name == FALSE)#& str_detect(`Query allele name`,'supp')==F & str_detect(`Query allele name`,'supp')==F)
pgs_allele <- strain_ids %>%
  filter(`Systematic gene name` %in% pgs) %>%
  dplyr::select(`Allele Gene name`) %>%
  pull()
library(igraph)
library(coin)
thr <- -0.2
allnet <- graph_from_data_frame(net_df_significant[,c(2,4,6)],directed=FALSE)
adj <- as_adjacency_matrix(allnet,type='both',attr="Genetic interaction score (ε)") %>% as.matrix()
net_df_significant_sl <- filter(net_df_significant, `Genetic interaction score (ε)` <= thr)#& `Double mutant fitness standard deviation` <= 0.1)
allnet_sl <- graph_from_data_frame(net_df_significant_sl[,c(1,3)],directed=FALSE)#%>% igraph::simplify()
colname <- colnames(nones.exp.data)[1]
#adj_sl <- as_adjacency_matrix(allnet_sl,type='both',attr="Genetic interaction score (ε)") %>% as.matrix()
allnet_sl_deg <- degree(allnet_sl)
pgs_deg <- allnet_sl_deg[names(allnet_sl_deg) %in% pgs]
nones.exp.data <- nones.exp.data %>%
    add.degree(allnet_sl, colname) %>%
    dplyr::mutate(int_density = deg / num,group=ifelse(`Systematic gene name`%in%pgs,'Proto-gene','Nonessential'))
exp.data <- exp.data %>%
  add.degree(allnet_sl, colname) %>%
  dplyr::mutate(int_density = deg / num)
# pgs_min <- map(pgs_allele[pgs_allele%in%V(allnet_sl)$name],function(x) {
#   d=adj_sl[,x]
#   d[d <= thr]
# } )
#names(pgs_min) <- pgs_allele[pgs_allele%in%V(allnet_sl)$name]
adj[adj==0] <- NA

pgs_min <- map(pgs_allele,function(x) {
  d=min(adj[,x],na.rm=T)
  d
 #d[d < thr]#%>%as.numeric()
} ) %>% unlist()
names(pgs_min) <- pgs_allele
#adj[,'ybl008w-a']
ggplot(nones.exp.data %>% filter(int_density>=0),aes(x=group,y=int_density))+
  stat_summary(fun.y = mean, geom = "point",position = position_dodge(width=1)) +
    stat_summary(fun.data = mean_se, geom = "errorbar",position = position_dodge(width=1))
independence_test(group~int_density,data = nones.exp.data %>% mutate(group=as.factor(group)) %>% filter(int_density>=0))

``{R eval=FALSE} library(coin) for(i in 1:nrow(pgs.df)){ name <- pgs.df$orf_name[i] %>% as.character() width <- width(pgs.data[pgs.data$name==name]) deg <- pgs_deg[names(pgs_deg)==name] # strain_ids %>% # filter(Systematic gene name==name) %>% # dplyr::select(Allele Gene name) %>% # pull()] min <- pgs_min[names(pgs_min)==strain_ids %>% filter(Systematic gene name==name) %>% dplyr::select(Allele Gene name) %>% pull()] pgs.df$deg[i] <- ifelse(length(deg)==0,0,deg) pgs.df$min[i] <- ifelse(length(min)==0,NA,min) #tss.d <- tss.dist[tss.dist$orf_name==name,] pgs.df$tss.bool[i] <- ifelse(pgs.df$tss[i]>200,1,0) pgs.df$nucs[i] <- ifelse(countOverlaps(pgs.data[i],nucs)>0,1,0)#pgs.atac$data[pgs.atac$orf_name==name] pgs.df$int_dens[i] <- nones.exp.data$int_density[nones.exp.data$Systematic gene name`==name] }

library(ggpubr) library(ggsignif)

ggplot(pgs.df%>%unnest(),aes(x=tss,y=abs(min)))+geom_point()+geom_smooth(method=lm, se=FALSE)+stat_cor(method = "pearson")+

ylab('negative interaction strength')+xlab('distance to tss')+theme_bw()

ggsave('../analysis/figures/0805_paper/negative02_distancetss_scatter.pdf')

                                    #+scale_x_log10()+scale_y_log10()

convertBool <- function(x) ifelse(x>0,1,0)

pdf('../analysis/figures/0805_paper/nfr_degree_nosdthr_nosdthr.pdf')

p=pvalue(independence_test(nucs~deg,data=pgs.df %>% mutate(nucs=as.factor(nucs)))) nucs_deg <- ggplot(pgs.df %>% mutate(nucs=as.factor(nucs)),aes(x=nucs,y=deg))+ stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1),fill='lightblue') + stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+theme_minimal()+ylab('Number of synthetic\nlethal interactions')+xlab('Nucleosome occupied')+ scale_x_discrete(labels=c('No','Yes'))+theme(axis.title = element_text(size=8),axis.text = element_text(size=6),panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank())+coord_cartesian(ylim = c(0,18))#+ geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2, y_position=17,tip_length = 0.005,textsize = 5)

dev.off()

pdf('../analysis/figures/0805_paper/nfr_min_nosdthr.pdf')

p=pvalue(independence_test(nucs~min,data=pgs.df %>% mutate(nucs=as.factor(nucs))%>%unnest())) nucs_min <- ggplot(pgs.df %>% mutate(nucs=as.factor(nucs))%>%unnest(),aes(x=nucs,y=abs(min)))+ stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1),fill='lightblue') + stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+theme_minimal()+ylab('Strongest negative\n interaction')+xlab('Nucleosome occupied')+ scale_x_discrete(labels=c('No','Yes'))+theme(axis.title = element_text(size=8),axis.text = element_text(size=6),panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank())+coord_cartesian(ylim = c(0,0.8))#+ geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2, y_position=0.75,tip_length = 0.05,textsize = 5)

nucs_grid <- cowplot::plot_grid(nucs_deg,nucs_min,nrow=2) ggsave('../analysis/figures/0812_poster/nfr_data_03.pdf',nucs_grid,width=3,height=6,units = 'in')

dev.off()

pdf('../analysis/figures/0805_paper/tss_degree_nosdthr.pdf')

p=pvalue(independence_test(tss.bool~deg,data=pgs.df %>% mutate(tss.bool=as.factor(tss.bool))))

tss_deg <- ggplot(pgs.df %>% mutate(tss.bool=as.factor(tss.bool)),aes(x=tss.bool,y=deg))+ stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1),fill='lightblue') + stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+theme_minimal()+ylab('Number of synthetic\nlethal interactions')+xlab('Distance to downstream \nTSS >200bp')+ scale_x_discrete(labels=c('No','Yes'))+theme(axis.title = element_text(size=8),axis.text = element_text(size=6),panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank())+coord_cartesian(ylim = c(0,18))#+ geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2, y_position=17,tip_length = 0.005,textsize = 5)

dev.off()

pdf('../analysis/figures/0805_paper/tss_negintstr_02nosdthr.pdf')

p=pvalue(independence_test(tss.bool~min,data=pgs.df %>% mutate(tss.bool=as.factor(tss.bool))%>%unnest())) tss_min <- ggplot(pgs.df %>% mutate(tss.bool=as.factor(tss.bool))%>%unnest(),aes(x=tss.bool,y=abs(min)))+ stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1),fill='lightblue') + # stat_summary(fun.data = n_fun, geom = "text",position=position_dodge(width=1))+ #stat_summary(aes(label=round(..y..,2)), fun.y=mean, geom="text", size=6,vjust = -0.5)+ stat_summary(fun.data = mean_se,width=0.5, geom = "errorbar",position = position_dodge(width=1))+theme_minimal()+ylab('Strongest negative\n interaction')+xlab('Distance to downstream \nTSS >200bp')+ scale_x_discrete(labels=c('No','Yes'))+theme(axis.title = element_text(size=8),axis.text = element_text(size=6),panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank())+coord_cartesian(ylim = c(0,0.8))#+ geom_signif(annotations=c(paste0("p=",formatC(p,digits=3))),xmin=1,xmax=2, y_position=0.75,tip_length = 0.05,textsize = 5)

tss_grid <- cowplot::plot_grid(tss_deg,tss_min,nrow=2) cowplot::plot_grid(nucs_deg,nucs_min,tss_deg,tss_min) ggsave('../analysis/figures/0816_paper/tssnucs_data.pdf',width=4,height=4,units = 'in')

dev.off()

map(pgs.df$min,length)%>%unlist library(coin) independence_test(tss.bool~min,data=pgs.df %>% mutate(tss.bool=as.factor(tss.bool))) independence_test(nucs~min,data=pgs.df %>% mutate*nucs=as.factor(nucs)))

alldata.bool <- as_tibble(pgs.df) %>% mutate_if(is.numeric,convertBool)

fisher.test(with(alldata.bool,table(nucs,int_dens))) fisher.test(with(alldata.bool,table(tss.bool,int_dens))) with(alldata.bool,table(tss,deg)) ft

```r
net_df_significant <- net_df_significant %>% mutate(group=ifelse(`Query Strain ID`%in%pgs|`Array Strain ID`%in%pgs,'proto-gene',ifelse(`Query Strain ID`%in%nones.exp.data$`Systematic gene name`==F&`Array Strain ID`%in%nones.exp.data$`Systematic gene name`==F,'essential','nonessential')))

ggplot(net_df_significant,aes(y=`Double mutant fitness standard deviation`,x=`Genetic interaction score (ε)`))+geom_point(aes(color=group))+xlim(NA,-0.2)+ylim(0,0.2)

ggplot(net_df_significant%>%filter(`Genetic interaction score (ε)`<=-0.35),aes(y=`Double mutant fitness standard deviation`,x=group))+
  stat_summary(fun.y = mean, geom = "point",position = position_dodge(width=1)) +
  stat_summary(fun.data = mean_se, geom = "errorbar",position = position_dodge(width=1))

ggplot(net_df_significant%>%filter(`Genetic interaction score (ε)`<=-0.2),aes(x=`Double mutant fitness standard deviation`,color=group,y=..ndensity..))+geom_freqpoly(binwidth=0.01)+geom_vline(xintercept = 0.1)
#  stat_summary(fun.y = mean, geom = "point",position = position_dodge(width=1)) +
 # stat_summary(fun.data = mean_se, geom = "errorbar",position = position_dodge(width=1))


n_fun <- function(x){
  return(data.frame(y = median(x)*1.15, label = paste0(length(x))))
}
exp.data <- exp.data %>% mutate(bin = as.factor(bin))#num_cat = case_when(


ggplot(nones.exp.data %>% filter(int_density>=0),aes(color=group,x=int_density))+geom_freqpoly(aes(y=..density..),binwidth=0.1)+scale_x_log10()

independence_test(group~int_density,exp.data%>%filter(group!='essential'&cat!='nxes.only')%>%mutate(group=factor(group)))

pdf('../analysis/figures/0709_interactiondensity/03501sdgroupedintdensity.pdf')

#ggplot(exp.data,aes(x=deg,y=num,color=group))+geom_point()#+scale_x_log10()+scale_x_log10()
pltdata=exp.data%>%filter(deg>0&group!='essential'&bin%in%exp.data$bin[exp.data$group=='proto-gene']&cat%in%exp.data$cat[exp.data$group=='proto-gene'])
ggplot(pltdata,aes(x=bin,y=int_density,color=group,fill=cat))+#geom_boxplot()
  stat_summary(fun.y = mean, geom = "point",position = position_dodge(width=1)) +
  stat_summary(fun.data = mean_se, geom = "errorbar",position = position_dodge(width=1))+facet_grid(cat~.)

pdf('../analysis/figures/0709_interactiondensity/intdensity_nonespgs_nonxesonly02.pdf')
,color=group

ggplot(exp.data%>%filter(group!='essential'&cat!='nxes.only'),aes(x=group,y=int_density,fill=group))+#geom_boxplot()
  stat_summary(fun.y = mean, geom = "bar",position = position_dodge(width=1)) +
  stat_summary(fun.data = mean_se, geom = "errorbar",position = position_dodge(width=1))+
  stat_summary(fun.data = n_fun, geom = "text",position=position_dodge(width=1))+ylab("Interaction density")+xlab("")+ coord_cartesian(ylim=c(0, 0.008))

dev.off()


oacar/pgsNetwork documentation built on Oct. 1, 2019, 9:15 a.m.