MERIT: MastEr Regulator Inference Tool

# using identifications based on iTAK platform to identify regulator families involved
tf.families = read.table("experiments/Arabidopsis_thaliana-all.txt", header = FALSE, fill = TRUE, stringsAsFactors = FALSE)[,1:2]
tf.families$V1 <- gsub("\\..*", "",tf.families$V1)
tf.families <- unique(tf.families)

v.tf_families <- tf.families$V2
names(v.tf_families) <- tf.families$V1

v.regulators <-  v.tf_families
v.regulators <- v.regulators[!grepl("RLK", v.regulators) & !grepl("PPC", v.regulators) & !grepl("AGC", v.regulators) & !grepl("Group", v.regulators)  & !grepl("mTERF", v.regulators) & !grepl("Tify", v.regulators) & !grepl("Aur", v.regulators)  & !grepl("CAMK", v.regulators)  & !grepl("TRAF", v.regulators) & !grepl("MED7", v.regulators)]


df.condition_tissue <- unique(df.experiment_condition_annotation[,c("condition", "condition_optional", "tissue", "unique_ID")])


m.fc <- l.data$m.foldChange_differentialExpression
m.de <- l.data$m.pvalue_differentialExpression

m.fc[m.fc > 0] <- 1
m.fc[m.fc < 0] <- -1

m.de[m.de <= th.diffexp] <- 10
m.de[m.de <= 1] <- 0
m.de[m.de > 0] <- 1

m.de <- m.fc * m.de

# subset to growth & defense genes
df = read.csv("datasets/growth_defense_genes.csv")
v.mes = intersect(rownames(m.fc), df$ID)
m.de.gd = m.de[v.gns,]

# TFs
df.tfs = read.table("datasets/tfs_groups.csv", sep = "\t", header = T)
df.net = read.table("datasets/network.csv", sep = "\t", header = T)

# select indices ...
tb.condition_treatments <- l.data$tb.condition_treatments
tb.condition_tissues <- l.data$tb.condition_tissues

ids.exps = unique(l.data$df.experiment_condition_annotation$unique_ID)

# blueprint network (large enough statistics but static)
m.grn <- as.matrix(l.res.grn_tfbs$m.lead_support_w_motif.grn)
hist(m.grn[m.grn > 0])
length(m.grn[m.grn > 0])
dim(m.grn)
m.grn[m.grn > 0] = 1
sum(m.grn)


library(psych)
# get TFs as enriched
v.gns <- vector(mode = "list", length = 2)
v.gns[[1]] <- unique(subset(df, df$Group == "growth")$ID)
v.gns[[2]] <- unique(subset(df, df$Group == "defense")$ID)
names(v.gns) = c("growth", "defense")

groups <- c(rep(names(v.gns)[1], length(v.gns[[1]])), rep(names(v.gns)[2], length(v.gns[[2]])))
names(groups) <- c(v.gns[[1]], v.gns[[2]])

doms <- df$Domain
names(doms) <- df$ID


message("track the epxression pattern of metabolic genes")
expr <- vector(mode = "list", length = 2)
expr[[1]] <- matrix(0, nrow=2, ncol=length(ids.exps))
expr[[2]] <- matrix(0, nrow=2, ncol=length(ids.exps))
names(expr) = c("growth", "defense")
rownames(expr[[1]]) <- rownames(expr[[2]]) <- c("-1", "1")
colnames(expr[[1]]) <- colnames(expr[[2]]) <- colnames(m.de)

for(j in 1:length(ids.exps)){

  id <- ids.exps[j]
  id <- as.character(id)

  for(k in 1:2){
    n.gns <- length(v.gns[[k]])
    t <- table(m.de[v.gns[[k]],id])
    if("-1" %in% names(t)){
      expr[[k]][1, id] <- round(t["-1"] / n.gns * 100, 1)
    }
    if("1" %in% names(t)){
      expr[[k]][2, id] <- round(t["1"] / n.gns * 100, 1)
    }
  }

}

message("correlation across conditions between ratios of up and down regulated metabolic enzumes")
cor(expr[[1]][1,], expr[[1]][2,])
cor(expr[[2]][1,], expr[[2]][2,])

df.data <- data.frame(repressed = expr[[1]][1,], expressed = expr[[1]][2,], col = "blue")
df.data <- rbind(df.data, data.frame(repressed = expr[[2]][1,], expressed = expr[[2]][2,], col = "red"))

plot(df.data$expressed, df.data$repressed, col=df.data$col)
legend(x="topright", legend = c("growth", "defense"), col=c("blue","red"), pch=1)


####


df["repressed"] = 0
df["expressed"] = 0
df["col"] = ifelse(df$Group == "growth", "blue", "red")

for(i in 1:nrow(df)){

  gn = df$ID[i]
  df$Group[i]
  t <- table(m.de[gn,])
  if("-1" %in% names(t)){
    df$repressed[i] = round(t["-1"] / ncol(m.de) * 100, 1)
  }
  if("1" %in% names(t)){
    df$expressed[i] = round(t["1"] / ncol(m.de) * 100, 1)
  }

}

plot(df$expressed, df$repressed, col=df$col)
legend(x="topright", legend = c("growth", "defense"), col=c("blue","red"), pch=1)



## repression and expression
v.tfs <- rownames(m.grn)

df.set <- data.frame(repressed = rep(0, ncol(m.grn)),
                     expressed = rep(0, ncol(m.grn)),
                     col = rep(0, ncol(m.grn)))

for(i in 1:ncol(m.grn)){
  gn = colnames(m.grn)[i]
  t <- table(m.de[gn,])
  if("-1" %in% names(t)){
    df.set$repressed[i] = round(t["-1"] / ncol(m.de) * 100, 1)
  }
  if("1" %in% names(t)){
    df.set$expressed[i] = round(t["1"] / ncol(m.de) * 100, 1)
  }

  if(gn %in% v.tfs){
    df.set$col[i] = "red"
  }else if(gn %in% v.mes){
    df.set$col[i] = "blue"
  }else{
    df.set$col[i] = "gray"
  }

}

plot(df.set$expressed, df.set$repressed, col=df.set$col)
legend(x="topright", legend = c("TFs", "MEs", "Other"), col=c("red", "blue", "gray"), pch=1)


message("differential expression of transcription factors ")
hist(round(rowSums(abs(m.de[v.tfs, ])) / dim(m.de)[2]*100,1), 50)
hist(round(rowSums(abs(m.de[v.mes, ])) / dim(m.de)[2]*100,1), 50)
hist(round(rowSums(abs(m.de)) / dim(m.de)[2]*100,1), 50)

median(rowSums(abs(m.de[v.tfs, ])) )
median(rowSums(abs(m.de[v.mes, ])))
median(rowSums(abs(m.de)))



m.crosstalk <- matrix(0, nrow=length(v.tfs), ncol=2)
rownames(m.crosstalk) = v.tfs

for(i in 1:length(v.tfs)){

  v.tgs <- names(which(m.grn[v.tfs[i],] != 0))
  n.tgs <- length(v.tgs)
  n.genome <- dim(m.grn)[2]

  th.min.samples = 2

  for(k in 1:2){

    n.gns <- length(v.gns[[k]])
    n.set <- length(intersect(v.gns[[k]], v.tgs))

    hitInSample <- n.set
    sampleSize <- n.gns
    hitInPop <- n.tgs
    popSize <- n.genome
    failInPop <- popSize - hitInPop
    fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))

    if(fc > 1 & hitInSample >= th.min.samples){
      p.val <- phyper(hitInSample-1, hitInPop, failInPop, sampleSize, lower.tail= FALSE)
      if(p.val <= 0.05)
        m.crosstalk[i,k] = round(n.set / n.gns * 100, 1)
    }
  }
}

v.tfs.sel <- names(which(rowSums(m.crosstalk) > 0))
plot(m.crosstalk[v.tfs.sel,1], m.crosstalk[v.tfs.sel,2], xlab = "growth", ylab = "defense")

message("percentage of growth / defense enriched regulators ")
round(length(v.tfs.sel) / length(v.tfs) * 100, 1)

#####


m.conditions <- matrix(0, nrow=length(v.tfs), ncol=length(tb.condition_treatments))
rownames(m.conditions) = v.tfs
colnames(m.conditions) = names(tb.condition_treatments)

m.tissues <- matrix(0, nrow=length(v.tfs), ncol=length(tb.condition_tissues))
rownames(m.tissues) = v.tfs
colnames(m.tissues) = names(tb.condition_tissues)

for(i in 1:length(v.tfs)){

  i.conds <- names(which(m.de[v.tfs[i],] != 0))
  # v.tgs <- names(which(m.grn[v.tfs[i],] != 0))

  id <- ids.exps[j]
  df.j <- subset(df.condition_tissue, df.condition_tissue$unique_ID %in% as.numeric(i.conds))
  cond <- c(df.j$condition, df.j$condition_optional)
  cond <- cond[cond != ""]
  tis <- df.j$tissue

  cond <- table(cond)
  tis <- table(tis)

  m.conditions[v.tfs[i], names(cond)] = round(cond / tb.condition_treatments[names(cond)] * 100, 1)
  m.tissues[v.tfs[i], names(tis)] = round(tis / tb.condition_tissues[names(tis)] * 100, 1)

}



library(reshape2)
library(ggplot2)

df.heatmap <- melt(t(m.tissues))

df.MR_vs_conditions <- df.heatmap
df.MR_vs_conditions <- subset(df.MR_vs_conditions, !is.na(df.MR_vs_conditions$value))

df.heatmap$Var2 <- paste(df.heatmap$Var2, "(", v.tf_families[df.heatmap$Var2], ")", sep = "")

v.breaks = c(0, 5,25,50,75,100)
v.lables <- c("0", "5","25","50", "75", "100")

ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "yellow", high = "red", mid = "white", midpoint = 0, name = "percentage", breaks=v.breaks, labels=v.lables) + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 10, hjust = 1, color = "black"))

filename <- paste("tmp/", "transcription_factors_vs_tissues.pdf", sep = "")
pdf(filename, width=8, height=120)
plot(ggheatmap)
dev.off()




v.tgs <- c(v.gns[[1]], v.gns[[2]])
idx.tgs <-seq(1, length(v.tgs), 1)
names(idx.tgs) <- v.tgs

idx.map <- seq(1, length(ids.exps), 1)
names(idx.map) <- as.character(ids.exps)

master_regulator <- array(rep(0, length(v.tfs.sel)*2*length(idx.map)), dim=c(length(v.tfs.sel), 2, length(idx.map)))
master_targets <- array(rep(0, length(v.tfs.sel)*length(idx.map)*length(idx.tgs)), dim=c(length(v.tfs.sel), length(idx.map), length(idx.tgs)))


pb <- txtProgressBar(min = 0, max = length(v.tfs.sel), style = 3)
for(i in 1:length(v.tfs.sel)){
  setTxtProgressBar(pb, i)
  tf <- v.tfs.sel[i]
  i.conds <- names(which(m.de[tf,] != 0))
  v.tgs <- names(which(m.grn[tf,] != 0))
  n.tgs <- length(v.tgs)

  df.j <- subset(df.condition_tissue, df.condition_tissue$unique_ID %in% as.numeric(i.conds))
  cond <- c(df.j$condition, df.j$condition_optional)
  cond <- cond[cond != ""]
  tis <- df.j$tissue
  cond <- table(cond)
  tis <- table(tis)

  for(k in 1:2){ # less than or more than? -> putative "master" regulators => cross talk? -> general blueprint -> activated per condition?

    n.pop = length(v.gns[[k]])

    if(m.crosstalk[tf,k] > 0){
      # n.gns <- length(v.gns[[k]])
      v.tgs.k <- intersect(v.gns[[k]], v.tgs)
      n.tgs.k <- length(v.tgs.k)

      for(j in 1:length(i.conds)){

        c = i.conds[j]
        v.tgs.kc <- names(which(m.de[v.tgs.k, c] != 0)) # all targets with active conditions
        n.tgs.kc <- length(v.tgs.kc)

        hitInSample <- n.tgs.kc
        sampleSize <- n.tgs.k
        hitInPop <- length(which(m.de[v.gns[[k]], c] != 0))
        popSize <- n.pop
        failInPop <- popSize - hitInPop
        fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))

        if(fc > 1 & hitInSample >= th.min.samples){
          p.val <- phyper(hitInSample-1, hitInPop, failInPop, sampleSize, lower.tail= FALSE)
          if(p.val <= 0.05){

            master_regulator[i,k,idx.map[c]] = round(n.tgs.kc / n.tgs.k * 100, 1)
            master_targets[i, idx.map[c], idx.tgs[v.tgs.kc]] = m.de[v.tgs.kc, c]

          }
        }
      }
    }
  }
}
close(pb)


# plot 98 patterns
folder = "tmp/roles/"
for(i in 1:length(v.tfs.sel)){

  filename <- paste(folder, v.tfs.sel[i], ".pdf", sep = "")
  pdf(filename, width, height)
  plot(master_regulator[i,1,], master_regulator[i,2,], xlab = "growth", ylab = "defense")
  dev.off()

}


library(textshape)

folder = "tmp/targets/"

for(i in 1:length(v.tfs.sel)){

  m <- master_targets[i,,]

  vals <- paste(df.condition_tissue$condition, df.condition_tissue$tissue, sep = ", ")
  names(vals) <- df.condition_tissue$unique_ID

  rownames(m) <- vals[idx.map]
  colnames(m) <- names(idx.tgs)

  m.heatmap <- t(m)

  m.heatmap <- m.heatmap[rowSums(abs(m.heatmap)) > 0, ,drop = F]
  m.heatmap <- m.heatmap[, colSums(abs(m.heatmap)) > 0 ,drop = F]

  if(dim(m.heatmap)[1] > 1 & dim(m.heatmap)[2] > 1)
    m.heatmap <- cluster_matrix(m.heatmap)

  df.heatmap <- melt(m.heatmap)

  if(nrow(df.heatmap) > 0){

    df.heatmap$Var1 <- paste(df.heatmap$Var1, " (", groups[as.character(df.heatmap$Var1)], " - ", doms[as.character(df.heatmap$Var1)], " )", sep = "")

    v.breaks = c(-1, 0, 1)
    v.lables <- c("-1", "0", "1")

    ggheatmap <- ggplot(df.heatmap, aes(Var1, Var2, fill = value)) + geom_tile(color = "black") + scale_fill_gradient2(low = "green", high = "red", mid = "black", midpoint = 0, name = "expression", breaks=v.breaks, labels=v.lables, limits = c(-1,1)) + theme_minimal() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1, color = "black")) + theme(axis.title.y=element_blank(), axis.text.y = element_text(angle = 0, vjust = , size = 10, hjust = 1, color = "black"))

    width = max(5, length(unique(df.heatmap$Var1)) / 2)
    height = max(5, length(unique(df.heatmap$Var2)) / 3)

    filename <- paste(folder, v.tfs.sel[i], ".pdf", sep = "")
    pdf(filename, width, height)
    plot(ggheatmap)
    dev.off()

  }

}









  ## require similar behavior? - maybe to stringent

  ## separate growth and defense



  # requirement - co-diffexpression in the same condition

  n.tgs <- length(v.tgs)
  n.genome <- dim(m.grn)[2]

  #for(j in 1:length(ids.exps)){

  #id <- ids.exps[j]
  #id <- as.character(id)


  # TF behaviour across conditions or other TFs
  # consistent conditions?
  # isolated events - or

  th.min.samples = 2

  for(k in 1:2){ # less than or more than? -> putative "master" regulators => cross talk? -> general blueprint -> activated per condition?

    n.gns <- length(v.gns[[k]])
    n.set <- length(intersect(v.gns[[k]], v.tgs))

    hitInSample <- n.set
    sampleSize <- n.gns
    hitInPop <- n.tgs
    popSize <- n.genome
    failInPop <- popSize - hitInPop
    fc <- ((hitInSample / sampleSize) / (hitInPop / popSize))

    if(fc > 1 & hitInSample >= th.min.samples){
      p.val <- phyper(hitInSample-1, hitInPop, failInPop, sampleSize, lower.tail= FALSE)
      if(p.val <= 0.05)
        m.crosstalk[i,k] = round(n.set / n.gns * 100, 1)
    }
  }


}

v.tfs.sel <- names(which(rowSums(m.crosstalk) > 0))
plot(m.crosstalk[v.tfs.sel,1], m.crosstalk[v.tfs.sel,2], xlab = "growth", ylab = "defense")
# m.crosstalk[v.tfs.sel,] # in blue print

message("percentage of growth / defense enriched regulators ")
round(length(v.tfs.sel) / length(v.tfs) * 100, 1)





# unsupervised clustering - with labels?
# identify multiple TFs to explain combined behaviour?

# TODO: check with transcription factors and general genes?
# TODO: transcription factor regulation of booth genes

for(k in 1:2){
  n.gns <- length(v.gns[[k]])
  for(l in 1:n.gns){
    gn = v.gns[[k]][l]
    t <- table(m.de[gn,])

  }


  t <- table(m.de[v.gns[[k]],id])


}




# identify regulators that may function on their own


#  TODO: crosstalk per condition - as function

# AT2G01760 - https://www.uniprot.org/uniprot/Q8L9Y3

# characterize all 90 TFs
# also pairwise ...
message("regulators of crosstalk?")
plot(m.crosstalk[v.tfs.sel,1], m.crosstalk[v.tfs.sel,2]) #, col=df$col)
# legend(x="topright", legend = c("growth", "defense")) #, col=c("blue","red"), pch=1)



# filename = paste("tmp/", "growth_expression_ratios.pdf", sep = "")
# df.data <- data.frame(cond = names(tb.condition_tissue_differentialExpression), nr = as.numeric(tb.condition_tissue_differentialExpression))
# p <- ggplot(df.data, aes(cond, nr))
# p <- p + geom_bar(stat = "identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ylab("cumulative number of active (expressed and repressed) genes at differential expression p-value < 0.05")
# pdf(filename, 18,8)
# p
# dev.off()
#
#
#
# library(ggplot2)
#
# ggplot(df.data, aes(x=repressed, y=expressed)) + geom_point()
#
#
#
#
#
# filename = paste("tmp/", "growth_expression_ratios.pdf", sep = "")
# df.data <- data.frame(cond = names(tb.condition_tissue_differentialExpression), nr = as.numeric(tb.condition_tissue_differentialExpression))
# p <- ggplot(df.data, aes(cond, nr))
# p <- p + geom_bar(stat = "identity") + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ylab("cumulative number of active (expressed and repressed) genes at differential expression p-value < 0.05")
# pdf(filename, 18,8)
# p
# dev.off()


for(j in 1:length(ids.exps)){

  # get meta information about the condition? (heat, etc.. time course )

  id <- ids.exps[j]
  df.j <- subset(df.experiment_condition_annotation, df.experiment_condition_annotation$unique_ID == id)
  cond <- c(df.j$condition[1], df.j$condition_optional[1])
  cond <- cond[cond != ""]
  tis <- df.j$tissue[1]
  id <- as.character(id)

  print(cond)
  print(tis)

  # print(paste(cond, " - ",  tis))

  print(table(m.de[v.gns[[1]],id]))
  print(table(m.de[v.gns[[2]],id]))

  # TODO: requirement => identify co-differentially expressed

  # general gene expression behavior in
  print(table(m.de.gd[,id])) # selection growth and defense subgroup
  print(table(m.de[,id]))  # genome


  # get the blueprint ?
  # get the differential expression of the TFs
  for(k in 1:2){

    print(names(v.gns)[k])

    idx <- which(m.grn[,v.gns[[k]]] > 0, arr.ind = T) # get growth regulators
    v.tfs <- names(which(rowSums(m.grn[,v.gns[[k]]]) > 0)) # from blueprint

    n.tfs = length(v.tfs)
    table(m.de[v.tfs,id]) # TODO: motivation to isolate specific TFs that are "active" in a given condition
    v.tfs <- names(which(m.de[v.tfs,id] != 0)) # instantiate in condition? - condition active TFs
    table(m.de[v.tfs,id])
    print(length(v.tfs) / n.tfs)

    v.mes <- names(which(m.de[v.gns[[k]],id] != 0))
    print(length(v.mes) / length(v.gns[[k]]))

    # active regulation
    rowSums(m.grn[v.tfs, v.mes])

    # make strong statistical case to link regulator to targets to condition
    # TODO: temporal case for the exceptions in the dataset (temporal up and down-regulation of growth and defense)

    # TODO: separate into similar condtion groups (box plots) - TFs, MEs
    # TODO: boxplot - regular links? - links per TF?
    print(table(rowSums(m.grn[v.tfs, v.mes])))

    # distribution - growth vs defense
    # TODO: making percentage of regulator extrapolation ...?

  }


  for(i in 1:length(doms)){

    # annotated growth?
    df.i <- subset(df, df$Domain == doms[i])
    v.gns.i <- unique(df.i$ID)
    m.de.i <- m.de[v.gns.i,]

  }

  # TF identity
  # regulation identity

}


# gene pair - statistics across conditions (groups)
# single condition - statistics across genes (groups)

tb.condition_treatments <- l.data$tb.condition_treatments
tb.condition_tissues <-  l.data$tb.condition_tissues
df.experiment_condition_annotation <- l.data$df.experiment_condition_annotation

doms <- unique(df$Domain)

for(i in 1:length(doms)){

  # annotated growth?
  df.i <- subset(df, df$Domain == doms[i])
  v.gns.i <- unique(df.i$ID)
  m.de.i <- m.de[v.gns.i,]

  # general bayesian statistics on the total number of hits ...
  # l.data$tb.condition_treatments
  # l.data$tb.condition_tissues

  for(j in 1:length(ids.exps)){

    id <- ids.exps[j]
    df <- subset(df.experiment_condition_annotation, df.experiment_condition_annotation$unique_ID == id)
    cond <- c(df$condition[1], df$condition_optional[1])
    cond <- cond[cond != ""]
    tis <- df$tissue[1]
    id <- as.character(id)

    cond
    tis

    print(table(m.de.i[,id]))

  }



}



### turn the argument around ##

### work with the experimentally validated defense / growth genes
### having enough (not necessary complete) set of conditions to showcase defense vs. growth and regulation behaviour