tasty:

# Analysis script for the tissues
# data files from a variety of sources
# genomics and transcriptomics: TCGA
# proteomics and phosphoproteomics Mertins et al, Nature 2016

setwd("/users/roota/Documents/tasty/expdata")
#library(data.table)
# read the path db and tft db
#pdb <- read.table("DB_Paths_111418.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
pdbT <- fread("DB_Paths_111418.txt")
#tdb <- read.table("DB_TFtargets.txt",sep="\t",header=TRUE,stringsAsFactors = FALSE)
tdbT <- fread("DB_TFtargets.txt")

# read the data files
cnaT <- fread("Tissues_CNA.txt")
rnaT <- fread("Tissues_RNA.txt")
mutT <- fread("Tissues_MUT.txt")
proT <- fread("Tissues_PRO.txt")
styT <- fread("Tissues_STY.txt")

setkey(cnaT, gene)
setkey(rnaT, gene)
setkey(mutT, gene)
setkey(proT, gene)
setkey(styT, gene)

# Check for duplicate rows
cT <- data.frame(table(cnaT$gene))
cT[cT$Freq == 2, ]

rT <- data.frame(table(rnaT$gene))
rT[rT$Freq == 2, ]

mT <- data.frame(table(mutT$gene))
mT[mT$Freq == 2, ]

pT <- data.frame(table(proT$gene))
pT[pT$Freq == 2, ]

yT <- data.frame(table(styT$gene))
yT[yT$Freq == 2, ]

library(plyr)
proT <- ddply(proT, .(gene), summarize,
              norm = mean(norm, na.rm=TRUE))
class(proT)
proT <- as.data.table(proT)
setkey(proT, gene)

styT <- ddply(styT, .(gene), summarize,
              norm = mean(norm, na.rm=TRUE))
class(styT)
styT <- as.data.table(styT)
setkey(styT, gene)





## Add a column for transcription factors, which differs depending on pathlength
pdbT[ ,('TF') := 'NA']
pdbT[pdbT$pathLength == 3, ]$TF <- pdbT[pdbT$pathLength == 3, ]$Gene3
pdbT[pdbT$pathLength == 4, ]$TF <- pdbT[pdbT$pathLength == 4, ]$Gene4
pdbT[pdbT$pathLength == 5, ]$TF <- pdbT[pdbT$pathLength == 5, ]$Gene5
pdbT[pdbT$pathLength == 6, ]$TF <- pdbT[pdbT$pathLength == 6, ]$Gene6


pdbT[ ,('pTF') := 'NA']
pdbT[pdbT$pathLength == 3, ]$pTF <- pdbT[pdbT$pathLength == 3, ]$Node3
pdbT[pdbT$pathLength == 4, ]$pTF <- pdbT[pdbT$pathLength == 4, ]$Node4
pdbT[pdbT$pathLength == 5, ]$pTF <- pdbT[pdbT$pathLength == 5, ]$Node5
pdbT[pdbT$pathLength == 6, ]$pTF <- pdbT[pdbT$pathLength == 6, ]$Node6

# Prepare the TF scores

### Prepare the TF scores
head(tdbT)
head(rnaT)

tdbT2 <- merge(tdbT, rnaT, by.y='gene', by.x = "Target", all.x=TRUE, all.y=FALSE)
head(tdbT2)
library(plyr)
tdbT2[tdbT2$ControlType %in% 'Repression', ]$norm <- -1 * tdbT2[tdbT2$ControlType %in% 'Repression', ]$norm
tfT <- ddply(tdbT2, .(TF), summarize, TFscore = sum(norm, na.rm=TRUE))
head(tfT)
class(tfT)
summary(tfT)
tfT[order(abs(tfT$TFscore), decreasing=TRUE), ][1:50, ]

tfT2 <- as.data.table(tfT)
setkey(tfT2, TF)

tfT2$TFscore <- scale(tfT2$TFscore, center=TRUE, scale=TRUE)
tfT2[tfT2$TFscore > 5, ]$TFscore <- 5
tfT2[tfT2$TFscore < -5, ]$TFscore <- -5

summary(tfT2$TFscore)


sort(unique(pdbT$TF))
sort(unique(pdbT$pTF))

## Apply a filter to reduce the search space

# reduce the size of PDB
sty2keepT <- sort(unique(styT[!is.na(styT$norm), ]$gene))
sty2keepT
#sty2keepT <- c(sty2keepT, NA)

g2keepT <- sort(unique(c(cnaT[abs(cnaT$norm) >= 1, ]$gene,
                        rnaT[abs(rnaT$norm) >= 1, ]$gene,
                        proT[abs(proT$norm) >= 1, ]$gene),NA))
p2keepT <- sort(unique(proT[!is.na(proT$norm), ]$gene))
#p2keep <- c(p2keep, NA)
p2keepT

p2keepT[p2keepT %in% pdbT$Node1]

table(pdbT$Node2 %in% sty2keepT &
        pdbT$Node3 %in% sty2keepT &
        pdbT$Node4 %in% sty2keepT &
        pdbT$Node5 %in% sty2keepT &
        pdbT$Node6 %in% sty2keepT)

## This filter requires receptor detection at protein level and 1 phosphosite
pdbT2 <- pdbT[pdbT$Node1 %in% p2keepT &
               #pdbT$TF %in% tf2keep &
               (pdbT$Node2 %in% sty2keepT |
                  pdbT$Node3 %in% sty2keepT |
                  pdbT$Node4 %in% sty2keepT |
                  pdbT$Node5 %in% sty2keepT |
                  pdbT$Node6 %in% sty2keepT), ] #&
sort(unique(pdbT2$Node1))


# Debug the errors
pdb=pdbT2
tdb=tfT2 # input databases
cna=cnaT
rna=rnaT
mut=mutT
pro=proT
sty=styT #input data
c_cna=0.10
c_rna=0.10
c_mut=0.10
c_pro=0.30
c_sty=0.30
c_tdb=0.10 # input constants
nPerms=10





outT <- scorePaths(pdb=pdbT2, tdb=tfT2, # input databases
                  cna=cnaT,rna=rnaT,mut=mutT,pro=proT,sty=styT, #input data
                  c_cna=0.10 , c_rna=0.10, c_mut=0.10, c_pro=0.30, c_sty=0.30, c_tdb=0.10, # input constants
                  nPerms=10)
head(outT)
sort(unique(outT$Node1))
sort(unique(outT$TF))

outT2 <- ddply(outT, .(Node1, Gene2, Gene3, Gene4, Gene5, Gene6), summarize,
              maxScore = max(sumScore),
              adjpval = min(padj))
outT2[order(outT2$maxScore, decreasing=TRUE), ][1:50, ]
outT2[order(outT2$maxScore, decreasing=FALSE), ][1:50, ]