load("~/Documents/nempi_backup/backup3/TCGA-BRCA_nempi.rda")
M <- Pmut
patients.mut <- unlist(lapply(colnames(M), function(x) {
y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-")
return(y)
}))
snv_matrix <- M
D <- readRDS("~/Documents/mutclust/TCGA-BRCA_counts.rds")
DT <- D$T
DN <- D$N
patients.exprs <- unlist(lapply(colnames(DT), function(x) {
y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-")
return(y)
}))
# colnames(DN) <- unlist(lapply(colnames(DN), function(x) {
# y <- paste(unlist(strsplit(x, "\\-"))[1:3], collapse = "-")
# return(y)
# }))
DTN <- cbind(DT, DN)
samples <- colnames(DT)[which(patients.exprs %in% patients.mut)]
expression_matrix <- DTN
sample_origins <- c(rep("tumor",ncol(DT)),rep("normal",ncol(DN)))
colnames(expression_matrix) <- unlist(lapply(colnames(expression_matrix), function(x) {
y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-")
y <- unlist(strsplit(y, ""))
y <- paste(y[-length(y)], collapse = "")
return(y)
}))
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genesymbols <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"), values=rownames(expression_matrix), mart= mart)
rownames(expression_matrix) <- genesymbols[match(rownames(expression_matrix), genesymbols[, 1]), 2]
library(PRODIGY)
# Load STRING network data
data(STRING_network)
network = STRING_network
# Get differentially expressed genes (DEGs) for all samples
expression_matrix = expression_matrix[which(rownames(expression_matrix) %in% unique(c(network[,1],network[,2]))),]
library(DESeq2)
DEGs = get_DEGs(expression_matrix,samples,sample_origins=sample_origins,beta=2,gamma=0.05)
names(DEGs) <- unlist(lapply(names(DEGs), function(x) {
y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-")
y <- unlist(strsplit(y, ""))
y <- paste(y[-length(y)], collapse = "")
return(y)
}))
# Run PRODIGY
samples <- colnames(snv_matrix)
all_patients_scores = PRODIGY_cohort(snv_matrix,expression_matrix,network=network,samples=samples,DEGs=DEGs,alpha=0.05,pathwayDB="reactome",num_of_cores=1,sample_origins=sample_origins,write_results = TRUE, results_folder = "prodigy/",beta=2,gamma=0.05,delta=0.05)
tmp <- PRODIGY(snv_matrix,expression_matrix,network,
sample=names(DEGs)[1],DEGs[[1]],
sample_origins=sample_origins,
write_results = TRUE,
results_folder = "prodigy/")
# Get driver gene rankings for all samples
results = analyze_PRODIGY_results(all_patients_scores)
## try dawnrank:
library(DawnRank)
DT2 <- DT
colnames(DT2) <- unlist(lapply(colnames(DT2), function(x) {
y <- paste(unlist(strsplit(x, "\\-"))[1:4], collapse = "-")
y <- unlist(strsplit(y, ""))
y <- paste(y[-length(y)], collapse = "")
return(y)
}))
DT2 <- DT2[,which(colnames(DT2) %in% colnames(M))]
M <- M[,which(colnames(M) %in% colnames(DT2))]
DT2 <- DT2[,colnames(M)]
DT2.log2 <- log2(DT2 + 1)
DN.log2 <- log2(DN + 1)
file <- "~/Documents/testing/nempi/other/nempi_dawnrank_data.rds"
if (file.exists(file)) {
normalizedDawn <- readRDS(file)
} else {
normalizedDawn <- DawnNormalize(tumorMat = DT2.log2, normalMat = DN.log2)
saveRDS(normalizedDawn, file = file)
}
n <- length(unique(as.vector(network[,1:2])))
pathway <- matrix(0, n, n)
rownames(pathway) <- colnames(pathway) <- sort(unique(as.vector(network[,1:2])))
a <- match(network[,1],rownames(pathway))
b <- match(network[,2],rownames(pathway))
pathway[cbind(a,b)] <- 1
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genesymbols <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id","hgnc_symbol"), values=rownames(normalizedDawn), mart= mart)
rownames(normalizedDawn) <- genesymbols[match(rownames(normalizedDawn), genesymbols[, 1]), 2]
tmp <- which(rownames(M) %in% rownames(normalizedDawn))
M <- M[tmp,]
tmp <- which(rownames(normalizedDawn) %in% rownames(M))
normalizedDawn <- normalizedDawn[tmp,]
normalizedDawn <- rowsum(normalizedDawn, rownames(normalizedDawn))/as.numeric(table(rownames(normalizedDawn)))
tmp <- rownames(M)[which(!(rownames(M) %in% rownames(pathway)))]
pathway2 <- matrix(0, length(tmp)+nrow(pathway),length(tmp)+ncol(pathway))
pathway2[1:nrow(pathway),1:ncol(pathway)] <- pathway
rownames(pathway2) <- colnames(pathway2) <- c(rownames(pathway),tmp)
tmp <- which(colnames(M) %in% colnames(normalizedDawn))
M <- M[,tmp]
M <- M[rownames(M),colnames(M)]
normalizedDawn <- normalizedDawn[rownames(M),colnames(M)]
pathway2 <- pathway2[rownames(M),rownames(M)]
my.genes <- c("TBX3","MAP2K4","GATA3","CBFB","NCOR1","PTPRD","GPS2","CDKN1B")
## load("~/Documents/testing/nempi/other/Dawnrank_temp.rda")
file <- "~/Documents/testing/nempi/other/nempi_dawnrank.rds"
if (!file.exists(file)) {
# get the DawnRank Score Get some coffee, this might take a while!
dawnRankScore <- DawnRank(adjMatrix = pathway2, mutationMatrix = M,
expressionMatrix = normalizedDawn, mu = 3,
parallel = NULL)
saveRDS(dawnRankScore,file=file)
} else {
dawnRankScore <- readRDS(file)
}
rank.mat <- matrix(NA, length(my.genes), ncol(M))
rownames(rank.mat) <- my.genes
colnames(rank.mat) <- colnames(M)
for (gene in 1:length(my.genes)) {
idx <- which(rownames(dawnRankScore[[2]]) == my.genes[gene])
if (length(idx) > 0) {
rank.mat[gene,] <- dawnRankScore[[2]][idx,]
}
}
Msub <- M[which(rownames(M) %in% my.genes),]
Msub <- Msub[rownames(rank.mat),]
rank.mat.sub <- rank.mat[,which(apply(Msub, 2, sum) != 0)]
Msub <- Msub[,which(apply(Msub, 2, sum) != 0)]
cut <- 90
TP <- sum(rank.mat.sub > cut & Msub == 1)
FP <- sum(rank.mat.sub > cut & Msub == 0)
TN <- sum(rank.mat.sub <= cut & Msub == 0)
FN <- sum(rank.mat.sub <= cut & Msub == 1)
TP/(TP+FP)
TP/(TP+FN)
gamsave <- rank.mat.sub/100
Gamma <- Msub
auc <- roc <- 0
ppv <- rec <- spec <- NULL
for (cut in c(2,seq(1,0, length.out = 100),-1)) {
gamtmp <- apply(gamsave, 2, function(x) {
y <- x*0
y[which(x > cut)] <- 1
return(y)
})
gamtmp2 <- Gamma
tp <- sum(gamtmp == 1 & gamtmp2 == 1)
fp <- sum(gamtmp == 1 & gamtmp2 == 0)
tn <- sum(gamtmp == 0 & gamtmp2 == 0)
fn <- sum(gamtmp == 0 & gamtmp2 == 1)
ppvtmp <- tp/(tp+fp)
if (is.na(ppvtmp)) { ppvtmp <- 0.5 }
rectmp <- tp/(tp+fn)
spectmp <- 1-tn/(tn+fp)
if (length(ppv) > 0) {
auc <- auc +
(rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2
roc <- roc +
(spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2
}
ppv <- c(ppv, ppvtmp)
rec <- c(rec, rectmp)
spec <- c(spec, spectmp)
}
par(mfrow=c(1,3))
plot(ppv)
plot(rec)
plot(spec)
pdf("~/Documents/temp.pdf", width = 16, height = 8)
tmp <- rank.mat.sub
colnames(tmp) <- NULL
csc <- numeric(ncol(tmp))
csc[which(apply(Msub, 2, sum) != 0)] <- 4
epiNEM::HeatmapOP(tmp/100,
breaks=seq(0,1,length.out=100),
col="RdBu",
bordercol="transparent",
colSideColors=csc)
dev.off()
## check with mutation matrix:
if (file.exists("nempi_dawnrank_agg.rds")) {
aggregateDawnRankScore <- condorcetRanking(scoreMatrix =dawnRankScore[[2]],mutationMatrix = M,parallel = NULL)
saveRDS(aggregateDawnRankScore,file="nempi_dawnrank_agg.rds")
} else {
aggregateDawnRankScore <- readRDS("nempi_dawnrank_agg.rds")
}
tmp <- aggregateDawnRankScore[[2]]
tmp[which(names(tmp) %in% my.genes)]
## process mutations for 2020 plus (makes no sense?):
mutSave <- mut
for (i in 1:4) {
mut[[i]] <- as.data.frame(mut[[i]])
}
m2020 <- do.call("rbind",mut)
write.table(m2020, file = "~/Documents/2020plus/data/brca.txt")
blad <- read.delim("~/Documents/2020plus/data/bladder.txt")
brca <- read.table("~/Documents/2020plus/data/brca.txt")
colnames(brca)[which(colnames(brca) == "Gene")] <- "EnsembleId"
colnames(brca)[which(colnames(brca) == "SYMBOL")] <- "Gene"
colnames(brca)[which(colnames(brca) == "Tumor_Sample_Barcode")] <- "Tumor_Sample"
colnames(brca)[which(colnames(brca) == "Tumor_Seq_Allele1")] <- "Tumor_Allele"
colnames(brca)[which(colnames(brca) == "HGVSp")] <- "Protein_Change"
colnames(brca)[which(colnames(brca) == "HGVSc")] <- "DNA_Change"
brca <- data.frame(brca, Tumor_Type = rep("Breast Cancer", nrow(brca)))
# "Symbol" "Tumor_Sample_Barcode" "?" "Chromosome" "Start_Position" "End_Position" "Variant_Classification" "Reference_Allele" "Allele/Tumor_Seq_Allele1/2!!!" "HGVSp/HGVSp_Short" "HGVSc"
brca <- brca[,which(colnames(brca) %in% colnames(blad))]
brca <- brca[,colnames(blad)]
brca[,"Tumor_Sample"] <- unlist(lapply(brca[,"Tumor_Sample"], function(x) {
y <- paste(unlist(strsplit(x,"-"))[1:3],collapse="-")
return(y)
}))
brca$Start_Position <- as.numeric(brca$Start_Position)
brca$End_Position <- as.numeric(brca$End_Position)
## brca_save <- brca
tmp <- which(table(brca$Start_Position) >= 3)
brca <- brca[which(brca$Start_Position %in% names(tmp)),]
brca <- brca[!duplicated(brca$Start_Position),]
write.table(brca, file = "~/Documents/2020plus/data/brca_reduced.txt", quote = FALSE, sep = "\t")
## 2020+
rm error.txt
rm output.txt
ram=32000 # data: 64gb, wilcox sc: 8gb for 32 nodes, edger: 64gb, voom: 64gb
queue=24
cores=1
bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" 'snakemake -s Snakefile pretrained_predict -p --cores 1 \
--config mutations="data/brca.txt" output_dir="brca/" trained_classifier="data/2020plus_100k.Rdata"'
## read results:
res2020 <- read.delim("~/Mount/Eulershare/2020plus/2020plus/brca/pretrained_output/results/r_random_forest_prediction.txt")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.