nempi: Inferring unobserved perturbations from gene expression data

pipe <- as.character(commandArgs(TRUE)[1])

type <- as.character(commandArgs(TRUE)[2])

bsruns <- as.numeric(commandArgs(TRUE)[3]) # bsruns <- 10

startrun <- as.numeric(commandArgs(TRUE)[4]) # startrun <- 10

if (type %in% c("rf", "svm", "nn")) {
    options(expressions = 500000)
}

safeSave <- function(..., list = character(), file = stop("'file' must be specified"),
                     ascii = FALSE, version = NULL, envir = parent.frame(), compress = isTRUE(!ascii),
                     compression_level, eval.promises = TRUE, precheck = TRUE) {

    i <- 1
    while(file.exists(paste0(file, "_", i))) {
        i <- i + 1
    }

    save(..., list = list, file = paste0(file, "_", i),
         ascii = ascii, version = version, envir = envir, compress = compress,
         compression_level = compression_level, eval.promises = eval.promises, precheck = precheck)
}

aucs <- function(a,b) {
    auc <- roc <- 0
    ppv <- rec <- spec <- NULL
    for (cut in c(2,seq(1,0, length.out = 100),-1)) {
        tmp <- a*0
        tmp[which(a > cut)] <- 1
        tp <- sum(tmp == 1 & b == 1)
        fp <- sum(tmp == 1 & b == 0)
        tn <- sum(tmp == 0 & b == 0)
        fn <- sum(tmp == 0 & b == 1)
        ppvtmp <-  tp/(tp+fp)
        if (is.na(ppvtmp)) { ppvtmp <- 0.5 }
        rectmp <- tp/(tp+fn)
        spectmp <- 1-tn/(tn+fp)
        if (length(ppv) > 0) {
            auc <- auc + (rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2
            roc <- roc + (spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2
        }
        ppv <- c(ppv, ppvtmp)
        rec <- c(rec, rectmp)
        spec <- c(spec, spectmp)
    }
    return(list(auc=auc,roc=roc,ppv=ppv,rec=rec,spec=spec))
}

path <- "/cluster/work/bewi/members/mpirkl/"

if (pipe %in% "loo") { 
    library(naturalsort)
    library(nem)
    library(cluster)
    library(Rcpp)
    library(Rgraphviz)
    library(randomForest)
    library(e1071)
    source("mnems.r")
    source("mnems_low.r")
    sourceCpp(code=readChar("mm.cpp", file.info("mm.cpp")$size))
    source("nempi_main.r")
    source("nempi_low.r")
    load("TCGA-BRCA_nempi.rda")
    ## sourceCpp("~/Documents/mnem/src/mm.cpp"); source("~/Documents/mnem/R/mnems.r"); source("~/Documents/mnem/R/mnems_low.r"); source("~/Documents/nempi/R/nempi_main.r"); source("~/Documents/nempi/R/nempi_low.r")
    ## load("nempi_backup/backup3/TCGA-BRCA_nempi.rda")
    P <- Pmut
    P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
    P <- P[order(rownames(P)), order(colnames(P))]
    P[which(P > 1)] <- 1
    Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
    colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
    P <- Ptmp[, colnames(Rho)]
    D4 <- D2
    colnames(D4) <- apply(Rho, 2, function(x) {
        Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
        return(Sgenes)
    })
    D5 <- D4[, which(apply(P, 2, sum) != 0)]
    genes.med <- apply(D5, 1, median)
    D5 <- D5[which(genes.med != 0), ]
    PM <- P[, which(apply(P, 2, sum) != 0)]
    tp <- fp <- fn <- tn <- prauc <- numeric(ncol(D5))
    if (type %in% "nempi") {
        converged <- 1
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5[, -i]
            Rho2 <- PM[, -i]
            Rho3 <- PM
            Rho3[, i] <- 0
            nempitmp <- nempi(D6, Gamma = Rho2, full = TRUE, converged = converged)
            Rho4 <- Rho3
            Rho4[, -i] <- nempitmp$Gamma
            nempitmp <- nempi(D5, Gamma = Rho4, full = FALSE, converged = converged)
            Rho5 <- t(mytc(nempitmp$res$adj))%*%nempitmp$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/nempi_loo.rda"))
    } else if (type %in% "svm") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5[, -i]
            Rho2 <- PM[, -i]
            Rho3 <- PM
            Rho3[, i] <- 0
            svmres <- classpi(D6, full = TRUE, method = "svm")
            Rho4 <- Rho3
            Rho4[, -i] <- svmres$Gamma
            D7 <- D5
            colnames(D7)[i] <- ""
            svmres <- classpi(D7, full = FALSE, method = "svm")
            Rho5 <- svmres$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/svm_loo.rda"))
    } else if (type %in% "rf") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5[, -i]
            Rho2 <- PM[, -i]
            Rho3 <- PM
            Rho3[, i] <- 0
            svmres <- classpi(D6, full = TRUE, method = "randomForest")
            Rho4 <- Rho3
            Rho4[, -i] <- svmres$Gamma
            D7 <- D5
            colnames(D7)[i] <- ""
            svmres <- classpi(D7, full = FALSE, method = "randomForest")
            Rho5 <- svmres$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/rf_loo.rda"))
    } else if (type %in% "nn") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5[, -i]
            Rho2 <- PM[, -i]
            Rho3 <- PM
            Rho3[, i] <- 0
            library(nnet)
            svmres <- classpi(D6, full = TRUE, method = "nnet", MaxNWts = 50000, size = 1)
            Rho4 <- Rho3
            Rho4[, -i] <- svmres$Gamma
            D7 <- D5
            colnames(D7)[i] <- ""
            svmres <- classpi(D7, full = FALSE, method = "nnet", MaxNWts = 50000, size = 1)
            Rho5 <- svmres$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/nn_loo.rda"))
    } else if (type %in% "mf") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5
            colnames(D6)[i] <- ""
            Rho2 <- PM
            Rho2[, i] <- 0
            Rho3 <- PM
            Rho3[, i] <- 0
            mfdata <- cbind(as.data.frame(t(D6)), factor(colnames(D6)))
            mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA
            library(missForest)
            mfimp <- missForest(mfdata)
            colnames(D6) <- mfimp$ximp[, ncol(mfimp$ximp)]
            tmp <- nem(D6)
            ures <- list()
            ures$Gamma <- apply(getGamma(D6), 2, function(x) return(x/sum(x)))
            ures$res <- list()
            ures$res$adj <- tmp$adj
            ures$null <- TRUE
            ures$combi <- 1
            Rho5 <- ures$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/mf_loo.rda"))
    } else if (type %in% "knn") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            D6 <- D5
            colnames(D6)[i] <- ""
            Rho2 <- PM
            Rho2[, i] <- 0
            Rho3 <- PM
            Rho3[, i] <- 0
            library(class)
            train <- t(D6[, which(colnames(D6) != "")])
            test <- t(D6[, which(colnames(D6) == "")])
            knn0 <- 0
            cl <- colnames(D6)[which(colnames(D6) != "")]
            knnres <- knn(train, test, cl, prob = TRUE)
            D3 <- D6
            colnames(D3)[which(colnames(D3) %in% "")] <- as.character(knnres)
            tmp <- nem(D3)
            ures <- list()
            ures$Gamma <- apply(getGamma(D3), 2, function(x) return(x/sum(x)))
            ures$res <- list()
            ures$res$adj <- tmp$adj
            ures$null <- TRUE
            ures$combi <- 1
            Rho5 <- ures$Gamma
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/knn_loo.rda"))
    } else if (type %in% "rand") {
        for (i in startrun:(startrun+bsruns-1)) {
            if (i > ncol(D5)) { break() }
            Rho5 <- PM
            Rho5[,i] <- runif(nrow(Rho5))
            tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
            fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
            fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
            tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
            prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
        }
        safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/rand_loo.rda"))
    }
    stop("successfull loo")
} else {

    library(snowfall)
    library(TCGAbiolinks)

    if (is.null(types)) {
        types <- TCGAbiolinks:::getGDCprojects()$project_id
        ## special "FM-AD"
        dont <- paste0(unique(gsub("-.*", "", types)), "-")
        dont <- dont[-grep("TCGA", dont)]
        samplenr <- matrix(NA, length(types), 2)
        rownames(samplenr) <- types
        donr <- TRUE
        snrcount <- 0
    } else {
        dont <- "AGCT"
        donr <- FALSE
    }

    sizemat <- matrix(0, 1, 2)
    colnames(sizemat) <- c("Tumor", "Normal")
    rownames(sizemat) <- ""

    path <- "mutclust/"

    for (type in types) {
        if (donr) {
            snrcount <- snrcount + 1
        }
        if (length(grep(paste(dont, collapse = "|"), type)) > 0) { next() }
        print(type)
        if (file.exists(paste0(path, type, "_final.rda")) & !newmut & !newllr & !newsave) {
            load(paste0(path, type, "_final.rda"))
        } else {

            summ <- TCGAbiolinks:::getProjectSummary(type)
            library(SummarizedExperiment)

            ## get methylation:
            if (file.exists(paste0(path, type, "_meth.rda"))) {
                load(paste0(path, type, "_meth.rda"))
            } else {
                data <- GDCquery(project = paste(type, sep = ""),
                                 sample.type = "Primary solid Tumor",
                                 data.category = "DNA methylation",
                                 data.type = "Methylation beta value",
                                 legacy = TRUE
                                 )
                GDCdownload(data)
                data <- GDCprepare(data, summarizedExperiment = 0)
                ## map methy sites to genes:
                library(methyAnalysis)
                if (is.data.frame(data)) {
                    meth2 <- as.matrix(data[, -(1:3)])
                    rownames(meth2) <- gsub(";.*", "", data[, 1])
                } else {
                    meth <- data@rowRanges
                    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
                    probe2gene <- annotateDMRInfo(meth, 'TxDb.Hsapiens.UCSC.hg19.knownGene')
                    meth2 <- assay(data)
                    rownames(meth2) <- probe2gene$sigDMRInfo@elementMetadata@listData$GeneSymbol
                }
                meth <- meth2
                meth <- meth[which(apply(meth, 1, function(x) return(any(is.na(x)))) == FALSE), ]
                methm <- meth[which(duplicated(rownames(meth)) == FALSE), ]
                count <- 0
                for (i in which(duplicated(rownames(meth)) == FALSE)) {
                    count <- count + 1
                    methm[count, ] <- apply(meth[which(rownames(meth) %in% rownames(methm)[i]), , drop = FALSE], 2, median)
                }
                meth <- methm
                meth <- meth[order(rownames(meth)), order(colnames(meth))]
                colnames(meth) <- gsub("A$", "", lapply(strsplit(colnames(meth), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
                methm <- meth[, which(duplicated(colnames(meth)) == FALSE)]
                for (i in which(duplicated(colnames(meth)) == TRUE)) {
                    j <- which(colnames(methm) == colnames(meth)[i])
                    methm[, j] <- apply(meth[, which(colnames(meth) %in% colnames(methm)[i]), drop = FALSE], 2, median)
                }
                meth <- methm
                meth <- meth[order(rownames(meth)), order(colnames(meth))]
                save(meth, file = paste0(path, type, "_meth.rda"))
            }

            print("meth done")

            ## get copy number variation:
            if (file.exists(paste0(path, type, "_cnv.rda"))) {
                load(paste0(path, type, "_cnv.rda"))
            } else {
                data <- getGistic(gsub("TCGA-", "", type), type = "thresholded")
                ## data <- GDCquery(project = paste(type, sep = ""),
                ##                  sample.type = "Primary solid Tumor",
                ##                  data.category = "Copy Number Variation",
                ##                  data.type = "Copy Number Segment",
                ##                  )
                ## GDCdownload(data)
                ## data <- GDCprepare(data)
                cnv <- data[, -(1:3)]
                cnv <- apply(cnv, c(1,2), as.numeric)
                rownames(cnv) <- data[, 1]
                colnames(cnv) <- gsub("A$", "", lapply(strsplit(colnames(cnv), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
                cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))]
                save(cnv, file = paste0(path, type, "_cnv.rda"))
            }

            print("cnv done")

            ## get expression
            if (file.exists(paste0(path, type, "_query.rda"))) {
                load(paste0(path, type, "_query.rda"))
            } else {
                if (length(grep("-AML$|-LAML$", type)) > 0) {
                    sampletype <- c("Primary Blood Derived Cancer - Peripheral Blood", "Solid Tissue Normal")
                } else {
                    sampletype <- c("Primary Tumor", "Solid Tissue Normal")
                }
                data <- GDCquery(project = paste(type, sep = ""),
                                 sample.type = sampletype,
                                 data.category = "Transcriptome Profiling",
                                 data.type = "Gene Expression Quantification",
                                 workflow.type = "HTSeq - Counts"
                                 )
                GDCdownload(data)
                if (is.null(data)) {
                    print("is null")
                    next()
                }
                data <- GDCprepare(data)
                save(data, file = paste0(path, type, "_query.rda"))
            }
            ## process expression data:
            D <- assay(data)
            class <- data@colData@listData$definition

            print("gene expression done")

            ## get mutation
            if (file.exists(paste0(path, type, "_mut.rda"))) {
                load(paste0(path, type, "_mut.rda"))
            } else {
                type2 <- gsub(paste(paste0(unique(gsub("-.*", "", types)), "-"), collapse = "|"), "", type)
                library(data.table)
                GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "varscan2") # mutation file
                GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "muse") # mutation file
                GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "somaticsniper") # mutation file
                GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "mutect2") # mutation file
                mut <- list()
                count <- 1
                type3 <- gsub("-", "\\.", type)
                for (i in list.files("GDCdata")) {
                    if (length(grep(type3, i)) > 0 & length(grep("csv", i)) > 0) {
                        mut[[count]] <- fread(paste0("GDCdata/", i))
                        count <- count + 1
                    }
                }
                save(mut, file = paste0(path, type, "_mut.rda"))
            }

            ## clinical
            if (file.exists(paste0(path, type, "_clin.rda"))) {
                load(paste0(path, type, "_clin.rda"))
            } else {
                clinical <- GDCquery_clinic(project = type, type = "clinical")
                save(clinical, file = paste0(path, type, "_clin.rda"))
            }

            ## process mutation data: (https://cancer.sanger.ac.uk/cosmic/help/mutation/overview)
            if (file.exists(paste0(path, type, "_mut0.rda")) & !newmut) {
                load(paste0(path, type, "_mut0.rda"))
            } else {
                n <- nmut # try something to get all patients with at least one mutation
                library(snowfall)
                countSamples <- function(i, mut.org, genes, mut.mat, coln) {
                    i <- which(rownames(mut.mat) %in% genes[i])
                    tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(mut.mat)[i]), coln]
                    tmp2 <- mut.mat[i, ]
                    tmp2[which(colnames(mut.mat) %in% tmp)] <- 1
                    return(tmp2)
                }
                typeSamples <- function(i, mut.org, genes, type.mat, coln, coln2) {
                    i <- which(rownames(type.mat) %in% genes[i])
                    tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln]
                    tmp3 <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln2]
                    tmp2 <- type.mat[i, ]
                    tmp2[which(colnames(type.mat) %in% tmp)] <- tmp3
                    return(tmp2)
                }
                biggl <- list()
                for (i in length(mut)) {
                    mutation <- mut[[i]]
                    biggl[[i]] <- mutation$Hugo_Symbol
                }
                freq <- sort(table(unlist(biggl)), decreasing = TRUE)
                if (n == 0) {
                    allsub <- names(freq)
                } else {
                    allsub <- names(freq)[1:n]
                }
                M <- Mtype <- list()
                for (i in 1:length(mut)) {
                    mutation <- mut[[i]]
                    mut.mat <- matrix(0, length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
                    type.mat <- matrix("", length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
                    colnames(type.mat) <- colnames(mut.mat) <- sort(unique(mutation$Tumor_Sample_Barcode))
                    rownames(type.mat) <- rownames(mut.mat) <- allsub
                    coln <- which(colnames(mutation) %in% "Tumor_Sample_Barcode")
                    coln2 <- which(colnames(mutation) %in% "Variant_Classification")
                    mut.org <- mutation[which(mutation$Hugo_Symbol %in% allsub), ]
                    sfInit(parallel = T, cpus = 4)
                    sfExport("mut.mat", "coln")
                    tmp <- sfLapply(as.list(1:length(allsub)), countSamples, mut.org, allsub, mut.mat, coln)
                    sfStop()
                    tmp <- do.call("rbind", tmp)
                    rownames(tmp) <- allsub
                    colnames(tmp) <- colnames(mut.mat)
                    M[[i]] <- tmp
                    sfInit(parallel = T, cpus = 4)
                    sfExport("type.mat", "coln2")
                    tmp <- sfLapply(as.list(1:length(allsub)), typeSamples, mut.org, allsub, type.mat, coln, coln2)
                    sfStop()
                    tmp <- do.call("rbind", tmp)
                    rownames(tmp) <- allsub
                    colnames(tmp) <- colnames(mut.mat)
                    Mtype[[i]] <- tmp
                }
                samples <- intersect(intersect(intersect(colnames(M[[1]]), colnames(M[[2]])), colnames(M[[3]])), colnames(M[[4]]))
                M0 <- M[[1]][, which(colnames(M[[1]]) %in% samples)]
                Mtype0 <- Mtype[[1]][, which(colnames(Mtype[[1]]) %in% samples)]
                for (i in 2:length(M)) {
                    M0 <- M0 + M[[i]][, which(colnames(M[[i]]) %in% samples)]
                    Mtype0 <- matrix(paste(Mtype0, Mtype[[i]][, which(colnames(Mtype[[i]]) %in% samples)]), nrow(Mtype0))
                }
                rownames(Mtype0) <- rownames(M0)
                colnames(Mtype0) <- colnames(M0)
                save(M0, Mtype0, file = paste0(path, type, "_mut0.rda"))
            }

            ## process expression data:
            D <- assay(data)
            class <- data@colData@listData$definition
            M <- M0
            Mtype <- Mtype0
            colnames(M) <- lapply(colnames(M), function(x) {
                y <- unlist(strsplit(x, "-"))
                y <- paste(y[1:4], collapse = "-")
                y <- unlist(strsplit(y, ""))
                y <- paste(y[1:(length(y)-1)], collapse = "")
                return(y)
            })
            colnames(D) <- lapply(colnames(D), function(x) {
                y <- unlist(strsplit(x, "-"))
                y <- paste(y[1:4], collapse = "-")
                y <- unlist(strsplit(y, ""))
                y <- paste(y[1:(length(y)-1)], collapse = "")
                return(y)
            })
            colnames(M) <- gsub("A$", "", lapply(strsplit(colnames(M), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
            M <- M[order(rownames(M)), order(colnames(M))]
            Mtype <- Mtype[order(rownames(Mtype)), order(colnames(Mtype))]

            print("mutation done")

            ## log odds:
            if (file.exists(paste0(path, type, "_llr.rda")) & !newllr) {
                load(paste0(path, type, "_llr.rda"))
            } else {
                library(edgeR)
                if (sum(class %in% "Solid Tissue Normal") < 10) {
                    distrParNC <- function(i, data) {
                        data[i, ] <- data[i, ] - median(data[, i])
                        llrcol <- numeric(ncol(data))
                        div0 <- quantile(data[i, ], 0.25)
                        div1 <- quantile(data[i, ], 0.75)
                        sigma <- sd(data[i, ])
                        for (j in 1:ncol(data)) {
                            if (data[i, j] <= 0) {
                                llrcol[j] <- log2(div0/data[i, j])
                            } else {
                                llrcol[j] <- log2(data[i, j]/div1)
                            }
                        }
                        return(llrcol)
                    }
                    highcounts <- which(apply(D, 1, median) >= 10)
                    DC <- D[highcounts, ]
                    genenames <- rownames(D)[highcounts, ]
                    nf <- calcNormFactors(DC)
                    DC <- t(t(DC)/nf) # DC <- DC2
                    ## this is very adventurous:
                    ## DC <- t(scale(t(DC)))
                    ## DC <- abs(DC)
                    ## pc <- 1-min(DC)
                    ## tmp <- log2(DC+pc)
                    ## hist(tmp)
                    sfInit(parallel = T, cpus = 4)
                    sfExport("DC")
                    tmp <- do.call("rbind", sfLapply(1:nrow(DC),
                                                     distrParNC, DC))
                    colnames(tmp) <- colnames(DC)
                    rownames(tmp) <- genenames
                    sfStop()
                    DF <- DC
                } else {
                    library(ks)
                    mykcde <- function(x, k) {
                        a <- which.min(abs(k$eval.points - x))
                        b <- k$estimate[a]
                        b <- min(b, 1-b)
                        return(b)
                    }
                    distrParKs <- function(i, data, C) {
                        llrcol <- numeric(ncol(data))
                        ddistr <- list()
                        dogenes <- unique(colnames(data))[which(!(unique(colnames(data)) %in% ""))]
                        for (j in dogenes) {
                            D <- which(colnames(data) %in% j)
                            ddistr[[j]] <- kcde(data[i, D])
                        }
                        cdistr <- kcde(data[i, C])
                        for (j in which(!(colnames(data) %in% ""))) {
                            gene <- colnames(data)[j]
                            llrcol[j] <- log2(mykcde(data[i, j], ddistr[[gene]])/mykcde(data[i,j], cdistr))
                        }
                        llrcol <- llrcol[-C]
                        return(llrcol)
                    }
                    DN <- D[, which(class %in% "Solid Tissue Normal")]
                    DT <- D[, which(class %in% "Primary solid Tumor")]
                    DF <- cbind(DT, DN)
                    C <- (ncol(DT)+1):ncol(DF)
                    highcounts <- which(apply(DF, 1, median) >= 10)
                    DF <- DF[highcounts, ]
                    genenames <- rownames(DF)
                    colnames(DF)[1:ncol(DT)] <- "P" # not knock-down specific!
                    colnames(DF)[grep(paste(unique(gsub("-.*", "", types)), collapse = "|"), colnames(DF))] <- ""
                    nf <- calcNormFactors(DF)
                    DF <- t(t(DF)/nf)
                    sfInit(parallel = T, cpus = 4)
                    sfExport("DF", "C", "mykcde")
                    sfLibrary(ks)
                    tmp <- do.call("rbind", sfLapply(1:nrow(DF),
                                                     distrParKs, DF, C))
                    sfStop()
                    colnames(tmp) <- colnames(DT)
                }
                save(tmp, DF, file = paste0(path, type, "_llr.rda"))
            }

            rownames(tmp) <- rownames(DF)

            par(mfrow=c(1,3))
            hist(tmp)

            tmp[which(is.na(tmp) | is.infinite(tmp))] <- 0

            hist(tmp)

            D <- tmp

            print("expression done")

            ## sd.glob <- sd(tmp)

            ## tmp <- tmp[-which(apply(tmp, 1, sd) < sd.glob), ]

            ## hist(tmp)

            ## prep clinical data:
            clinical[which(clinical$vital_status%in% "dead"), which(colnames(clinical) %in% "vital_status")] <- 1
            clinical[which(clinical$vital_status%in% "alive"), which(colnames(clinical) %in% "vital_status")] <- 0
            count <- 0
            for (stage in sort(unique(clinical$tumor_stage))) {
                clinical[which(clinical$tumor_stage%in% stage), which(colnames(clinical) %in% "tumor_stage")] <- count
                count <- count + 1
            }
            clinical$tumor_stage <- as.numeric(clinical$tumor_stage)
            clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_death")] <- clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_last_follow_up")]
            clinical$vital_status<- as.numeric(clinical$vital_status)

            print("clinical done")

            save(clinical, D, M, Mtype, DF, class, meth, cnv, file = paste0(path, type, "_final.rda"))
        }
        print(table(class))
        sizemat <- rbind(sizemat, table(class))
        rownames(sizemat)[nrow(sizemat)] <- type
        if (donr) {
            samplenr[snrcount, 1] <- sum(class %in% "Primary solid Tumor")
            samplenr[snrcount, 2] <- sum(class %in% "Solid Tissue Normal")
        }
    }

    stop("tcga done")

}

## commands for hpc:

system("scp ~/Documents/mnem/src/mm.cpp euler:")
system("scp ~/Documents/mnem/R/mnems.r euler:")
system("scp ~/Documents/mnem/R/mnems_low.r euler:")
system("scp ~/Documents/nempi/R/nempi_main.r euler:")
system("scp ~/Documents/nempi/R/nempi_low.r euler:")

system("scp testing/nempi/other/nempi_loo.r euler:")

do="loo"

rm error.txt
rm output.txt

ram=8000 # nempi 8000, rf 64000, svm 20000, mf 8000, knn 8000, nn 20000
queue=4 # nempi 2min, rf 90min, svm 10min, mf 2min, knn 1min, nn 40min
bsruns=10
startrun=1
R="R/bin/R"

type="nn" # nempi, svm, rf, nn, knn, mf

if [ ${type} == 'rf' ] || [ ${type} == 'svm' ] || [ ${type} == 'nn' ]
then
maxpp=500000
else
maxpp=50000
fi
if [ ${type} == 'rf' ]
then
ram=64000
bsruns=1
fi
if [ ${type} == 'svm' ]
then
ram=20000
fi
if [ ${type} == 'nn' ]
then
ram=20000
bsruns=1
fi

bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"

end=$(expr 91 + $bsruns )
end=$(expr $end / $bsruns )
## end=85

for (( i=2; i<=$end; i++ )); do
a=$(expr ${i} - 1 )
b=$(expr ${a} \* ${bsruns} )
startrun=$(expr ${b} + 1 )
echo $startrun
bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"
done

for i in {2..10}; do
bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"
done

## read in loo results:

methods <- c("nempi", "svm", "nn", "rf", "mf", "knn", "rand")
methlist <- list()
comparePre <- compareRec <- comparePrauc <- NULL
for (method in methods) {
    print(method)
    methlist[[method]] <- list()
    tp2 <- tn2 <- fp2 <- fn2 <- prauc2 <- numeric(91)
    for (file in list.files("~/Mount/Eulershare/nempi_bs")) {
        if (length(grep(paste0("^", method, "_"), file)) > 0 &
            length(grep(paste0("_loo\\."), file)) > 0) {
            load(paste0("~/Mount/Eulershare/nempi_bs/", file))
            tp2 <- tp2 + tp
            tn2 <- tn2 + tn
            fp2 <- fp2 + fp
            fn2 <- fn2 + fn
            prauc2 <- prauc2 + prauc
        }
    }
    methlist[[method]][["tp"]] <- tp2
    methlist[[method]][["tn"]] <- tn2
    methlist[[method]][["fp"]] <- fp2
    methlist[[method]][["fn"]] <- fn2
    methlist[[method]][["pre"]] <- tp2/(tp2+fp2)
    methlist[[method]][["rec"]] <- tp2/(tp2+fn2)
    methlist[[method]][["rec"]][which(is.na(methlist[[method]][["rec"]]))] <- 0
    methlist[[method]][["prauc"]] <- prauc2
    comparePre <- cbind(comparePre, methlist[[method]][["pre"]])
    compareRec <- cbind(compareRec, methlist[[method]][["rec"]])
    comparePrauc <- cbind(comparePrauc, methlist[[method]][["prauc"]])
}

setEPS()
postscript("temp.eps", height = 7, width = 10)
layout.mat <- matrix(c(rep(c(1,4,7),each=2),10,rep(c(2,5,8),each=2),11,rep(c(3,6,9),each=2),12),7)
par(mfrow=c(1,2))
col <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise","grey")
mnem:::myboxplot(comparePre, dens = 0, col = col, xaxt = "n", main = "precision of mutation calls", ylab = "precision",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(comparePre,2,mean), 2))
mnem:::myboxplot(compareRec, dens = 0, col = col, xaxt = "n", main = "recall of mutation calls", ylab = "recall",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(compareRec,2,mean), 2))
dev.off()


setEPS()
postscript("temp.eps", height = 5, width = 5)
layout.mat <- matrix(c(rep(rep(1,each=3),3),2:4),4,byrow=1)
layout(layout.mat)
wtest <- numeric(ncol(comparePrauc))
for (i in 1:length(wtest)) {
    wtest[i] <- wilcox.test(comparePrauc[,1],comparePrauc[,i],alternative="greater")$p.value
}
col <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise","grey")
mnem:::myboxplot(comparePrauc, dens = 0, col = col, xaxt = "n", main = "area under the precision-recall curve of mutation calls", ylab = "area under the precision-recall curve",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(comparePrauc,2,mean), 2))
par(mar=rep(0, 4))
plot.new()
legend("topleft",legend=c(expression(NEM~pi), "svm", "neural net"),col=c("red", "blue", "darkgreen"),fill=c("red", "blue", "darkgreen"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
plot.new()
legend("topleft",legend=c("random forest", "missForest", "knn"),col=c("brown", "orange", "turquoise"),fill=c("brown", "orange", "turquoise"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
plot.new()
legend("topleft",legend=c("random"),col=c("grey"),fill=c("grey"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
dev.off()
cbg-ethz/nempi documentation built on Nov. 5, 2024, 11:27 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cbg-ethz/nempi
Inferring unobserved perturbations from gene expression data

other/nempi_loo.r
In cbg-ethz/nempi: Inferring unobserved perturbations from gene expression data

R Package Documentation

Browse R Packages

We want your feedback!

cbg-ethz/nempi Inferring unobserved perturbations from gene expression data

other/nempi_loo.r In cbg-ethz/nempi: Inferring unobserved perturbations from gene expression data

R Package Documentation

Browse R Packages

We want your feedback!

cbg-ethz/nempi
Inferring unobserved perturbations from gene expression data

other/nempi_loo.r
In cbg-ethz/nempi: Inferring unobserved perturbations from gene expression data