other/perturbseq.r

data <- as.character(commandArgs(TRUE)[1])
set <- as.numeric(commandArgs(TRUE)[2])
path <- as.character(commandArgs(TRUE)[3])
cores <- as.numeric(commandArgs(TRUE)[4])
subset <- as.numeric(commandArgs(TRUE)[5])

## path <- "~/Mount/Eulershare/perturbseq2/"

print(data)
print(set)
print(path)
print(cores)

library(Rcpp)
library(naturalsort)

source("mnems.r")
source("mnems_low.r")
sourceCpp(code=readChar("mm.cpp", file.info("mm.cpp")$size))
devtools::load_all("CausalPathways/dce/")

## source("~/Documents/mnem/R/mnems.r"); source("~/Documents/mnem/R/mnems_low.r"); sourceCpp("~/Documents/mnem/src/mm.cpp"); library(Rgraphviz);

if (data == "csv") {
    M <- readRDS(paste0(path, "Counts.rds"))
    for (i in unique(colnames(M))) {
        write.csv(M[,which(colnames(M)==i)],file=paste0(path, "Counts_", i, ".csv"))
    }
} else if (data == "mnem") {
    D <- mnem::createApp(sets = set, types = "data", n = 100)
    saveRDS(D, file = paste0("Perturbseq_data_", set, ".rds"))
} else if (data == "perturbseq2") {
    M <- list()
    exp.id <- list(e1 = c("75","01"), e2 = c("77","05"), e3 = c("81","10"))
    for (i in 1:3) {
        id1 <- exp.id[[i]][1]
        id2 <- exp.id[[i]][2]
        M[[i]] <- Matrix::readMM(paste0(path, "GSM24066", id1, "_10X0", id2, "_matrix.mtx.txt"))
        gc()
        M[[i]] <- as(M[[i]], "matrix")
        gc()
        genes <- read.table(paste0(path, "GSM24066", id1, "_10X0", id2, "_genes.tsv"))
        cells <- read.csv(paste0(path, "GSM24066", id1, "_10X0", id2, "_cell_identities.csv"))
        barcodes <- read.table(paste0(path, "GSM24066", id1, "_10X0", id2, "_barcodes.tsv"))
        rownames(M[[i]]) <- genes[,2]
        colnames(M[[i]]) <- barcodes[,1]
        cells <- cells[which(cells$good.coverage == TRUE | cells$good.coverage == "True"),]
        M[[i]] <- M[[i]][,which(colnames(M[[i]]) %in% cells$cell.BC)]
        colnames(M[[i]]) <- cells$guide.identity[match(colnames(M[[i]]), cells$cell.BC)]
        if (i == 1) {
            colnames(M[[i]]) <- gsub("_pD.*|_pM.*|_pB.*|_only.*", "", colnames(M[[i]]))
            colnames(M[[i]]) <- gsub("\\*.*", "Unknwn", colnames(M[[i]]))
            colnames(M[[i]])[grep("mod",colnames(M[[i]]))] <- "Ctrl_Pilot"
        } else if (i == 2) {
            treatments <- gsub(".*-","",barcodes[,1])
            treatments <- treatments[which(barcodes[,1] %in% cells$cell.BC)]
            colnames(M[[i]]) <- gsub("_pD.*|_pM.*|_pB.*|_only.*", "", colnames(M[[i]]))
            colnames(M[[i]]) <- gsub("\\*.*", "Unknwn", colnames(M[[i]]))
            colnames(M[[i]]) <- gsub("3x_neg_ctrl","Ctrl_Epi",colnames(M[[i]]))
            colnames(M[[i]]) <- paste0(colnames(M[[i]]), "_", treatments)
        } else if (i == 3) {
            treatments <- gsub(".*-","",barcodes[,1])
            treatments <- treatments[which(barcodes[,1] %in% cells$cell.BC)]
            colnames(M[[i]]) <- gsub("_pD.*|_pM.*|_pB.*|_only.*", "", colnames(M[[i]]))
            colnames(M[[i]]) <- gsub("\\*.*", "Unknwn", colnames(M[[i]]))
            colnames(M[[i]])[grep("mod",colnames(M[[i]]))] <- "Ctrl_Main"
            colnames(M[[i]]) <- paste0(colnames(M[[i]]), "_", treatments)
        }
        gc()
    }
    saveRDS(M[[1]], file = paste0(path, "Counts_Pilot.rds"))
    saveRDS(M[[2]], file = paste0(path, "Counts_Epistasis.rds"))
    saveRDS(M[[3]], file = paste0(path, "Counts_Main.rds"))
    genes <- rownames(M[[1]])
    for (i in 2:3) {
        genes <- intersect(genes, rownames(M[[i]]))
    }
    for (i in 1:3) {
        M[[i]] <- M[[i]][genes,]
    }
    M <- do.call("cbind", M)
    gc()
    saveRDS(M, file = paste0(path, "Counts.rds"))
} else if (data == "edger") {
    print("start")
    for (i in subset) {
        print(i)
        if (i == 4) {
            M <- readRDS(paste0(path, "Counts_Main.rds"))
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
            treatments <- gsub(".*_","",colnames(M))
            colnames(M) <- gsub("_.*","",colnames(M))
            colnames(M) <- gsub("Ctrl","0_Ctrl",colnames(M))
            if (set != 2) {
                design <- model.matrix(~colnames(M)+factor(treatments))
                colnames(design) <- gsub("colnames\\(M\\)", "", colnames(design))
                genes <- unique(colnames(M))
                genes <- genes[-grep("Ctrl",genes)]
                library(edgeR)
                for (j in genes) {
                    print(j)
                    cells <- grep(paste0("Ctrl|",j),colnames(M))
                    Mtmp <- M[,cells]
                    designTmp <- design[cells,]
                    designTmp <- designTmp[,which(apply(designTmp,2,sum)>0)]
                    y <- DGEList(counts=Mtmp)
                    y <- calcNormFactors(y)
                    y <- estimateDisp(y,design=designTmp)
                    fit <- glmQLFit(y,design=designTmp)
                    P <- matrix(NA, nrow(Mtmp), 1)
                    colnames(P) <- j
                    rownames(P) <- rownames(M)
                    F <- P
                    qlf <- glmQLFTest(fit,coef=2)
                    P[,1] <- qlf$table$PValue
                    F[,1] <- qlf$table$logFC
                    saveRDS(P, file = paste0(path, "P_edgeR_", i, "_", j, ".rds"))
                    saveRDS(F, file = paste0(path, "F_edgeR_", i, "_", j, ".rds"))
                }
            }
        } else if (i %in% 1:3) {
            M <- readRDS(paste0(path, "Counts_Epistasis.rds"))
            M <- M[,grep(paste0("_",i),colnames(M))]
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
            colnames(M) <- gsub("Ctrl","0_Ctrl",colnames(M))
            design <- model.matrix(~colnames(M))
            colnames(design) <- gsub("colnames\\(M\\)", "", colnames(design))
        } else if (i == 5) {
            M <- readRDS(paste0(path, "Counts_Pilot.rds"))
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
            colnames(M) <- gsub("Ctrl","0_Ctrl",colnames(M))
            design <- model.matrix(~colnames(M))
            colnames(design) <- gsub("colnames\\(M\\)", "", colnames(design))
        }
        gc()
        if (set == 1 & i %in% c(1:3,5)) {
            print("bulk")
            library(edgeR)
            y <- DGEList(counts=M)
            y <- calcNormFactors(y)
            y <- estimateDisp(y,design=design)
            fit <- glmQLFit(y,design=design)
            P <- matrix(NA, nrow(M), ncol(design)-1)
            colnames(P) <- colnames(design)[-1]
            rownames(P) <- rownames(M)
            F <- P
            for (j in 1:ncol(P)) {
                print(colnames(P)[j])
                qlf <- glmQLFTest(fit,coef=j+1)
                P[,j] <- qlf$table$PValue
                F[,j] <- qlf$table$logFC
            }
            saveRDS(P, file = paste0(path, "P_edgeR_", i, ".rds"))
            saveRDS(F, file = paste0(path, "F_edgeR_", i, ".rds"))
        } else if (set == 2 & i %in% c(1:5)) {
            print("single cell")
            colnames(M) <- gsub("_Epi.*|_Pilot.*|_Main.*","",colnames(M))
            colnames(M)[which(!(colnames(M) == "0_Ctrl"))] <- paste0(colnames(M)[which(!(colnames(M) == "0_Ctrl"))], "_", 1:sum(!(colnames(M) == "0_Ctrl")))
            design <- model.matrix(~colnames(M))
            colnames(design) <- gsub("colnames\\(M\\)", "", colnames(design))
            library(edgeR)
            doCell <- function(x) {
                designTmp <- design[,c(1,x+1)]
                idx <- c(which(apply(design,1,sum)==1),which(designTmp[,-1]==1))
                Mtmp <- M[,idx]
                designTmp <- designTmp[idx,]
                y <- DGEList(counts=Mtmp)
                y <- calcNormFactors(y)
                y <- estimateDisp(y,design=designTmp)
                fit <- glmFit(y, design=designTmp)
                qlf <- glmLRT(fit,coef=2)
                P <- qlf$table$PValue
                F <- qlf$table$logFC
                return(cbind(P,F))
            }
            if (cores > 1) {
                library(snowfall)
                sfInit(parallel = TRUE, cpus = cores)
                sfExport("design","M")
                sfLibrary(edgeR)
                tmp <- sfLapply(1:(ncol(design)-1),doCell)
                sfStop()
            } else {
                tmp <- lapply(1:(ncol(design)-1),doCell)
            }
            PF <- do.call("cbind",tmp)
            F <- PF[,grep("F",colnames(PF))]
            P <- PF[,grep("P",colnames(PF))]
            colnames(P) <- colnames(F) <- colnames(design)[-1]
            saveRDS(P, file = paste0(path, "P_edgeR_", i, "_sc.rds"))
            saveRDS(F, file = paste0(path, "F_edgeR_", i, "_sc.rds"))
        }
    }
} else if (data == "pca") {
    if (subset %in% c(1,2,3)) {
        M <- readRDS(paste0(path, "Counts_Epistasis.rds"))
        M <- M[,grep(paste0("_",i),colnames(M))]
    } else if (subset == 4) {
        M <- readRDS(paste0(path, "Counts_Main.rds"))
    } else if (subset == 5) {
        M <- readRDS(paste0(path, "Counts_Pilot.rds"))
    }
    genes.med <- apply(M, 1, median)
    M <- M[which(genes.med > 0),]
    pca <- prcomp(t(log(M+1)))
    col <- unique(colnames(M))
    col <- match(colnames(M),col)
    varpct <- pca$sdev^2
    varpct <- round(varpct/sum(varpct),2)
    comps <- c(1,2)
    pdf(paste0(path, "pca_", subset, ".pdf"), width = 20, height = 20)
    par(mfrow=c(1,2))
    plot(pca$x[, comps], col = col, xlab = varpct[comps[1]], ylab = varpct[comps[2]], main = "log(x+1) transformed raw data")
    plot(sort(unique(col)),col=sort(unique(col)))
    legend("bottomright",legend=unique(colnames(M)),fill=sort(unique(col)),col=sort(unique(col)))
    dev.off()
} else if (data == "lods") {
    print("start")
    for (i in subset) {
        print(i)
        if (i == 4) {
            M <- readRDS(paste0(path, "Counts_Main.rds"))
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
            colnames(M) <- gsub("_.*","",colnames(M))
        } else if (i %in% 1:3) {
            M <- readRDS(paste0(path, "Counts_Epistasis.rds"))
            M <- M[,grep(paste0("_",i),colnames(M))]
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
        } else if (i == 5) {
            M <- readRDS(paste0(path, "Counts_Pilot.rds"))
            genes.med <- apply(M, 1, median)
            M <- M[which(genes.med > 0),]
        }
        M <- Linnorm::Linnorm(M,DataImputation=FALSE)
        print("log odds")
        colnames(M) <- gsub("_Epi.*|_Pilot.*|_Main.*","",colnames(M))
        ctrls <- which(colnames(M)=="Ctrl")
        doGene <- function(x,ctrls=ctrls) {
            ctrlcde <- ks::kcde(M[x,ctrls])
            kdidx <- which(!(colnames(M) == "Ctrl"))
            Z0 <- length(ctrlcde$eval.points)
            L <- numeric(length(kdidx))
            for (i in 1:length(kdidx)) {
                kd <- colnames(M)[kdidx[i]]
                kds <- which(colnames(M)==kd)
                if (all(M[x,kds]==0)) {
                    L[i] <- 0
                    next()
                }
                kdcde <- ks::kcde(M[x,kds])
                Z1 <- length(kdcde$eval.points)
                pkd <- min(sum(kdcde$eval.points >= M[x,i])/Z1,sum(kdcde$eval.points <= M[x,i])/Z1)
                pctrl <- min(sum(ctrlcde$eval.points >= M[x,i])/Z0,sum(ctrlcde$eval.points <= M[x,i])/Z0)
                if (pkd == 0) {
                    pkd <- min(0.01,pctrl)
                }
                if (pctrl == 0) {
                    pctrl <- min(0.01,pkd)
                }
                if (pctrl == 0 & pkd == 0) {
                    L[i] <- 0
                } else {
                    L[i] <- log(pkd/pctrl)
                }
            }
            print(x)
            return(L)
        }
        if (cores > 1) {
            library(snowfall)
            sfInit(parallel = TRUE, cpus = cores)
            sfExport("ctrls","M")
            sfLibrary(ks)
            tmp <- sfLapply(1:nrow(M),doGene,ctrls)
            sfStop()
        } else {
            tmp <- lapply(1:nrow(M),doGene,ctrls)
        }
        L <- do.call("rbind",tmp)
        colnames(L) <- colnames(M)[which(!(colnames(M) == "Ctrl"))]
        rownames(L) <- rownames(M)
        saveRDS(L, file = paste0(path, "L_linnorm_", i, "_sc.rds"))
    }
}
stop("success")

## hpc commands:

system("scp ~/Documents/mnem/R/* euler.ethz.ch:")
system("scp ~/Documents/testing/nest/nest_app.r euler.ethz.ch:")

rm error.txt
rm output.txt

ram=16000 # data: 64gb, wilcox sc: 8gb for 32 nodes, edger: 64gb, voom: 64gb
queue=24
cores=8

data="lods"
set=2
subset=5
path=/cluster/work/bewi/members/mpirkl/perturbseq2/

bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "R/bin/R --vanilla --silent --no-save --args '${data}' '${set}' '${path}' '${cores}' '${subset}' < nest_app.r"

## temp:

rm error.txt
rm output.txt

queue=24
ram=64000
cores=4

bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "R/bin/Rscript -e 'library(DawnRank); load(\"DawnRank_temp.rda\"); dawnRankScore <- DawnRank(adjMatrix = pathway2, mutationMatrix = M, expressionMatrix = normalizedDawn, mu = 3, parallel = ${cores}); saveRDS(dawnRankScore,file=\"nempi_dawnrank2.rds\")'"

## escape quotes like \"

rm error.txt
rm output.txt

queue=24
ram=4000
cores=1

Rcmd=/cluster/work/bewi/members/mpirkl/miniconda3/bin/Rscript
## Rcmd=~/R/bin/Rscript

variable=latent
values=5
reps=1

bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${Rcmd} main.R --variable ${variable} --values ${values} --append FALSE --replicates ${reps} --link identity --output test.csv"

for i in {1..100}
do
bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${Rcmd} main.R --variable ${variable} --values ${values} --append TRUE --replicates ${reps} --link identity --output test.csv"
done


bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${Rcmd} main.R --variable node.num --values 100 --methods \"dce,dce.log,dce.tpm,dce.tpmlog\""

for i in {1..99}
do
bsub -M ${ram} -q normal.${queue}h -n ${cores} -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${Rcmd} main.R --variable node.num --values 100 --methods \"dce,dce.log,dce.tpm,dce.tpmlog\""
done

## results:

## load functions:

library(Rcpp)
library(naturalsort)
source("~/Documents/mnem/R/mnems.r"); source("~/Documents/mnem/R/mnems_low.r"); sourceCpp("~/Documents/mnem/src/mm.cpp"); library(Rgraphviz);

##

for (i in 1:5) {
    L <- readRDS(paste0("~/Mount/Eulershare/perturbseq2/L_linnorm_", i, "_sc.rds"))
    print(dim(L))
}

## edger results:

P <- readRDS("~/Mount/Eulershare/perturbseq2/P_edgeR.rds")
F <- readRDS("~/Mount/Eulershare/perturbseq2/F_edgeR.rds")

##

P <- V$p.value[,-1]
F <- V$coefficients[,-1]

##

discFDR <- function(P) {
    Q <- P*0
    Q[1:length(Q)] <- p.adjust(P)
    E <- P*0
    E[which(Q < 0.1)] <- 1
    return(E)
}

llrFc <- function(F) {
    E <- abs(F)-mean(abs(F))
    return(E)
}

##

D <- E
colnames(D) <- gsub("_[1-3]","",colnames(D))
D <- D[,-grep("_",colnames(D))]
D <- D[,which(!(colnames(D)=="Unknwn"))]
colnames(D)[which(colnames(D) == "Gal4-4(mod)")] <- "GAL4"
Sgenes <- getSgenes(D)

res <- mnem(D,search="estimate",k=1,complete=TRUE,starts=10)
res$ll
pdf("temp.pdf", width = 8, height = 7)
plot(res, cells = 0, bestCell = 0, egenes = 0, showdata = TRUE)
dev.off()

##

phi <- res$comp[[1]]$phi*0

for (i in 1:1000) {
    Ds <- D[sample(1:nrow(D), nrow(D), replace = TRUE),]
    ress <- nemEst(Ds,method=method)
    phi <- phi + mnem::transClose(ress$phi)
    ## cat(paste0(i,"."))
}

phi2 <- phi/1010
phi2[which(phi2 < 0.99)] <- 0

pdf("temp.pdf", width = 20, height = 40)
mnem::plotDnf(transitive.reduction(phi2))
dev.off()

## bum model:

hist(P,freq=FALSE)
lines(density(punif(seq(0,1,length.out=1000))),col=2)
lines(density(pbeta(seq(0,1,length.out=1000),10,1)),col=2)

lods <- as.vector(P)
names(lods) <- rownames(P)
bum <- BioNet::bumOptim(lods)
dbeta <- dbeta(lods, bum$a, 1)
lods <- log(dbeta)
lods <- matrix(lods, nrow = nrow(P))
rownames(lods) <- rownames(P)
colnames(lods) <- colnames(P)

lods <- dbeta(P, 0.1, 1)
E <- lods
method <- "llr"

## gene length:

counts.csv <- read.csv("~/Mount/Eulershare/perturbseq2/Counts_COPB1.csv")

library("biomaRt")
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
gene.info <- getBM(filters = "hgnc_symbol",
                     attributes = c("entrezgene_id", "hgnc_symbol","transcript_length"),
                     values = counts.csv[,1], mart = mart)

D <- as(tmp[,-1],"matrix")
rownames(D) <- tmp[,1]
rownames(D) <- gsub("-.*","",gsub("\\..*","",rownames(D)))
D <- rowsum(D, rownames(D))

gene.info <- getBM(filters = "hgnc_symbol",
                     attributes = c("entrezgene_id", "hgnc_symbol","transcript_length"),
                     values = rownames(D), mart = mart)

## logical model:

t <- 2
P <- readRDS(paste0("~/Mount/Eulershare/perturbseq2/P_edgeR_",t,"_.rds"))
F <- readRDS(paste0("~/Mount/Eulershare/perturbseq2/F_edgeR_",t,"_.rds"))
rownames(P) <- rownames(F) <- 1:nrow(F)

source("~/Documents/testing/booltest/booltest.r")

E <- discFDR(P)
method <- "disc"

D <- E
colnames(D) <- gsub("_[1-3]","",colnames(D))
D <- D[,-grep("_",colnames(D))]
D <- D[,which(!(colnames(D)=="Unknwn"))]
colnames(D)[which(colnames(D) == "Gal4-4(mod)")] <- "GAL4"
Sgenes <- getSgenes(D)

res <- mnem(D,search="exhaustive",k=1,complete=TRUE,starts=10)
res$ll
pdf("temp.pdf", width = 8, height = 7)
plot(res, cells = 0, bestCell = 0, egenes = 0, showdata = TRUE)
dev.off()

D <- E
colnames(D) <- gsub("_[1-3]","",colnames(D))
D <- D[,which(!(colnames(D)=="Unknwn"))]
if (method == "disc") {
    D <- D[which(apply(D,1,sum) > 0),]
} else {
    D <- D[which(apply(D,1,max) > 0),]
}
Sgenes <- getSgenes(D)

pdf("temp.pdf",width=5,height=40)
epiNEM::HeatmapOP(D,bordercol="transparent",aspect="iso")
dev.off()

lres <- ltest(D,method="exhaustive",search=search)
lres$pvals

for (i in 1:4) {
    print(lres$nemC[[i]]$adj)
    print(lres$nems[[i]]$adj)
}

for (i in 1:4) {
    lres$nemC[[i]]$adj <- transClose(lres$nemC[[i]]$adj)
    lres$nems[[i]]$adj <- transClose(lres$nems[[i]]$adj)
}

for (bs in 1:10) {
    lresbs <- ltest(D[sample(1:nrow(D), nrow(D), replace = TRUE), ],method = method,search=search)
    for (i in 1:4) {
        lres$nemC[[i]]$adj <- lres$nemC[[i]]$adj + transClose(lresbs$nemC[[i]]$adj)
        lres$nems[[i]]$adj <- lres$nems[[i]]$adj + transClose(lresbs$nems[[i]]$adj)
        lres$pvals <- lres$pvals + lresbs$pvals
    }
}

for (i in 1:4) {
    print(lres$nemC[[i]]$adj)
    print(lres$nems[[i]]$adj)
}
cbg-ethz/nempi documentation built on Nov. 9, 2023, 3:46 p.m.