spiapcc-demo: Demo code for spiapcc package

Documented in run

#' @title Main function
#' @description Main function
#' @details Place holder
#' @param datasetNameFile File with names of datasets for processing, one item per line, e.g. GSE1145
#' @param methodNameFile File with names of methods for each dataset, one item per line, e.g. spiapcc
#' @param alpha Alpha value for any method that considers alpha
#' @param perm Number of permutations for any method that does bootstrap
#' @return Returns nothing. Writes all results to files under working directory
#' @export
run <- function(datasetNameFile=NULL,methodNameFile=NULL,alpha=0.05, perm=2000){
	if(is.null(datasetNameFile)){
		stop("ERROR in run: data not specified.")
	}
	datasets <- .readFileAsList(datasetNameFile)
	
	if(is.null(methodNameFile)){
		stop("ERROR in run: method not specified.")
	}
	methods <- .readFileAsList(methodNameFile)

	# prepare pathway gene list and network files (mostly for EB)	
	pwy.files <- list.files(file.path(system.file("extdata",package="spiapcc.demo"),"keggxml"),pattern="*.xml",full.names=TRUE)
	pwys <- sapply(pwy.files, parseKGML)
	
	# Push gs,grn into global env.
	# Not a good practise
	# Necessary to bypass memory issue since R doesn't pass by pointer
	.GlobalEnv$gs <- .extractKeggGS(pwys)
	.GlobalEnv$grn <- .compileGRNFromKEGG(pwys)

	require(KEGGdzPathwaysGEO)
	require(KEGGandMetacoreDzPathwaysGEO)

	# loop over data sets
	for(i in 1:length(datasets)){
		# load dataset
		data(list=datasets[[i]])
		.GlobalEnv$dataset <- get(datasets[[i]])
		
		# name of dataset, e.g. "GSE1145"
		name <- dataset@experimentData@name

		cat("\n======================================\n")
		cat(paste("Starting dataset:", name, sep=" "))
		cat("\n======================================\n")
		
		# create folder
		if(!dir.exists(name)){
			dir.create(name)
		}

		# get local working directory path
		wd <- paste(getwd(),"/",name,sep="")
		
		# target pathway, e.g. "05410"
		target <- dataset@experimentData@other$targetGeneSets
		
		sum <- list(name=name,target=target)
	
		.GlobalEnv$EBSE <- .preprocessEB(dataset=dataset)

		# serial routine
		for(j in 1:length(methods)){
			method=methods[[j]]
			cat("\n======================================\n")
			cat(paste("Starting method:", method, sep=" "))
			cat("\n======================================\n")
 			res <- NULL
       			if(method=="spiapcc"){
        			res <- .call_spiapcc(alpha=alpha, perm=perm, wd=wd)
        		}
        		else if(method=="spia"){
                		res <- .call_spia(alpha=alpha,perm=perm,wd=wd)
        		}
        		else if(method=="gwspia"){
                		res <- .call_gwspia(alpha=alpha,perm=perm,wd=wd)
        		}
				else if(method=="fopa"){
						res <- .call_fopa(alpha=alpha,perm=perm,wd=wd)
				}
        		else{
                		res <- .call_EB(method=method, alpha=alpha, perm=perm, wd=wd)
        		}
				cat("\n======================================\n")
			cat(paste("Finished method:", method, sep=" "))
			cat("\n======================================\n")
			sum <- c(sum,res)
		}
		# combine result and write to file
		sum <- data.frame(sum)
		write.csv(sum, paste(wd,"/",name,".summary.csv",sep=""))

		cat("\n======================================\n")
		cat(paste("Finished dataset:",name,sep=" "))
		cat("\n======================================\n")

		if(exists(datasets[[i]])){
			rm(list=datasets[[i]],envir=.GlobalEnv)
		}

		if(exists("dataset")){
			rm(list="dataset",envir=.GlobalEnv)
		}

		if(exists("EBSE")){
			rm(list="EBSE",envir=.GlobalEnv)
		}
	}
	
	if(exists("gs")){
		rm(list="gs",envir=.GlobalEnv)
	}

	if(exists("grn")){
		rm(list="grn",envir=.GlobalEnv)
	}

	results <- .combineSummaryFiles()
	write.csv(results,paste(getwd(),"Summary.csv",sep="/"))
	return(results)
}

.call_spiapcc <- function(alpha=0.05, perm=2000, wd=NULL){
	dataset <- get("dataset")
	
	# name of dataset, e.g. "GSE1145"
	name <- dataset@experimentData@name

	# target pathway, e.g. "05410"
	target <- dataset@experimentData@other$targetGeneSets
	
	# make required SPIA data file if one doesn't exist
	if(!file.exists(system.file("extdata/hsaSPIA.RData",package="spiapcc"))){
		SPIA::makeSPIAdata(kgml.path=system.file("extdata/keggxml",package="spiapcc.demo"),organism="hsa",out.path=system.file("extdata",package="spiapcc"))
	}

	# calling preprocess to generated needed parameters
	pp <- spiapcc::preprocess(dataset=dataset, alpha=alpha, plots=TRUE, plots.dir=wd)

	res_tn <- spiapcc::spiapcc(de=pp$de, all=pp$all, gse_madat2=pp$gse_madat2, normal=pp$normal, tumor=pp$tumor, organism="hsa", data.dir=paste(system.file("extdata",package="spiapcc"),"/",sep=""), pathids=NULL, nB=perm, plots=FALSE, verbose=TRUE, beta=NULL, combine="fisher",flag="tn")
	res_nt <- spiapcc::spiapcc(de=pp$de, all=pp$all, gse_madat2=pp$gse_madat2, normal=pp$normal, tumor=pp$tumor, organism="hsa", data.dir=paste(system.file("extdata",package="spiapcc"),"/",sep=""), pathids=NULL, nB=perm, plots=FALSE, verbose=TRUE, beta=NULL, combine="fisher",flag="nt")
	res_abs <- spiapcc::spiapcc(de=pp$de, all=pp$all, gse_madat2=pp$gse_madat2, normal=pp$normal, tumor=pp$tumor, organism="hsa", data.dir=paste(system.file("extdata",package="spiapcc"),"/",sep=""), pathids=NULL, nB=perm, plots=FALSE, verbose=TRUE, beta=NULL, combine="fisher",flag="abs")

	# spit out results in .csv format to directory specified with wd	
	write.csv(res_tn, file=paste(wd,"/",name,".spiapcc.tn.csv",sep=""))
	write.csv(res_nt, file=paste(wd,"/",name,".spiapcc.nt.csv",sep=""))
	write.csv(res_abs, file=paste(wd,"/",name,".spiapcc.abs.csv",sep=""))
	
	# get summary
	sum_tn <- .summarize(res_tn, target, "TN")
	sum_nt <- .summarize(res_nt, target, "NT")
	sum_abs <- .summarize(res_abs, target, "ABS")


	#return(sum_tn)
	return(c(sum_tn, sum_nt, sum_abs))
}

.call_spia <- function(alpha=0.05, perm=2000, wd=NULL){
	dataset <- get("dataset")

	# name of dataset, e.g. "GSE1145"
	name <- dataset@experimentData@name

	# target pathway, e.g. "05410"
	target <- dataset@experimentData@other$targetGeneSets
	
	# make required SPIA data file if one doesn't exist
	if(!file.exists(system.file("extdata/hsaSPIA.RData",package="SPIA"))){
		SPIA::makeSPIAdata(kgml.path=system.file("extdata/keggxml",package="spiapcc.demo"),organism="hsa",out.path=system.file("extdata",package="SPIA"))
	}

	# preprocess data to generated needed parameters
	se <- EnrichmentBrowser::probe2gene(dataset)
	se <- EnrichmentBrowser::normalize(se)
	colData(se)$GROUP <- ifelse(colData(se)$Group == "d",1,0)
	se <- EnrichmentBrowser::deAna(se, padj.method="BH")
	
	all_de <- rowData(se, use.names=TRUE)
	tg <- all_de[all_de$ADJ.PVAL < alpha,]
	tg <- data.frame(tg)
	de <- tg$FC
	names(de) <- as.vector(rownames(tg))
	all <- rownames(all_de)

	# call gwspia
	res <- SPIA::spia(de=de, all=all, organism="hsa", data.dir=paste(system.file("extdata",package="SPIA"),"/",sep=""), pathids=NULL, nB=perm, plots=FALSE, verbose=TRUE, beta=NULL, combine="fisher")
	
	colnames(res)[which(names(res)=="pG")] <- "PVAL"
	colnames(res)[which(names(res)=="pGFdr")] <- "FDR.BH"
	res$FDR.BY <- p.adjust(res$PVAL,"BY")

	# spit out results in .csv format to directory specified with wd	
	write.csv(res, file=paste(wd,"/",name,".spia.csv",sep=""))
	

	# get summary
	sum <- .summarize(res, target, "spia")
	return(sum)
}

.call_gwspia <- function(alpha=0.05, perm=2000,  wd=NULL){
	dataset <- get("dataset")

	# name of dataset, e.g. "GSE1145"
	name <- dataset@experimentData@name

	# target pathway, e.g. "05410"
	target <- dataset@experimentData@other$targetGeneSets
	
	# make required SPIA data file if one doesn't exist
	if(!file.exists(system.file("extdata/hsaSPIA.RData",package="GWSPIA"))){
		SPIA::makeSPIAdata(kgml.path=system.file("extdata/keggxml",package="spiapcc.demo"),organism="hsa",out.path=system.file("extdata",package="GWSPIA"))
	}

	# preprocess data to generated needed parameters
	kgml.path=system.file("extdata/keggxml",package="GWSPIA")
	mdir=kgml.path
	paths <- dir(mdir,pattern=".xml")

	se <- EnrichmentBrowser::probe2gene(dataset)
	se <- EnrichmentBrowser::normalize(se)
	colData(se)$GROUP <- ifelse(colData(se)$Group == "d",1,0)
	se <- EnrichmentBrowser::deAna(se, padj.method="BH")
	
	all_de <- rowData(se, use.names=TRUE)
	tg <- all_de[all_de$ADJ.PVAL < alpha,]
	tg <- data.frame(tg)
	de <- tg$FC
	names(de) <- as.vector(rownames(tg))
	all <- rownames(all_de)

	allt <- all_de$limma.STAT
	names(allt) <- rownames(all_de)

	BC1 <- GWSPIA::BC(mdir,paths)
	SP1 <- GWSPIA::SP(mdir,paths)
	IF1 <- GWSPIA::IF(mdir,paths,de,all)
	wi1 <- GWSPIA::wi(SP1,IF1)

	# call gwspia
	res <- GWSPIA::gwspia(de=de, all=all, allt=allt, betweennesslist=BC1, DEdegree=wi1, organism="hsa", data.dir=paste(system.file("extdata",package="GWSPIA"),"/",sep=""), pathids=NULL, nB=perm, plots=FALSE, verbose=TRUE, beta=NULL, combine="fisher")
	
	colnames(res)[which(names(res)=="pG")] <- "PVAL"
	colnames(res)[which(names(res)=="pGFdr")] <- "FDR.BH"
	res$FDR.BY <- p.adjust(res$PVAL,"BY")

	# spit out results in .csv format to directory specified with wd	
	write.csv(res, file=paste(wd,"/",name,".gwspia.csv",sep=""))

	# get summary
	sum <- .summarize(res, target, "gwspia")

	return(sum)
}




.call_EB <- function(method=NULL, alpha=0.05, perm=2000, wd=NULL){
	dataset <- get("dataset")
	EBSE <- get("EBSE")
	gs <- get("gs")
	grn <- get("grn")

	# name of dataset, e.g. "GSE1145"
	name <- dataset@experimentData@name

	# target pathway, e.g. "05410"
	target <- dataset@experimentData@other$targetGeneSets
	
	sum <- tryCatch({		
		res <- NULL
		if(method %in% nbeaMethods()){
			if(is.null(grn)){
				stop("Invalid argument to EB: grn is NULL")
			}
			res <- nbea(method=method, se=EBSE, gs=gs, grn=grn, alpha=alpha, perm=perm)$res.tbl
		}else if(method %in% sbeaMethods()){
			res <- sbea(method=method, se=EBSE, gs=gs, alpha=alpha, perm=perm)$res.tbl
		}else{
			stop(paste("Method",method,"in EB not found.",sep=" "))
		}

		res$FDR.BH <- p.adjust(res$PVAL,"BH")
		res$FDR.BY <- p.adjust(res$PVAL,"BY")
		res <- data.frame(res)

		write.csv(res, file=paste(wd,"/",name,".",method,".csv",sep=""))
			
		sum <- .summarize(res, target, method)
	}, error = function(e){
		write.csv(toString(e), file=paste(wd,"/",name,".",method,".csv",sep=""))
		
		sum <- list(R="ERR", N="ERR", P="ERR", FDR.BH="ERR", FDR.BY="ERR")
		names(sum) <- c(paste(method,"R",sep="."),paste(method,"N",sep="."),paste(method,"P",sep="."),paste(method,"FDR.BH",sep="."),paste(method,"FDR.BY",sep="."))

		return(sum)
	})
	
	return(sum)
}

.call_fopa <- function(alpha=0.05, perm=2000, wd=NULL){
	dataset <- get("dataset")

	# name of dataset, e.g. "GSE1145"
	name <- dataset@experimentData@name

	# target pathway, e.g. "05410"
	target <- dataset@experimentData@other$targetGeneSets

	# Change wd to launch FoPA in its root dir
	old_wd <- getwd()
	setwd("~/github/FoPA")

	# Generate DEG and gene list via FoPA's included R script
	DEG <- paste("./data/input_genes",paste(name,"DEG.txt",sep="_"),sep="/")
	ALL <- paste("./data/input_genes",paste(name,"ALL.txt",sep="_"),sep="/")
	cmd <- paste("./data/input_genes/generate_input_files.R",name,alpha,DEG,ALL,sep=" ")
	cat(paste("DEBUG: executing",cmd," \n",sep=" "))
	# Execute via shell
	system(cmd)

	# Calling FoPA via shell
	DEG <- paste(name,"DEG.txt",sep="_")
	ALL <- paste(name,"ALL.txt",sep="_")
	OUT <- paste(wd,"/",name,".FoPA_raw.txt",sep="")
	cmd <- paste("python","~/github/FoPA/src/FoPA.py","-n",DEG,ALL,OUT)
	cat(paste("DEBUG: executing",cmd," \n",sep=" "))
	system(cmd)

	# restore wd
	setwd(old_wd)

	# Read in FoPA result as table since it's not structured
	res <- read.table(OUT,header=FALSE,sep="\t",quote="")
	# Add column names
	colnames(res) <- c("ID","Pathway","Score","PVAL")
	# From table to dataframe
	res <- data.frame(res)
	# prepend '0' to ID since it's only 4 digits
	res$ID <- paste("0",res$ID,sep="")
	# Append FDR stats
	res$FDR.BH <- p.adjust(res$PVAL,"BH")
	res$FDR.BY <- p.adjust(res$PVAL,"BY")

	# spit out results in .csv format to directory specified with wd	
	write.csv(res, file=paste(wd,"/",name,".FoPA.csv",sep=""))

	# get summary
	sum <- .summarize(res, target, "fopa")

	return(sum)
}