R/~old/R/setupGSEArun.R

Defines functions setupGSEArun

Documented in setupGSEArun

#' Sets up and runs GSEA using population-based FST
#' table containing GSEA results (Geneset, Size, ES, NES, NominalP, FDR, FWER)
#'
#' @param realF (char) path to file with real SNP association statistics
#' @param phenoPerm (logical) runs GSEA using phenotype permutations
#' 		(i.e., randomly re-shuffing case/control labels n times).
#' 		if FALSE, will run GSEA using genotype permutations (see setupGSEArun.R)
#' @param permFST_F (char) path to files with association statistics from
#' 		permuted case/control labels. If phenoPerm is FALSE, leave this blank.
#' @param pathFile (char) path to pathway definitions GMT file
#' @param setPerm (integer) set cycle of permutations to run
#' 		default=1000
#' @param snp2geneF (char) path to file with snp2gene mappings. Output of
#' 		mapSNP2gene() (found in GWAS2Pathway)
#' @param snp2genedist (integer) value for GSEA --distance.
#'		Max. distance between SNP and gene for their association in snp2geneF
#' @param minGene (integer) value for GSEA --setmin.
#' 		Min. number of genes in a gene set to be considered
#' @param maxGene (integer) value for GSEA --setmax.
#' 		Max. number of genes in a gene set to be considered
#' @param setSeed (integer) value for GSEA --seed
#' @param CALC_GSEA (char) path to calculate_gsea.pl
#' @param COMB_GSEA (char) path to combine_gsea.pl
#' @return (char) list object with the GSEA output filenames
#' @export

setupGSEArun <- function(realF, phenoPerm, permFST_F, pathFile, snp2geneF,
												 setPerm=1000L, snp2genedist=500000L,
												 minGene=10L, maxGene=300L, setSeed=42L,
												 CALC_GSEA, COMB_GSEA, outDir) {

	# Setup and run GSEA
	# NOTE (adapted from the GenGen webpage) Re:importance of perm files
	# In the absence of a permutation association results file, the command will
	# run a genotype-based permutation, that is, permuting the FST
	# values between SNPs. This is not recommended for SNP arrays
	# with high marker-marker linkage disequilibrium patterns, as such
	# permutation disrupts the LD structure and may generate biased results.
	# Whenever possible, please use raw genotype data to perform phenotype
	# permutation and re-calculate SNP association statistics.
	phenoPermCom <- paste("perl %s %s %s --mapfile %s --permfile %s",
										    "--setstatfile %s --leout %s --setmin %i --setmax %i",
									   	  "--seed %i --distance %i --log %s/calculate_gsea.log")

	genoPermCom <- paste("perl %s %s %s --mapfile %s --cycle %i",
										   "--setstatfile %s --leout %s --setmin %i --setmax %i",
										   "--seed %i --distance %i --log %s/calculate_gsea.log")

 	statOut <- sprintf("%s/gseaStatFile.txt", outDir)
 	leOut	  <- sprintf("%s/gseaLEout.txt", outDir)
 	resOut  <- sprintf("%s/results.txt", outDir)

	if (phenoPerm == TRUE) {
		permF <- sprintf("%s/marker%s_perm.txt", outDir, snpStat)
		perm_stats <- list.files(path=permFST_F, pattern="[0-9].txt$", full.names=T)

		# Pool all permuted stats
	  cat("Pooling perm .txt\n")
		system(sprintf("cat /dev/null > %s.tmp", permF))
		for (f in perm_stats) {
			print(f)
			system(sprintf("cat %s | cut -f 2 > %s.tmp", f, f)) #get second columns
		}																											#with FST values

		inDir <- dirname(perm_stats[1])
		system(sprintf("paste -d, %s/*.tmp > %s/perm_stats.txt", inDir, outDir))

		# Now write markerFST_perm.txt
		system(sprintf("cat %s | cut -f1 > %s/marker.tmp", realF, outDir))
		system(sprintf("paste %s/marker.tmp %s/perm_stats.txt > %s.tmp",
						 			 outDir, outDir, permF))
		system(sprintf('sed "1s/.*/Marker\tCHI2_PERM/" %s.tmp > %s', permF, permF))

		# Remove all .tmp and redundant files
		unlink(sprintf("%s/*.tmp", permFST_F))
		unlink(sprintf("%s/*.tmp", outDir))
		unlink(sprintf("%s/perm_stats.txt", outDir))

		# Run phenotype-permuted GSEA
		cat("* Running phenotype-based permutations\n\n")
		cmd <- sprintf(phenoPermCom, CALC_GSEA, realF, pathFile, snp2geneF,
									 permF, statOut, leOut, minGene, maxGene,
									 setSeed, snp2genedist, outDir)
		system(cmd)
  } else {
		# Run genotype-permuted GSEA
		cat("* Running genotype-based permutations\n\n")
		cmd <- sprintf(genoPermCom, CALC_GSEA, realF, pathFile, snp2geneF,
									 setPerm, statOut, leOut, minGene, maxGene,
									 setSeed, snp2genedist, outDir)
    system(cmd)
	}

  # Combine GSEA results
	system(sprintf("perl %s %s/calculate_gsea.log > %s/combined_res.log",
		  					 COMB_GSEA, outDir, outDir))

  # Format results for output
	cat("Formatting results for output...")
  dat <- read.delim(sprintf("%s/combined_res.log", outDir),skip=1,h=F,as.is=T)
  dat[,1] <- sub("Geneset=","",	dat[,1])
  dat[,2] <- sub("Size=","",		dat[,2])
  dat[,3] <- sub("ES=","",		dat[,3])
  dat[,4] <- sub("NES=","",		dat[,4])
  dat[,5] <- sub("NominalP=","",	dat[,5])
  dat[,6] <- sub("FDR=","",		dat[,6])
  dat[,7] <- sub("FWER=","",		dat[,7])
  colnames(dat) <- c("Geneset", "Size", "ES", "NES", "NominalP", "FDR", "FWER")
  dat <- dat[order(dat$FDR),]
  write.table(dat, file=resOut, sep="\t", col=T, row=F, quote=F)
	cat(" done.\n")

	# Write output to excel format
	#write.xlsx(dat,file=sprintf("%s/results.xlsx", outDir), col=T, row=F)

	# Tally number of pathways at sig. threshold FDR < 0.05 and FDR < 0.1
	sigPaths1 <- which(dat$FDR <= 0.1)
	sigPaths2 <- which(dat$FDR <= 0.05)
	cat("\n------SIGNIFICANT GSEA RESULTS------\n")
	cat("Number of significantly enriched pathways with FDR <= 0.1 =",
			length(sigPaths1))
	cat("\nNumber of significantly enriched pathways with FDR <= 0.05 =",
			length(sigPaths2))

	# Write table of significant pathways to output directory
	write.table(dat[sigPaths1,], file=sprintf("%s/pathways_FDR0.1.txt",
							outDir), sep="\t", col=T, row=F, quote=F)
	write.table(dat[sigPaths2,], file=sprintf("%s/pathways_FDR0.05.txt",
							outDir), sep="\t", col=T, row=F, quote=F)

  # Format results for Cytoscape enrichment map
	cat("\n\nFormatting output for use in Cytoscape...")
  dat <- dat %>% mutate( Description = Geneset )
  dat <- subset(dat, select=c(Geneset, NominalP, FDR, Description))
  dat <- dat[c("Geneset", "Description", "NominalP", "FDR")]
  write.table(dat, file=sprintf("%s/results_forEM.txt", outDir),
        			sep="\t", col=T, row=F, quote=F)
	cat(" done.\n")

	return(list(statOut, leOut, resOut))

  # Cleanup
  unlink(sprintf("%s/combined_res.log", outDir))

}
BaderLab/POPPATHR documentation built on Dec. 17, 2021, 9:53 a.m.