GIGSEA_tutorial.R
In GIGSEA: Genotype Imputed Gene Set Enrichment Analysis

## -----------------------------------------------------------------------------
library(GIGSEA)

## -----------------------------------------------------------------------------
# runGIGSEA( MetaXcan="software/MetaXcan.py" , 
# model_db_path="eQTL/DGN-WB_0.5.db", 
# covariance="reference/covariance.DGN-WB_0.5.txt.gz", 
# gwas_folder="data/GWAS_summary", gwas_file_pattern="heart.sumstats", 
# output_dir="result/GIGSEA", permutation_num=1000)

## -----------------------------------------------------------------------------
data(heart.metaXcan)
head(heart.metaXcan)

## -----------------------------------------------------------------------------
gene = heart.metaXcan$gene_name
# extract the imputed Z-score of differential gene expression, which follows 
# the normal distribution
fc <- heart.metaXcan$zscore
# use the prediction R^2 and fraction of imputation-used SNPs as weights 
usedFrac <- heart.metaXcan$n_snps_used / heart.metaXcan$n_snps_in_model
r2 <- heart.metaXcan$pred_perf_r2
weights <- usedFrac*r2
# build a new data frame for the following weighted linear regression-based 
# enrichment analysis
data <- data.frame(gene,fc,weights)
head(data)

## -----------------------------------------------------------------------------
# load data
data(MSigDB.KEGG.Pathway)
# MSigDB.KEGG.Pathway is a list comprising two components: net and annot
class(MSigDB.KEGG.Pathway)
names(MSigDB.KEGG.Pathway)
dim(MSigDB.KEGG.Pathway$net)
# the column is the pathway and the row is the gene
head(colnames(MSigDB.KEGG.Pathway$net))
head(rownames(MSigDB.KEGG.Pathway$net))
# the annotation of the pathway
head(MSigDB.KEGG.Pathway$annot)
# the net takes discrete values of 0 or 1
head(MSigDB.KEGG.Pathway$net[,1:30])

## -----------------------------------------------------------------------------
# load data
data(TargetScan.miRNA)
# TargetScan.miRNA is a list comprising two components: net and annot
class(TargetScan.miRNA)
names(TargetScan.miRNA)
dim(TargetScan.miRNA$net)
# the column is the miRNA and the row is the gene
head(colnames(TargetScan.miRNA$net))
head(rownames(TargetScan.miRNA$net))
# the annotation of the miRNA
head(TargetScan.miRNA$annot)
# the net takes continuous values
head(TargetScan.miRNA$net[,1:20])

## -----------------------------------------------------------------------------
# downlaod the gmt file
gmt <- readLines( paste0('http://amp.pharm.mssm.edu/CREEDS/download/',
                        'single_drug_perturbations-v1.0.gmt') )
# obtain the index of up-regulated and down-regulated gene sets
index_up <- grep('-up',gmt)
index_down <- grep('-dn',gmt)
# transform the gmt file into gene sets. The gene set is a data frame, 
# comprising three vectors: term (here is drug), geneset (a gene symbol list 
# separated by comma), and value (1 and -1 separated by comma) 
gff_up <- gmt2geneSet( gmt[index_up] , termCol=c(1,2) , singleValue = 1 )
gff_down <- gmt2geneSet( gmt[index_down] , termCol=c(1,2) , singleValue = -1 )

# as following, combine the up and down-regulated gene sets together, 
# and use value of 1 and -1 to indicate their direction:
# extract the drug names
term_up <- sapply( gff_up$term , function(x) gsub('-up','',x) )
term_down <- sapply( gff_down$term , function(x) gsub('-dn','',x) )
all(term_up==term_down)
# combine up and down-regulated gene names for each drug perturbation
geneset <- sapply( 1:nrow(gff_up) , function(i) 
  paste(gff_up$geneset[i],gff_down$geneset[i],sep=',') )
# use 1 and -1 to indicate direction of up-regulated and down-regulated genes
value <- sapply( 1:nrow(gff_up) , function(i) 
  paste(gff_up$value[i],gff_down$value[i],sep=',') )
# transform the gene set into matrix, where the row represents the gene, 
# the column represents the drug perturbation, and each entry takes values of 
# 1 and -1
net1 <- geneSet2Net( term=term_up , geneset=geneset , value=value )
# transform the gene set into sparse matrix, where the row represents the gene,
# the column represents the drug perturbation, and each entry takes values of 
# 1 and -1
net2 <- geneSet2sparseMatrix( term=term_up , geneset=geneset , value=value )
tail(net1[,1:30])
tail(net2[,1:30])
# the size of sparse matrix is much smaller than the matrix
format( object.size(net1), units = "auto")
format( object.size(net2), units = "auto")

## -----------------------------------------------------------------------------
# take MSigDB.KEGG.Pathway as an example
net <- MSigDB.KEGG.Pathway$net
# intersect the permuted genes with the gene sets of interest
data2 <- orderedIntersect( x = data , by.x = data$gene , by.y = rownames(net) )
net2 <- orderedIntersect( x = net , by.x = rownames(net) , by.y = data$gene  )
all( rownames(net2) == as.character(data2$gene) )
# the SGSEA.res1 uses the weighted simple linear regression model, while 
# SGSEA.res2 used the weighted Pearson correlation. The latter one takes 
# substantially less time. 
system.time( SGSEA.res1 <- permutationSimpleLm( fc=data2$fc , net=net2 , 
                              weights=data2$weights , num=100 ) )
system.time( SGSEA.res2 <- permutationSimpleLmMatrix( fc=data2$fc , 
                              net=net2 , weights=data2$weights , num=100 ) )
head(SGSEA.res2)

## -----------------------------------------------------------------------------
# MGSEA.res1 uses the weighted multiple linear regression model 
system.time( MGSEA.res1 <- permutationMultipleLm( fc=data2$fc , net=net2 , 
                              weights=data2$weights , num=1000 ) )
# MGSEA.res2 used the matrix solution
system.time( MGSEA.res2 <- permutationMultipleLmMatrix( fc=data2$fc , 
                              net=net2 , weights=data2$weights , num=1000 ) )
head(MGSEA.res2)

## -----------------------------------------------------------------------------
# import packages and prepare data as above
library(GIGSEA)

# prepare the dataset
data(heart.metaXcan)
gene = heart.metaXcan$gene_name
fc <- heart.metaXcan$zscore
usedFrac <- heart.metaXcan$n_snps_used / heart.metaXcan$n_snps_in_cov
r2 <- heart.metaXcan$pred_perf_r2
weights <- usedFrac*r2
data <- data.frame(gene,fc,weights)

# run one-step GIGSEA 
#weightedGSEA(data, geneCol='gene', fcCol='fc', weightCol= 'weights',
#geneSet=c("MSigDB.KEGG.Pathway","Fantom5.TF","TargetScan.miRNA","GO"), 
#permutationNum=10000, outputDir="./GIGSEA" )

#dir("./GIGSEA")