goseq.R
In goseq: Gene Ontology analyser for RNA-seq and other length biased data

### R code from vignette source 'goseq.Rnw'

###################################################
### code chunk number 1: load_library
###################################################
library(goseq)


###################################################
### code chunk number 2: set_width
###################################################
options(width=84)


###################################################
### code chunk number 3: read.data (eval = FALSE)
###################################################
## gene.vector=as.integer(assayed.genes%in%de.genes)
## names(gene.vector)=assayed.genes
## head(gene.vector)


###################################################
### code chunk number 4: supported_genomes (eval = FALSE)
###################################################
## supportedOrganisms()


###################################################
### code chunk number 5: getLengthDataFromUCSC (eval = FALSE)
###################################################
## txsByGene=transcriptsBy(txdb,"gene")
## lengthData=median(width(txsByGene))


###################################################
### code chunk number 6: edger_1
###################################################
library(edgeR)
table.summary=read.table(system.file("extdata","Li_sum.txt",package='goseq'),
	                 sep='\t',header=TRUE,stringsAsFactors=FALSE)
counts=table.summary[,-1]
rownames(counts)=table.summary[,1]
grp=factor(rep(c("Control","Treated"),times=c(4,3)))
summarized=DGEList(counts,lib.size=colSums(counts),group=grp)


###################################################
### code chunk number 7: edger_2
###################################################
disp=estimateCommonDisp(summarized)
disp$common.dispersion
tested=exactTest(disp)
topTags(tested)


###################################################
### code chunk number 8: edger_3
###################################################
genes=as.integer(p.adjust(tested$table$PValue[tested$table$logFC!=0],
		method="BH")<.05)
names(genes)=row.names(tested$table[tested$table$logFC!=0,])
table(genes)


###################################################
### code chunk number 9: head_organisms
###################################################
head(supportedOrganisms())


###################################################
### code chunk number 10: head_geneids
###################################################
supportedOrganisms()[supportedOrganisms()$Genome=="hg19",]


###################################################
### code chunk number 11: pwf
###################################################
pwf=nullp(genes,"hg19","ensGene")
head(pwf)


###################################################
### code chunk number 12: GO.wall
###################################################
GO.wall=goseq(pwf,"hg19","ensGene")
head(GO.wall)


###################################################
### code chunk number 13: GO.samp
###################################################
GO.samp=goseq(pwf,"hg19","ensGene",method="Sampling",repcnt=1000)


###################################################
### code chunk number 14: head_samp
###################################################
head(GO.samp)


###################################################
### code chunk number 15: plot_wal_v_samp
###################################################
plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.wall[,1],GO.samp[,1]),2]), 
     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)", 
     xlim=c(-3,0))
abline(0,1,col=3,lty=2)


###################################################
### code chunk number 16: GO.nobias
###################################################
GO.nobias=goseq(pwf,"hg19","ensGene",method="Hypergeometric")
head(GO.nobias)


###################################################
### code chunk number 17: plot_wal_v_hyper
###################################################
plot(log10(GO.wall[,2]), log10(GO.nobias[match(GO.wall[,1],GO.nobias[,1]),2]), 
     xlab="log10(Wallenius p-values)", ylab="log10(Hypergeometric p-values)", 
     xlim=c(-3,0), ylim=c(-3,0))
abline(0,1,col=3,lty=2)


###################################################
### code chunk number 18: GO.limited
###################################################
GO.MF=goseq(pwf,"hg19","ensGene",test.cats=c("GO:MF"))
head(GO.MF)


###################################################
### code chunk number 19: enriched_GO
###################################################
enriched.GO=GO.wall$category[p.adjust(GO.wall$over_represented_pvalue,
            method="BH")<.05]
head(enriched.GO)


###################################################
### code chunk number 20: GO_explained
###################################################
library(GO.db)
for(go in enriched.GO[1:10]){
	print(GOTERM[[go]])
	cat("--------------------------------------\n")
}


###################################################
### code chunk number 21: getlength
###################################################
len=getlength(names(genes),"hg19","ensGene")
length(len)
length(genes)
head(len)


###################################################
### code chunk number 22: getgo
###################################################
go=getgo(names(genes),"hg19","ensGene")
length(go)
length(genes)
head(go)


###################################################
### code chunk number 23: conv_table
###################################################
goseq:::.ID_MAP
goseq:::.ORG_PACKAGES


###################################################
### code chunk number 24: norm_analysis (eval = FALSE)
###################################################
## pwf=nullp(genes,"hg19","ensGene")
## go=goseq(pwf,"hg19","ensGene")


###################################################
### code chunk number 25: verbose_analysis (eval = FALSE)
###################################################
## gene_lengths=getlength(names(genes),"hg19","ensGene")
## pwf=nullp(genes,bias.data=gene_lengths)
## go_map=getgo(names(genes),"hg19","ensGene")
## go=goseq(pwf,"hg19","ensGene",gene2cat=go_map)


###################################################
### code chunk number 26: KEGG_mappings (eval = FALSE)
###################################################
## # Get the mapping from ENSEMBL 2 Entrez
## en2eg=as.list(org.Hs.egENSEMBL2EG)
## # Get the mapping from Entrez 2 KEGG
## eg2kegg=as.list(org.Hs.egPATH)
## # Define a function which gets all unique KEGG IDs 
## # associated with a set of Entrez IDs
## grepKEGG=function(id,mapkeys){unique(unlist(mapkeys[id],use.names=FALSE))}
## # Apply this function to every entry in the mapping from 
## # ENSEMBL 2 Entrez to combine the two maps
## kegg=lapply(en2eg,grepKEGG,eg2kegg)
## head(kegg)


###################################################
### code chunk number 27: KEGG (eval = FALSE)
###################################################
## pwf=nullp(genes,"hg19","ensGene")
## KEGG=goseq(pwf,gene2cat=kegg)
## head(KEGG)


###################################################
### code chunk number 28: KEGG_goseq
###################################################
pwf=nullp(genes,'hg19','ensGene')
KEGG=goseq(pwf,'hg19','ensGene',test.cats="KEGG")
head(KEGG)


###################################################
### code chunk number 29: KEGG_from_db
###################################################
kegg=as.list(org.Hs.egPATH)
head(kegg)


###################################################
### code chunk number 30: countbias
###################################################
countbias=rowSums(counts)[rowSums(counts)!=0]
length(countbias)
length(genes)


###################################################
### code chunk number 31: GO.counts
###################################################
pwf.counts=nullp(genes,bias.data=countbias)
GO.counts=goseq(pwf.counts,"hg19","ensGene")
head(GO.counts)


###################################################
### code chunk number 32: setup
###################################################
sessionInfo()