inst/doc/movAPA_on_scAPAtrap_results.R

## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  fig.width = 6,
  fig.height = 5.5,
  collapse = TRUE,
  warning = FALSE,
  comment = "#>"
)

## ----load_data----------------------------------------------------------------
library(movAPA)
# load demo peaks generated by scAPAtrap, 
# which contains a list(peaks.meta, peaks.count)
data("demo_peaks")
peaks=demo_peaks
names(peaks)
head(peaks$peaks.meta)
head(peaks$peaks.count[, 1:10])

## ----create_PACds-------------------------------------------------------------
PACds=createPACdataset(counts=peaks$peaks.count, anno=peaks$peaks.meta)
PACds
rm(peaks)

## -----------------------------------------------------------------------------
summary(PACds)

## ----data_filtering-----------------------------------------------------------
PACds=subsetPACds(PACds, totPACtag = 10, minExprConds = 10, verbose=TRUE)
summary(PACds)

## ----load_bsgenome, warning=FALSE, message=FALSE------------------------------
library(BSgenome.Hsapiens.UCSC.hg38, quietly = TRUE, verbose = FALSE)
bsgenome<-BSgenome.Hsapiens.UCSC.hg38

head(GenomeInfoDb::seqnames(bsgenome))
head(unique(PACds@anno$chr))

## ----remove_IP----------------------------------------------------------------
PACdsIP=removePACdsIP(PACds, bsgenome, returnBoth=TRUE,
                      up=-10, dn=10, conA=6, sepA=7)
length(PACdsIP$real)
length(PACdsIP$ip)

# summary of IPs and real PAs
summary(PACdsIP$real)

summary(PACdsIP$ip)

## -----------------------------------------------------------------------------
# use real PA for further analysis
PACds=PACdsIP$real

## ----load_humanRefPA, eval=FALSE----------------------------------------------
#  annodb=read.table("Human_hg38.PolyA_DB.bed", sep="\t", header = FALSE)
#  head(annodb)
#  annodb=annodb[,c(1,2,6)]
#  colnames(annodb)=c("chr","coord","strand")
#  annodb=readPACds(annodb, colDataFile = NULL)

## ----ovpIP, eval=FALSE--------------------------------------------------------
#  PACdsIP$ip@anno$UPA_start=PACdsIP$ip@anno$start
#  PACdsIP$ip@anno$UPA_end=PACdsIP$ip@anno$end
#  
#  PACdsIP$real@anno$UPA_start=PACdsIP$real@anno$start
#  PACdsIP$real@anno$UPA_end=PACdsIP$real@anno$end
#  
#  ## find overlapping IPs
#  IP.ovp=findOvpPACds(qryPACds=PACdsIP$ip, sbjPACds=annodb,
#                      d=50,
#                      qryMode='region', sbjMode='point',
#                      returnNonOvp=TRUE)

## ----combine_PA, eval=FALSE---------------------------------------------------
#  anno=rbind(PACdsIP$real@anno, IP.ovp$ovp@anno)
#  count=rbind(PACdsIP$real@counts, IP.ovp$ovp@counts)
#  PACds=createPACdataset(counts=count, anno=anno)
#  summary(PACds)

## ----annotatePAC, warning=FALSE, message=FALSE--------------------------------
library(TxDb.Hsapiens.UCSC.hg38.knownGene, quietly = TRUE, verbose = FALSE)
txdb=TxDb.Hsapiens.UCSC.hg38.knownGene

# annotate PAs
PACds=annotatePAC(PACds, txdb)

# after annotation, the gene and ftr information are present in PACds@anno.
summary(PACds)

## ----testExt3UTR--------------------------------------------------------------
testExt3UTR(PACds, seq(1000, 10000, by=1000))

## ----utr_len, warning=FALSE---------------------------------------------------
utrid=which(PACds@anno$ftr=='3UTR')
utrs=unique(PACds@anno[utrid, c('ftr_end','ftr_start')])
summary(utrs$ftr_end-utrs$ftr_start+1)

## ----extend3UTR, warning=FALSE------------------------------------------------
# extend 3UTR by 1000bp
PACds=ext3UTRPACds(PACds, 1000)

# after 3UTR extension
summary(PACds)

## ----utr_filter, warning=FALSE------------------------------------------------
PACds=PACds[PACds@anno$ftr=='3UTR']
summary(PACds)

## ----base_comp----------------------------------------------------------------
faFiles=faFromPACds(PACds, bsgenome, what='updn', fapre='updn', 
                    up=-300, dn=100, byGrp='ftr')

## ----fig.dim=c(6,4)-----------------------------------------------------------
plotATCGforFAfile(faFiles, ofreq=FALSE, opdf=FALSE, 
                   refPos=301, mergePlots = TRUE)

## -----------------------------------------------------------------------------
v=getVarGrams('mm')
priority=c(1,2,rep(3, length(v)-2))

## ----polyA_signal-------------------------------------------------------------
PACdsMM=annotateByPAS(PACds, bsgenome, grams=v, 
                      priority=priority, 
                      from=-50, to=-1, label='mm')

table(PACdsMM@anno$mm_gram)

## percent of PA without PAS
noPAS=sum(is.na(PACdsMM@anno$mm_gram))
noPAS/length(PACdsMM)

## ----seqlogo, message=FALSE, warning=FALSE------------------------------------
pas=PACdsMM@anno$mm_gram[!is.na(PACdsMM@anno$mm_gram)]
(pas)

## ----PACdsList----------------------------------------------------------------
## the two PACds for merging
PACdsList=list(ds1=PACds, ds2=PACds)

## ----merge_samples------------------------------------------------------------
pacds.merge=mergePACds(PACdsList, d=24, by='coord')

# summary of PACds#1
summary(PACds)

# summary of the merged PACds
summary(pacds.merge)

head(pacds.merge@counts[, 1:10])
head(pacds.merge@anno)

## ----load_RefPA---------------------------------------------------------------
annodb=read.table("./Human_hg38.PolyA_DB.bed", sep="\t", header = FALSE)
head(annodb)
annodb=annodb[,c(1,2,6)]
colnames(annodb)=c("chr","coord","strand")
annodb=readPACds(annodb, colDataFile = NULL)

## ----build_ref----------------------------------------------------------------
## we can build a reference PACds with human known PA and the given PACds list
## or we can use annodb only
refPA=buildRefPACdsAnno(refPACds=annodb, PACdsList=PACdsList, 
                        by='coord', d=24,
                        min.counts = 50, min.smps=10, max.width=500,
                        verbose=TRUE)

## ----merge_samples_ref--------------------------------------------------------

pacds.merge=mergePACds(PACdsList, d=24, by='coord', refPACds=refPA)

# summary of PACds#1
summary(PACds)

# summary of the merged PACds
summary(pacds.merge)

head(pacds.merge@counts[, 1:10])
head(pacds.merge@anno)

## ----annotate_merged_smps, eval=F---------------------------------------------
#  # annotate PAs
#  pacds.merge=annotatePAC(pacds.merge, txdb)
#  
#  # after annotation, the gene and ftr information are present in PACds@anno.
#  summary(pacds.merge)

## ----APA_dist-----------------------------------------------------------------
s=getNearbyPAdist(PACds)

## ----smartRUD_index-----------------------------------------------------------
# get proximal and distal PA for each gene
pd=get3UTRAPApd(pacds=PACds, minDist=50, maxDist=5000, minRatio=0.05, 
                fixDistal=FALSE, addCols='pd') 

# get smartRUD index
rud=movAPAindex(pd, method="smartRUD", sRUD.oweight=TRUE, clearPAT=1)   
head(rud$rud[, 1:5]) 
head(rud$weight[, 1:5])

## we can also calculate the RUD index
## rud1=movAPAindex(pd, method="RUD")

## ----vizAPA-------------------------------------------------------------------
library(vizAPA, quietly = TRUE)

# to choose one gene, first we get the total counts of each PA
PACds@anno$TN=rowSums(PACds@counts)
pd@anno$TN=rowSums(pd@counts)

# construct the genome annotation object
annoSource=new("annoHub")
annoSource=addAnno(annoSource, txdb)

gene=pd@anno$gene[1]
vizTracks(gene=gene, 
          PACds.list=list(pA=PACds, pd=pd), 
          PA.show=c("pos","TN"),
          annoSource=annoSource,
          PA.columns="coord", PA.width=10,
          space5=1000, space3=1000)


## ----gene_id------------------------------------------------------------------
library(Homo.sapiens)
orgdb=Homo.sapiens
genes=getAnnoGenes(orgdb)
head(genes)

genes[genes$gene_entrezid==gene, ]
genes[genes$gene_entrezid==10003, ]

## -----------------------------------------------------------------------------
sessionInfo()
BMILAB/movAPA documentation built on Jan. 3, 2024, 11:09 p.m.