R/APAlyzer.R

Defines functions download_testbam REF4PAS GENEXP_CDS REFCDS PASEXP_3UTR REF3UTR .combine_count .divxdb .getcount .getSUMsize .getsize .getname

Documented in download_testbam GENEXP_CDS PASEXP_3UTR REF3UTR REF4PAS REFCDS

#### 3'UTR APA and gene EXP analysis ####
.getname<-function(fls){
    samplenameraw=gsub(".bam","",names(fls))
    samplename=gsub(".Aligned.sortedByCoord.out","",samplenameraw)
    return(samplename)
}

.getsize<-function(DB){
    REGIONsize=as.data.frame(width(DB))
    names(REGIONsize)[3]="size"
    names(REGIONsize)[2]="gene_symbol"
    REGIONsize=REGIONsize[,c("gene_symbol","size")]
    return(REGIONsize)
}

.getSUMsize<-function(DB){
    REGIONsize=as.data.frame(sum(width(DB)))
    names(REGIONsize)[1]="length"
    REGIONsize$gene_symbol=rownames(REGIONsize)
    REGIONsize=REGIONsize[,c("gene_symbol","length")]
    return(REGIONsize)
}

.getcount<-function(DB,BMAfile,STRINFOR){
    preprocess.reads.function = NULL
    igstrand=FALSE
    if(STRINFOR == "NONE")
        {
            igstrand=TRUE
        }
    if (STRINFOR == "invert")
        {
            preprocess.reads.function = invertStrand
        }
    counttbl= summarizeOverlaps(DB, BMAfile,
    mode="IntersectionNotEmpty",
    singleEnd=TRUE, ignore.strand=igstrand,
    preprocess.reads=preprocess.reads.function)
    return(counttbl)
}

.divxdb<-function(xdb,keystr){
    x = unlist(xdb)
    xp=x[strand(x)==keystr,]
    xpdb = split(xp, xp$gene_name)
    return(xpdb)
}

.combine_count<-function(xdb,fls,STRINFOR){
    xdbP=.divxdb(xdb,'+')
    xtblP=.getcount(xdbP,fls,STRINFOR)
    xdfP=as.data.frame(assays(xtblP)$counts)
    xdfP$gene_symbol=rownames(xdfP)
    xdbN=.divxdb(xdb,'-')
    xtblN=.getcount(xdbN,fls,STRINFOR)
    xdfN=as.data.frame(assays(xtblN)$counts)
    xdfN$gene_symbol=rownames(xdfN)
    xdf=rbind(xdfP,xdfN)
    return(xdf)
}

REF3UTR<-function(refUTR){
    stopifnot(ncol(refUTR) == 6)
    colnames(refUTR)=c('gene_symbol','Chrom','Strand','First','Last','cdsend')
    ##aUTR##
    refaUTR = refUTR[,c('gene_symbol','Chrom','Strand','First','Last')]
    refaUTR$note = "__aUTR"
    refaUTR$gene_name=paste0(refaUTR$gene_symbol, refaUTR$note)
    refaUTR$Start=refaUTR$First
    refaUTR$End=refaUTR$Last
    refaUTR[which(refaUTR$Strand=='-'),]$Start=
        refaUTR[which(refaUTR$Strand=='-'),]$Last
    refaUTR[which(refaUTR$Strand=='-'),]$End=
        refaUTR[which(refaUTR$Strand=='-'),]$First
    refaUTR=refaUTR[,-c(4:6)]
    ##cUTR##
    refcUTR = refUTR[,c('gene_symbol','Chrom','Strand','cdsend','First')]
    refcUTR$note = "__cUTR"
    refcUTR$gene_name=paste0(refcUTR$gene_symbol, refcUTR$note)
    refcUTR$Start=refcUTR$cdsend
    refcUTR$End=refcUTR$First
    refcUTR[which(refcUTR$Strand=='-'),]$Start=
        refcUTR[which(refcUTR$Strand=='-'),]$First
    refcUTR[which(refcUTR$Strand=='-'),]$End=
        refcUTR[which(refcUTR$Strand=='-'),]$cdsend
    refcUTR=refcUTR[,-c(4:6)]
    ref3UTR=rbind(refaUTR, refcUTR)
    UTRdb = makeGRangesFromDataFrame(ref3UTR, keep.extra.columns = TRUE)
    UTRdb = split(UTRdb, UTRdb$gene_name)
    return(UTRdb)
}

PASEXP_3UTR<-function(UTRdb, flS, Strandtype="NONE"){
    stopifnot(is(UTRdb, "GRangesList"))
    for (k in seq_len(length(flS))){
        fls=flS[k]
        STRINFOR=Strandtype
        SPNAME=.getname(fls)
        acSIZE=.getsize(UTRdb)
        UTRdf=.combine_count(UTRdb,fls,STRINFOR)
        names(UTRdf)[1]=paste0(SPNAME,"_reads")
        readscols=names(UTRdf)[1]
        RPMcols = paste0(SPNAME,"_RPKM")
        UTRdf[RPMcols]=UTRdf[readscols]/colSums(UTRdf[readscols])*1000000
        UTRdf=merge(UTRdf,acSIZE, by="gene_symbol",all.x=TRUE)
        UTRdf[RPMcols]=UTRdf[RPMcols]/UTRdf$size*1000
        aUTRdf=UTRdf[grepl("__aUTR", UTRdf$gene_symbol),]
        cUTRdf=UTRdf[grepl("__cUTR", UTRdf$gene_symbol),]
        aUTRdf$gene_symbol = gsub('__aUTR', '', aUTRdf$gene_symbol)
        names(aUTRdf) = gsub('_RPKM', '_aRPKM', names(aUTRdf))
        names(aUTRdf) = gsub('_reads', '_areads', names(aUTRdf))
        cUTRdf$gene_symbol = gsub('__cUTR', '', cUTRdf$gene_symbol)
        names(cUTRdf) = gsub('_RPKM', '_cRPKM', names(cUTRdf))
        names(cUTRdf) = gsub('_reads', '_creads', names(cUTRdf))
        dfSRSutr=merge(x = aUTRdf, y = cUTRdf, by = "gene_symbol")
        dfSRSutr[is.na(dfSRSutr)] = 0
        dfSRSutr=dfSRSutr[,c(1,2,3,5,6)]
        dfSRSutr[,paste0(SPNAME,'_3UTR_RE')]=
            log2(dfSRSutr[,paste0(SPNAME,'_aRPKM')]/
                    dfSRSutr[,paste0(SPNAME,'_cRPKM')])
        if(k==1)
        {
            df3UTR_OUT=dfSRSutr
        } else {
            df3UTR_OUT=merge(df3UTR_OUT,dfSRSutr,
                            by = "gene_symbol", all.x=TRUE)
        }

        print(paste0(SPNAME, ", Strand: ", STRINFOR, ", finished"))
        }
    return(df3UTR_OUT)
}

REFCDS<-function(txdb,IDDB){
    CDSbygene = cdsBy(txdb, by="gene")
    x = unlist(CDSbygene)
    acc2sym = AnnotationDbi::select(IDDB, keys = names(x),
                                    keytype = "ENTREZID", columns = "SYMBOL")
    acc2sym[is.na(acc2sym$SYMBOL),]$SYMBOL =
        acc2sym[is.na(acc2sym$SYMBOL), ]$ENTREZID
    names(x) = acc2sym$SYMBOL
    CDSbygene = split(x, names(x))
    CDSbygene = reduce(CDSbygene)
    return(CDSbygene)
}

GENEXP_CDS<-function(CDSbygene, flS, Strandtype="NONE"){
    for (k in seq_len(length(flS))){
        fls=flS[k]
        STRINFOR=Strandtype
        SPNAME=.getname(fls)
        CDStbl=.getcount(CDSbygene,fls,STRINFOR)
        cdslength=.getSUMsize(CDSbygene)
        tepdf=as.data.frame(assays(CDStbl)$counts)
        tepdf$gene_symbol=rownames(tepdf)
        names(tepdf)[1]=paste0(SPNAME,"_CDSreads")
        readscols=names(tepdf)[1]
        tepdf=merge(tepdf, cdslength, by="gene_symbol")
        RPKcols=paste0(SPNAME,"_RPK")
        tepdf[,RPKcols]=tepdf[,readscols]/(tepdf[,"length"]/1000)
        TPMcols=paste0(SPNAME,"_TPM")
        tepdf[TPMcols]=tepdf[RPKcols]/colSums(tepdf[RPKcols])*1000000
        tepdf=tepdf[,c("gene_symbol", readscols, TPMcols)]
        if(k==1)
        {
            dfEXP_OUT=tepdf
        } else {
            dfEXP_OUT=merge(dfEXP_OUT,tepdf, by = "gene_symbol", all.x=TRUE)
        }

        print(paste0(SPNAME, ", Strand: ", STRINFOR, ", finished"))
        }

    return(dfEXP_OUT)
}

REF4PAS<-function(refUTRraw, dfIPAraw, dfLEraw){
	UTRdbraw=REF3UTR(refUTRraw)
	dfIPA=dfIPAraw
	dfLE=dfLEraw
	PASREF=list(UTRdbraw,dfIPA,dfLE)
	names(PASREF)=c('UTRdbraw','dfIPA','dfLE')
	return(PASREF)
}

download_testbam<-function(){
URL="https://media.githubusercontent.com/media/RJWANGbioinfo/PAS_reference_RData_and_testing_data/master/bam/"	
SAMPLES=c("Heart_rep1",
"Heart_rep2",
"Heart_rep3",
"Heart_rep4",
"Testis_rep1",
"Testis_rep2",
"Testis_rep3",
"Testis_rep4")
for(SAMPLE in SAMPLES)
{
print(paste0("Download  ",SAMPLE))
download.file(url = paste0(URL,SAMPLE,".bam")
                                   , destfile = paste0(SAMPLE,".bam"))
}
}
RJWANGbioinfo/APAlyzer documentation built on July 10, 2022, 10:30 a.m.