# description -------------------------------------------------------------
# This section of code contains most of polyAtailor's visualization capabilities.
# plotPALDistribution ---------------------------------------------------------------
#' plotPALDistribution
#'
#' @description Plot the PAL distribution of gene, umi or global.
#' @details Using the PoltPAL family of functions, the tail length distribution
#' of the gene or UMI of different sample will be plotted, as well as the
#' global tail length distribution.
#' @param taildf A dataframe that contains the tail information of all reads,
#' Contains at least three columns: PAL, gene, umi, and sample.
#' @param resultpath The specified path to save barplot.
#' @param dType The tail length affiliation relationship that you want to draw,
#' which includes three options: "global", "gene", "umi". "global" A stands
#' for uniform treatment of all tail lengths. "gene" stands for tail length
#' plotted according to genetic statistics. "umi" stands for tail length
#' plotted according to umi statistics
#' @param medianPAL A Boolean value. T means Annotating peak. F means the
#' opposite.
#' @return A density curve with peak annotations.
#' @examples
#' data(AnnotedTails)
#' p1 <- plotPALDistribution(AnnotedTails,"./","global",medianPAL=T)
#' p2 <- plotPALDistribution(AnnotedTails,"./","gene",medianPAL=T)
#' @family Visualization functions
#' @seealso [plotPADistribution()] to plot the gene region distribution of PA sites.
#' @export
plotPALDistribution <- function(taildf,resultpath,dType,medianPAL){
library(RColorBrewer)
library(pracma)
if(missing(taildf)){
stop("Tail data must be provided")
}
if(missing(resultpath)){
stop("You must provide a path to save the results")
}
if(missing(dType)){
stop("The DTYPE must be specified")
}
else{
if(!(dType %in% c("global","gene","umi"))){
stop("dType must comes from c('global','gene','umi')")
}
}
if(missing(medianPAL)){
medianPAL = T
}
if(dType=="global" & medianPAL){
data <- taildf %>%
select(PAL,sample) %>%
group_by(sample,PAL) %>%
summarise(count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$PAL <- density(sub.data$PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=PAL,colour=sample))+
xlab("PAL(nt)")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$PAL[i],linetype = "longdash",color=colors[i],size = 0.7)+
# annotate(geom = "rect", xmin = 10, xmax = 18, ymin = 0.05, ymax = 0.07, alpha = .1, fill="navy")+
annotate(geom = "text", fontface = "bold", color=colors[i],
x = (peak.table$PAL[i])+5,y=0.000+(i-1)*0.0003,
label = round((peak.table$PAL[i]),2), size=3)
}
plot(p)
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
if(dType=="global" & !medianPAL){
data <- taildf %>%
select(PAL,sample) %>%
group_by(sample,PAL) %>%
summarise(count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$PAL <- density(sub.data$PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=PAL,colour=sample))+
xlab("PAL(nt))")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$PAL[i],linetype = "longdash",color=colors[i],size = 0.7)
}
plot(p)
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
if(dType=="gene" & medianPAL){
data <- taildf %>%
select(gene,PAL,sample) %>%
group_by(sample,gene) %>%
summarise(md_PAL = median(PAL),count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
md_PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$md_PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$md_PAL <- density(sub.data$md_PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=md_PAL,colour=sample))+
xlab("PAL(nt) per GENE")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$md_PAL[i],linetype = "longdash",color=colors[i],size = 0.7)+
# annotate(geom = "rect", xmin = 10, xmax = 18, ymin = 0.05, ymax = 0.07, alpha = .1, fill="navy")+
annotate(geom = "text", fontface = "bold", color=colors[i],
x = (peak.table$md_PAL[i])+5,y=0.000+(i-1)*0.0003,
label = round((peak.table$md_PAL[i]),2), size=3)
}
plot(p)
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
if(dType=="gene" & !medianPAL){
data <- taildf %>%
select(gene,PAL,sample) %>%
group_by(sample,gene) %>%
summarise(md_PAL = median(PAL),count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
md_PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$md_PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$md_PAL <- density(sub.data$md_PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=md_PAL,colour=sample))+
xlab("PAL(nt) per GENE")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$md_PAL[i],linetype = "longdash",color=colors[i],size = 0.7)
}
p
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
if(dType=="umi" & medianPAL){
data <- taildf %>%
select(umi,PAL,sample) %>%
group_by(sample,umi) %>%
summarise(md_PAL = median(PAL),count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
md_PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$md_PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$md_PAL <- density(sub.data$md_PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=md_PAL,colour=sample))+
xlab("PAL(nt) per UMI")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$md_PAL[i],linetype = "longdash",color=colors[i],size = 0.7)+
# annotate(geom = "rect", xmin = 10, xmax = 18, ymin = 0.05, ymax = 0.07, alpha = .1, fill="navy")+
annotate(geom = "text", fontface = "bold", color=colors[i],
x = (peak.table$md_PAL[i])+5,y=0.000+(i-1)*0.0003,
label = round((peak.table$md_PAL[i]),2), size=3)
}
plot(p)
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
if(dType=="umi" & !medianPAL){
data <- taildf %>%
select(umi,PAL,sample) %>%
group_by(sample,umi) %>%
summarise(md_PAL = median(PAL),count=n())
sample.names <- unique(data$sample)
peak.table <- data.frame(sample=NULL,
umi=NULL,
md_PAL=NULL,
filename=NULL,
count=NULL)
for(i in 1:length(sample.names)){
sub.data <- dplyr::filter(.data =data, sample==sample.names[i])
index <- which.max(density(sub.data$md_PAL)$y)
peak.id <-as.data.frame(sub.data[findpeaks(sub.data$count, npeaks=1, threshold=0, sortstr=TRUE)[,2],])
peak.id$md_PAL <- density(sub.data$md_PAL)$x[index]
peak.table <- base::rbind(peak.table,peak.id)
}
#geom_density
data$sample <- factor(data$sample,levels = sample.names)
p <- ggplot(data,aes(x=md_PAL,colour=sample))+
xlab("PAL(nt) per UMI")+
geom_density(size=1.1)+
theme_base()+
# scale_color_manual(values=mycolors)+
scale_color_brewer(palette = "Dark2")+
theme(legend.position="top")
colors <- brewer.pal(8,"Dark2")
for(i in 1:nrow(peak.table)){
p <- p+geom_vline(xintercept = peak.table$md_PAL[i],linetype = "longdash",color=colors[i],size = 0.7)
}
plot(p)
fil = str_c(resultpath,"PALDistribution.pptx")
topptx(filename = fil)
return(p)
}
}
# plotPADistribution ---------------------------------------------------------------
#' plotPADistribution
#'
#' @description Plot the gene region distribution of PA sites.
#' @details Plot the gene region distribution of PA sites, including "3UTR",
#' "5UTR", "CDS", "exon", "intergenic", "intron". Finally, we will save the PA
#' locus distribution barpllot in PPTX format to the path you specified.
#' @param PAdf A dataframe that contains the PA information of all reads,
#' Contains at least three columns: PAID, gene, ftr.
#' @param resultpath The specified path to save barplot.
#' @param colorOfBar The color of barplot, as input in hexadecimal form,
#' defaults to "#196BE1".
#' @return A barplot.
#' @examples
#' data(PAs)
#' p <- plotPADistribution(PAs,"./","#9BBFDC")
#' @family Visualization functions
#' @seealso [plotGenePAnumbers()] to plot the gene frequency distribution with
#' different number of PA sites.
#' @export
plotPADistribution <- function(PAdf,resultpath,colorOfBar){
#check
if(missing(colorOfBar)){
colorOfBar = "#196BE1"
}
if(missing(resultpath)){
stop("The result path must be specified")
}
if(missing(PAdf)){
stop("PA information must be entered")
}
data1 <- PAdf %>%
dplyr::select(gene,ftr) %>%
dplyr::filter(!is.na(gene)&gene!=("character(0)"))
data2 <- data1 %>%
group_by(ftr) %>%
summarise(count=n())
data2 <- mutate(data2,rt=count/sum(count)*100)
p <- data2 %>%
ggplot() +
aes(x = reorder(ftr,-rt),y=rt) +
geom_bar(stat = 'identity',position = 'dodge',color=colorOfBar,fill=colorOfBar) +
scale_fill_manual(values=c(colorOfBar))+
scale_color_manual(values=c(colorOfBar))+
labs(x = "", y = "proportion(%)") +
geom_text(aes(label = count), position = position_dodge(0.9),vjust=-0.5)+
ggthemes::theme_base()+
theme(legend.position="top",
axis.text.x = element_text(angle=30, vjust=0.5))
plot(p)
fil = str_c(resultpath,"barPlotOfPASdistribution.pptx")
topptx(filename = fil)
return(p)
}
# plotGenePAnumbers ---------------------------------------------------------------
#' plotGenePAnumbers
#'
#' @description Plot the gene frequency distribution with different number of PA sites.
#' @details Plot the gene frequency distribution with different number of PA
#' sites. Genes were divided into five groups, each containing the number of
#' PA sites 1, 2, 3, 4, 5 and greater than 5, respectively.
#' @param PAdf A dataframe that contains the PA information of all reads,
#' Contains at least three columns: PAID, gene, ftr.
#' @param resultpath The specified path to save barplot.
#' @param colorOfBar The color of barplot, as input in hexadecimal form,
#' defaults to "#196BE1".
#' @return A barplot.
#' @examples
#' data(PAs)
#' p <- plotGenePAnumbers(PAs,"./","#DF7C7D")
#' @family Visualization functions
#' @seealso [plotGenePAnumbers()] to plot the gene frequency distribution with
#' different number of PA sites.
#' @export
plotGenePAnumbers <- function(PAdf,resultpath,colorOfBar){
if(missing(colorOfBar)){
colorOfBar = "#B681BD"
}
if(missing(PAdf)){
stop("PA information must be entered")
}
if(missing(resultpath)){
stop("The result path must be specified")
}
if("PAID" %in% colnames(PAdf)){
data1 <- PAdf %>%
select(PAID,gene,ftr) %>%
filter(!is.na(gene)&gene!=("character(0)"))
data9D<-data1 %>%
group_by(gene,PAID)%>%
summarise(counts=n()) %>%
group_by(gene) %>%
summarise(PAs=n())
}
else{
data1 <- PAdf %>%
select(gene,ftr) %>%
filter(!is.na(gene)&gene!=("character(0)"))
data9D<-data1 %>%
group_by(gene)%>%
summarise(PAs=n())
}
#data9D <- table(data9D$PAs)
data9D$type.new <- data9D$PAs
data9D$type.new[which(data9D$type.new>5)] <- ">5"
data9D$type.new <- factor(data9D$type.new,levels=c("1","2","3","4","5",">5"))
data9D.fre <- data.frame(fre=names(table(data9D$type.new)),number=as.integer(table(data9D$type.new)) )
data9D.fre$fre<-factor(data9D.fre$fre,levels = unique(data9D.fre$fre))
#table(data9D$type.new)
library(RColorBrewer)
p <- data9D.fre %>%
ggplot() +
aes(x = factor(fre),y=number,fill="#196BE1") +
geom_bar(stat = 'identity',position = 'dodge')+# +scale_fill_brewer(palette="Dark2")+
# scale_fill_manual(values=c("#196BE1"))+
#scale_color_manual(values=c("#B681BD"))+
labs(x = "Number of PA sites", y = "Gene Frequency") +
geom_text(aes(label = number), position = position_dodge(0.9),vjust=-0.5)+
ggthemes::theme_base()+
theme(legend.position="none",
axis.text.x = element_text(angle=0, vjust=0.5))
plot(p)
fil = str_c(resultpath,"PAnumbersOfGene.pptx")
topptx(filename = fil)
return(p)
}
# plotPASignals ---------------------------------------------------------------
#' plotPASignals
#'
#' @description Draw the frequency distribution diagram of PA signal.
#' @details plot the probability distribution of the occurrence of
#' user-specified or default PA signal within the first 50bp base range of PA
#' site.
#' @param PAdf A dataframe that contains the PA information of all reads,
#' Contains at least three columns: PAID, gene, ftr.
#' @param resultpath The specified path to save barplot.
#' @param colorOfBar The color of barplot, as input in hexadecimal form,
#' defaults to "#196BE1".
#' @param bsgenome Reference genomes of the species of interest.
#' @param signals The set of PA signal sequences of interest, such as
#' signals=c("AATAAA","ATTAAA","AAGAAA","AATATA").
#' @return A barplot.
#' @examples
#' library("BSgenome.Mmusculus.UCSC.mm10")
#' bsgenome = BSgenome.Mmusculus.UCSC.mm10
#' data(PAs)
#' p <- plotPASignals(PAs,"./",bsgenome = bsgenome)
#' @family Visualization functions
#' @seealso [plotGenePAnumbers()] to plot the gene frequency distribution with
#' different number of PA sites.
#' @export
plotPASignals <- function(PAdf,resultpath,bsgenome,signals,colorOfBar){
if(missing(colorOfBar)){
colorOfBar = "#7ED321"
}
if(missing(PAdf)){
stop("PA information must be entered")
}
if(missing(signals)){
print("No PA signal is specified. Statistics are performed according to the default values.")
signals=c("AATAAA","ATTAAA","AAGAAA","AAAAAG","AATACA","TATAAA","ACTAAA","AGTAAA","GATAAA",
"AATATA")
}
if(missing(bsgenome)){
stop("Reference genomes must be specified")
}
if(missing(resultpath)){
stop("The result path must be specified")
}
library(movAPA)
PAdf1 <- PAdf %>%
dplyr::select(chr,strand,coord,ftr)
PACds <- readPACds(PAdf1)
priority=c(1,2,rep(3, length(signals)-2))
PACdsHUMAN=annotateByPAS(PACds, bsgenome, grams=signals,
priority=priority, from=-50, to=-1, label='HU')
test2 <- table(PACdsHUMAN@anno$HU_gram)
data9F <- as.data.frame(test2)
all_counts = sum(data9F$Freq)
data9F <- data9F %>%
mutate(rt = (Freq / all_counts) *100)
p <- data9F %>%
ggplot() +
aes(x = reorder(Var1,-rt),y=rt) +
geom_bar(stat = 'identity',position = 'dodge',color="#57B355",fill="#57B355") +
scale_fill_manual(values=c("#57B355"))+
scale_color_manual(values=c("#57B355"))+
labs(x = "", y = "ratio(%)") +
geom_text(aes(label = Freq), position = position_dodge(0.9),vjust=-0.5)+
ggthemes::theme_base()+
theme(legend.position="top",
axis.text.x = element_text(angle=30, vjust=0.5))
plot(p)
fil = str_c(resultpath,"barPlotOfPASignaldistribution.pptx")
topptx(filename = fil)
return(p)
}
# nonAanalysis ---------------------------------------------------
#' nonAanalysis
#'
#' @description Quantify and Statistics the nonA base in tail.
#' @details After users input the tail information table, we will quantify the
#' non-A base content in each tail, and count the number of reads with non-A
#' base tail in each sample and the proportion of non-A base content in each
#' sample.
#' @param tailinfo A tail information table containing at least PAL, sample
#' name, tail sequence, chain direction, and gene infomation.
#' @param justOneTail Boolean value. Indicates whether to analyze only data of
#' type one-tail, which defaults to T.
#' @return A large list containing quantitative data tables and statistical
#' graphs.The two figures respectively represent the column chart of the
#' number and proportion of reads with non-A bases in each sample and the
#' specific occurrence frequency of non-A bases in each sample.
#' @examples
#' library(stringi)
#' data(AnnotedTails)
#' re <- nonAanalysis(AnnotedTails)
#' re$p1
#' re$p2
#' re$p3
#' nonAre <- re$nonAinfo
#' @family Visualization functions
#' @seealso [tailViso()] to visualize tails using heat maps and logos.
#' @export
nonAanalysis <- function(tailinfo,justOneTail){
if(missing(justOneTail)){
justOneTail <- T
}
if(!is.logical(justOneTail)){
stop("The justOneMail parameter must be a logical value!")
}
##First extract the useful columns of the entered dataframe
resultlist <- list()
inputlist <- tailinfo %>%
select(read_num, sample, strand, tail, PAL)
##Find the reverse complementation of all the reverse chains:polyA input
tailinfo <- tailinfo %>%
dplyr::mutate(nA=str_count(tail,"A"),rA=nA/PAL)
subset1 <- tailinfo %>%
dplyr::filter(rA<=0.5)
subset2 <-tailinfo %>%
dplyr::filter(rA>0.5)
for(i in c(1:dim(subset1)[1])){
#Build the mapping
y <- c("T"="A", "A"="T", "G"="C" , "C"="G")
#Calculate the reverse complementary chain
x <- unlist(strsplit(subset1$tail[i], "", fixed=TRUE))
x <- y[x]
names(x) = NULL
subset1$tail[i] <- str_c(x,collapse = "")
}
inputlist <- rbind(subset1,subset2)
##Calculate nonA bases
inputlist <- inputlist %>%
mutate(Acounts = stri_count(tail,fixed = "A"),
Ccounts = stri_count(tail,fixed = "C"),
Gcounts = stri_count(tail,fixed = "G"),
Tcounts = stri_count(tail,fixed = "T"),
nonA_number = PAL-Acounts,
nonA_rate = nonA_number/PAL
)
inputlist <- inputlist %>%
dplyr::filter(rt>=0.9)
if(justOneTail){
inputlist <- inputlist %>%
dplyr::filter(read_type == "one-tail")
}
resultlist$nonAinfo <- inputlist
##upset plot
#1.A simple statistical chart counted the number of reads containing nonA Base tail
##count the reads number of all kinds
input <- c(
"C"=length(which(inputlist[,"Ccounts"]!=0 & inputlist[,"Gcounts"]==0 & inputlist[,"Tcounts"]==0)),
"G"=length(which(inputlist[,"Ccounts"]==0 & inputlist[,"Gcounts"]!=0 & inputlist[,"Tcounts"]==0)),
"T"=length(which(inputlist[,"Ccounts"]==0 & inputlist[,"Gcounts"]==0 & inputlist[,"Tcounts"]!=0)),
"C&G"=length(which(inputlist[,"Ccounts"]!=0 & inputlist[,"Gcounts"]!=0 & inputlist[,"Tcounts"]==0)),
"C&T"=length(which(inputlist[,"Ccounts"]!=0 & inputlist[,"Gcounts"]==0 & inputlist[,"Tcounts"]!=0)),
"G&T"=length(which(inputlist[,"Ccounts"]==0 & inputlist[,"Gcounts"]!=0 & inputlist[,"Tcounts"]!=0)),
"C&G&T"=length(which(inputlist[,"Ccounts"]!=0 & inputlist[,"Gcounts"]!=0 & inputlist[,"Tcounts"]!=0))
)
library(UpSetR)
data <- UpSetR::fromExpression(input)
p1 <- upset(data,
mainbar.y.label = "reads counts",
sets.x.label = "reads counts",
queries = list(list(query=intersects, params=list("C", "G"), color="#F18687", active=T),
list(query=intersects, params=list("C", "T"), color="#92D050", active=T),
list(query=intersects, params=list("G", "T"), color="#9BBFDC", active=T),
List(query=intersects, params=list("C","G","T"),color="#F6BF18",active=T)
)
)
# p1
resultlist$p1 <- p1
#2.The line chart1
base=c("<12","12-24","24-36","36-48","48-60","60-72","72-84","84-96",">=96")
C_c=c()
G_c=c()
T_c=c()
nonA_c=c()
all=c()
for (i in c(1:9)) {
k = (i-1)*12
start = 0 + k
end = 12 + k
if(i!=9){
inputlist1 <- inputlist %>%
dplyr::filter(PAL >= start & PAL < end)
allc = dim(inputlist1)[1]
Cc=length(which(inputlist1[,"Ccounts"]!=0 & inputlist1[,"Gcounts"]==0 & inputlist1[,"Tcounts"]==0))
Gc=length(which(inputlist1[,"Ccounts"]==0 & inputlist1[,"Gcounts"]!=0 & inputlist1[,"Tcounts"]==0))
Tc=length(which(inputlist1[,"Ccounts"]==0 & inputlist1[,"Gcounts"]==0 & inputlist1[,"Tcounts"]!=0))
nonAc=length(which(inputlist1[,"Ccounts"]!=0 | inputlist1[,"Gcounts"]!=0 | inputlist1[,"Tcounts"]!=0))
C_c[i] = Cc
G_c[i] = Gc
T_c[i] = Tc
nonA_c[i] = nonAc
all[i] = allc
}
else{
inputlist1 <- inputlist %>%
dplyr::filter(PAL >= start)
allc = dim(inputlist1)[1]
Cc=length(which(inputlist1[,"Ccounts"]!=0 & inputlist1[,"Gcounts"]==0 & inputlist1[,"Tcounts"]==0))
Gc=length(which(inputlist1[,"Ccounts"]==0 & inputlist1[,"Gcounts"]!=0 & inputlist1[,"Tcounts"]==0))
Tc=length(which(inputlist1[,"Ccounts"]==0 & inputlist1[,"Gcounts"]==0 & inputlist1[,"Tcounts"]!=0))
nonAc=length(which(inputlist1[,"Ccounts"]!=0 | inputlist1[,"Gcounts"]!=0 | inputlist1[,"Tcounts"]!=0))
C_c[i] = Cc
G_c[i] = Gc
T_c[i] = Tc
nonA_c[i] = nonAc
all[i] = allc
}
}
data <- data.frame(base=base,C_c=C_c,G_c=G_c,T_c=T_c,nonA_c=nonA_c,all=all)
data <- data %>%
mutate(Crt = C_c / all * 100,
Grt = G_c / all * 100,
Trt = T_c / all * 100,
nonArt = nonA_c / all * 100)
d1 <- data %>%
select(base,Crt) %>%
mutate(type="C") %>%
rename(rt=Crt)
d2 <- data %>%
select(base,Grt) %>%
mutate(type="G") %>%
rename(rt=Grt)
d3 <- data %>%
select(base,Trt) %>%
mutate(type="T") %>%
rename(rt=Trt)
d4 <- data %>%
select(base,nonArt) %>%
mutate(type="nonA") %>%
rename(rt=nonArt)
data10B <- rbind(d1,d2,d3,d4)
data10B$base <- factor(data10B$base,levels=c("<12","12-24","24-36","36-48","48-60","60-72","72-84","84-96",">=96"))
p2 <- ggplot(data10B, aes(x=factor(base), y=rt, colour=type,group=type)) +
geom_line(size=1.5)+
geom_point(size=2.5)+
ggthemes::theme_base()+
scale_fill_manual(values=c("#7ED321","#F28D8E","#AA87C2","#9BBFDC"))+
scale_color_manual(values=c("#7ED321","#F28D8E","#AA87C2","#9BBFDC"))+
labs(x = "PAL(nt)", y = "reads with non_A base/total one-tail reads(%)") +
theme(legend.position="top",axis.text.x = element_text(angle=30, vjust=0.5))
# +
# scale_y_continuous(limits=c(0,30), breaks=seq(0,30,5))
# p2
resultlist$p2 <- p2
#3.The line chart2
C_c=c()
G_c=c()
T_c=c()
nonA_c=c()
all=c()
for (i in c(1:9)) {
k = (i-1)*12
start = 0 + k
end = 12 + k
if(i!=9){
inputlist1 <- inputlist %>%
dplyr::filter(PAL >= start & PAL < end)
allc = sum(inputlist1$PAL)
Cc=sum(inputlist1$Ccounts)
Gc=sum(inputlist1$Gcounts)
Tc=sum(inputlist1$Tcounts)
nonAc=sum(inputlist1$nonA_number)
C_c[i] = Cc
G_c[i] = Gc
T_c[i] = Tc
nonA_c[i] = nonAc
all[i] = allc
}
else{
inputlist1 <- inputlist %>%
dplyr::filter(PAL >= start)
allc = sum(inputlist1$PAL)
Cc=sum(inputlist1$Ccounts)
Gc=sum(inputlist1$Gcounts)
Tc=sum(inputlist1$Tcounts)
nonAc=sum(inputlist1$nonA_number)
C_c[i] = Cc
G_c[i] = Gc
T_c[i] = Tc
nonA_c[i] = nonAc
all[i] = allc
}
}
data10C <- data.frame(base=base,C_c=C_c,G_c=G_c,T_c=T_c,nonA_c=nonA_c,all=all)
data10C <- data10C %>%
mutate(Crt = C_c / all * 1000,
Grt = G_c / all * 1000,
Trt = T_c / all * 1000,
nonArt = nonA_c / all * 1000)
d1 <- data10C %>%
select(base,Crt) %>%
mutate(type="C") %>%
rename(rt=Crt)
d2 <- data10C %>%
select(base,Grt) %>%
mutate(type="G") %>%
rename(rt=Grt)
d3 <- data10C %>%
select(base,Trt) %>%
mutate(type="T") %>%
rename(rt=Trt)
data10C <- rbind(d1,d2,d3)
data10C$base <- factor(data10C$base,levels=c("<12","12-24","24-36","36-48","48-60","60-72","72-84","84-96",">=96"))
# data10B$type <- factor(data10B$type,levels=c("non_A","C","G","T"))
p3 <- ggplot(data10C, aes(x=factor(base), y=rt, colour=type,group=type)) +
geom_line(size=1.5)+
geom_point(size=2.5)+
ggthemes::theme_base()+
scale_fill_manual(values=c("#7ED321","#F28D8E","#9BBFDC"))+
scale_color_manual(values=c("#7ED321","#F28D8E","#9BBFDC"))+
labs(x = "PAL(nt)", y = "Frequency of non-A residues (\u2030)") +
theme(legend.position="top",axis.text.x = element_text(angle=30, vjust=0.5))
# +
# scale_y_continuous(limits=c(0,10), breaks=seq(0,10,1))
resultlist$p3 <- p3
return(resultlist)
}
# tailViso ---------------------------------------------------
#' tailViso
#'
#' @description Visualize tails using heat maps and logos.
#' @details Visualize the tail input by the user so that the base composition of
#' different tails can be compared intuitively.
#' @param tails A dataframe. The set of tails that the user wants to visualize
#' contains at least one set of tail sequences and a unique identifier for
#' each tail.
#' @param tailLen Maximum tail length to draw.
#' @param Ntail Number of tails to draw.
#' @param custom A dataframe. Custom base color matching information, one column
#' is the base name, the other column is the corresponding color scheme.
#' @param strand The direction of the tail chain, + represents the polyA tail
#' and - represents the polyT tail.
#' @param faPath Tail sequence file path.
#' @param showLogo Logical value indicating whether to draw the LOGO diagram,
#' The default is T.
#' @param showReadNum Logical value indicating whether to display the sequence
#' identifier, default to T.
#' @return Return the picture.
#' @examples
#' library(ggmsa)
#' library(seqRFLP)
#' data(taildf)
#' my_cutstom <- data.frame(names=c("A","C","T","G"),color=c("#3171A5","#4EAA4C","#C9C4C2","#D73D3D"))
#' p <- tailViso(taildf,tailLen=100,Ntail=20,custom=my_cutstom,strand="-",faPath="D:/",showLogo=T,showReadNum= F)
#' @family Visualization functions
#' @seealso [DSA_ViolinPlot()] to draw a violin diagram of the result of
#' differential tail length analysis.
#' @export
tailViso <- function(taildf,tailLen,Ntail,custom,strand,faPath,showLogo,showReadNum){
##check the param
if(missing(showReadNum)){
showReadNum=F
}
if(missing(showLogo)){
showLogo <- T
}
if(!is.logical(showLogo)){
stop("ShowLogo must be a logical value")
}
if(!is.logical(showReadNum)){
stop("showReadNum must be a logical value")
}
if(missing(taildf)){
stop("taildf lossing")
}
if(missing(faPath)){
stop("faPath lossing")
}
if(!(strand %in% c("+","-"))){
stop("The STRAND parameter must be + or -")
}
if(missing(custom)){
my_cutstom <- data.frame(names=c("A","C","T","G"),color=c("#3171A5","#4EAA4C","#C9C4C2","#D73D3D"))
}
else{
my_cutstom <- custom
}
if(missing(Ntail)){
stop("Ntail lossing")
}
if(missing(tailLen)){
stop("tailLen lossing")
}
##plot
taildf <- select(taildf,read_num,tail,PAL)
if(Ntail>120){
print("Too many tails were input, only 120 tails were extracted for processing")
Ntail = 120
}
if(Ntail>dim(taildf)[1]){
Ntail = dim(taildf)[1]
}
if(strand == "+"){
seqs <- apply(taildf, 1, seqreverse)
taildf$tail <- seqs
}
taildf <- taildf[sample(nrow(taildf), Ntail), ]
maxTailL <- max(taildf$PAL)
if(maxTailL>=tailLen){
print("The input tail is too long, only the length you specify is cut for processing")
maxTailL = tailLen
}
taildf <- select(taildf,read_num,tail)
taildf$tail <- str_sub(taildf$tail,1,maxTailL)
taildf$tail <- str_pad(taildf$tail,maxTailL,side = "right",pad = "-")
filepath=str_c(faPath,"subseq.fasta")
fa <- dataframe2fas(as.data.frame(taildf),filepath)
if(showLogo){
p <- ggmsa(filepath, start=1,end=80,font = T,custom_color=my_cutstom,
char_width=0.7,seq_name=showReadNum) +
geom_seqlogo("helvetical",top=T,adaptive=T,custom_color=my_cutstom)
}
else{
p <- ggmsa(filepath, start=1,end=80,font = T,custom_color=my_cutstom,
char_width=0.7,seq_name=showReadNum)
}
plot(p)
return(p)
}
# DSA_ViolinPlot ----------------------------------------------------------
#' DSA_ViolinPlot
#'
#' @description Draw a violin diagram of the result of differential tail length
#' analysis.
#' @details The tail length difference analysis results of the four difference
#' significance test methods are displayed through the violin diagram. The
#' tail length distribution under different conditions is represented by
#' different colors to facilitate users to view the differences.
#' @param DSA_result The output of PALdsa().
#' @param SAoDMethod one of "KS", "MWU", "ME", "Wilcox" and "ALL".
#' @return A ViolinPlot.
#' @family Visualization functions
#' @seealso [DSA_UpsetPlot()] to draw a Upset diagram of the result of
#' differential tail length analysis.
#' @export
DSA_ViolinPlot <- function(DSA_result,SAoDMethod){
#data input
KS_re <- filter(DSA_result$re2,KS_re<=0.05)[,1]
KS_count <- str_c("K-S test\n",dim(KS_re)[1]," genes")
KS_re <- DSA_result$re1 %>%
filter(gene %in% KS_re$gene) %>%
select(PAID,PAL) %>%
mutate(method=KS_count)
Wilcox_re <- filter(DSA_result$re2,wilcox_re<=0.05)[,1]
Wilcox_count <- str_c("Wilcox test\n",dim(Wilcox_re)[1]," genes")
Wilcox_re <- DSA_result$re1 %>%
filter(gene %in% Wilcox_re$gene) %>%
select(PAID,PAL) %>%
mutate(method=Wilcox_count)
moses_re <- filter(DSA_result$re2,moses_re<=0.05)[,1]
moses_count <- str_c("MosesExtreme reaction\n",dim(moses_re)[1]," genes")
moses_re <- DSA_result$re1 %>%
filter(gene %in% moses_re$gene) %>%
select(PAID,PAL) %>%
mutate(method=moses_count)
MU_re <- filter(DSA_result$re2,MU_re<=0.05)[,1]
MU_count <- str_c("M-W U test\n",dim(MU_re)[1]," genes")
MU_re <- DSA_result$re1 %>%
filter(gene %in% MU_re$gene) %>%
select(PAID,PAL) %>%
mutate(method=MU_count)
if(SAoDMethod == "ALL"){
plot_data <- rbind(KS_re,Wilcox_re,moses_re,MU_re)
P <- ggplot(plot_data, aes(x=method, y=PAL, fill=PAID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.2,position=position_dodge(0.9),color="black")+
scale_fill_manual(values = c("#ef857d","#00b0f0"))+
scale_color_manual(values = c("#ef857d","#00b0f0"))+
theme_base()+
labs(x = "", y = "PAL per 3'UTR isform") +
theme(legend.position="top",
axis.text.y=element_text(family="Arial",size=10,face="plain"),
axis.title.y=element_text(family="Arial",size = 12,face="plain"),
panel.border = element_blank(),axis.line = element_line(colour = "black",size=1),
legend.text=element_text(face="plain", family="Arial", colour="black",
size=10),
legend.title=element_text(face="plain", family="Arial", colour="black",
size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(P)
}
else if(SAoDMethod == "KS"){
plot_data <- KS_re
P <- ggplot(plot_data, aes(x=method, y=PAL, fill=PAID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.2,position=position_dodge(0.9),color="black")+
scale_fill_manual(values = c("#ef857d","#00b0f0"))+
scale_color_manual(values = c("#ef857d","#00b0f0"))+
theme_base()+
labs(x = "", y = "PAL per 3'UTR isform") +
theme(legend.position="top",
axis.text.y=element_text(family="Arial",size=10,face="plain"),
axis.title.y=element_text(family="Arial",size = 12,face="plain"),
panel.border = element_blank(),axis.line = element_line(colour = "black",size=1),
legend.text=element_text(face="plain", family="Arial", colour="black",
size=10),
legend.title=element_text(face="plain", family="Arial", colour="black",
size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(P)
}
else if(SAoDMethod == "MWU"){
plot_data <- MU_re
P <- ggplot(plot_data, aes(x=method, y=PAL, fill=PAID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.2,position=position_dodge(0.9),color="black")+
scale_fill_manual(values = c("#ef857d","#00b0f0"))+
scale_color_manual(values = c("#ef857d","#00b0f0"))+
theme_base()+
labs(x = "", y = "PAL per 3'UTR isform") +
theme(legend.position="top",
axis.text.y=element_text(family="Arial",size=10,face="plain"),
axis.title.y=element_text(family="Arial",size = 12,face="plain"),
panel.border = element_blank(),axis.line = element_line(colour = "black",size=1),
legend.text=element_text(face="plain", family="Arial", colour="black",
size=10),
legend.title=element_text(face="plain", family="Arial", colour="black",
size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(P)
}
else if(SAoDMethod == "Wilcox"){
plot_data <- Wilcox_re
P <- ggplot(plot_data, aes(x=method, y=PAL, fill=PAID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.2,position=position_dodge(0.9),color="black")+
scale_fill_manual(values = c("#ef857d","#00b0f0"))+
scale_color_manual(values = c("#ef857d","#00b0f0"))+
theme_base()+
labs(x = "", y = "PAL per 3'UTR isform") +
theme(legend.position="top",
axis.text.y=element_text(family="Arial",size=10,face="plain"),
axis.title.y=element_text(family="Arial",size = 12,face="plain"),
panel.border = element_blank(),axis.line = element_line(colour = "black",size=1),
legend.text=element_text(face="plain", family="Arial", colour="black",
size=10),
legend.title=element_text(face="plain", family="Arial", colour="black",
size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(P)
}
else if(SAoDMethod == "ME"){
plot_data <- moses_re
P <- ggplot(plot_data, aes(x=method, y=PAL, fill=PAID)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.2,position=position_dodge(0.9),color="black")+
scale_fill_manual(values = c("#ef857d","#00b0f0"))+
scale_color_manual(values = c("#ef857d","#00b0f0"))+
theme_base()+
labs(x = "", y = "PAL per 3'UTR isform") +
theme(legend.position="top",
axis.text.y=element_text(family="Arial",size=10,face="plain"),
axis.title.y=element_text(family="Arial",size = 12,face="plain"),
panel.border = element_blank(),axis.line = element_line(colour = "black",size=1),
legend.text=element_text(face="plain", family="Arial", colour="black",
size=10),
legend.title=element_text(face="plain", family="Arial", colour="black",
size=12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
return(P)
}
else{
stop("SAoDMethod must be one of c('KS', 'MWU', 'ME', 'Wilcox', 'ALL')!")
}
}
# DSA_UpsetPlot -----------------------------------------------------------
#' DSA_UpsetPlot
#'
#' @description Draw a Upset diagram of the result of differential tail length
#' analysis.
#' @details Overlapping analysis of the results of differential tail length
#' analysis of the four difference significance testing methods can help users
#' quickly determine which results of differential tail length analysis are
#' highly confident. The consensus results of the four methods are marked in
#' red to represent highly confident tail length genes.
#' @param DSA_result The output of PALdsa().
#' @return A UpsetPlot.
#' @family Visualization functions
#' @seealso [DSA_ViolinPlot()] to draw a violin diagram of the result of
#' differential tail length analysis.
#' @export
DSA_UpsetPlot <- function(DSA_result){
# library(UpSetR)
plot_data <- DSA_result[,-c(2,3)]
plot_data[plot_data == "lose"] <- "0"
plot_data[is.na(plot_data)] <- "0"
plot_data[plot_data == "NaN"] <- "0"
plot_data_col1 <- as.character(plot_data$gene)
plot_data <- plot_data[,-1]
plot_data <- as.data.frame(lapply(plot_data,as.numeric))
plot_data[plot_data < 0.05] <- 0.05
plot_data[plot_data > 0.05] <- 0
plot_data[plot_data == 0.05] <- 1
plot_data$gene <- plot_data_col1
plot_data <- plot_data %>%
rename(KS=KS_re,Wilcox=wilcox_re,Moses=moses_re,MU=MU_re)
p <- upset(plot_data, sets = c("KS", "Wilcox", "Moses", "MU"),
mb.ratio = c(0.6, 0.4), order.by = "freq",
queries = list(list(query=intersects,
params=list("KS", "Wilcox", "Moses", "MU"),
color="#DD3737", active=T)),
sets.bar.color = c("#b681bd","#377eb8","#7cc47a","#e06868"),
nsets = 7, number.angles = 0, point.size = 6, line.size = 1,
mainbar.y.label = "overlap gene counts",
sets.x.label = "gene counts", text.scale = c(1, 1,1))
return(p)
}
# PALdsa ------------------------------------------------------------------
#' PALdsa
#'
#' @description Analysis of significant difference of tail length under
#' different conditions.
#' @details The poly(A) tail length of genes with two PACs was analyzed for
#' significant difference.It is feasible to directly analyze whether there is
#' a significant difference in tail length between the two PACs, or to
#' annotate PACs as the proximal PA site and the distal PA site before
#' conducting the significance analysis of tail length difference.
#' @param PAdf A dataframe of PA infomation without merge.
#' @param PALdf A dataframe that contains the length information of all reads
#' tails.
#' @param gff Genome annotation stored in a GFF/GTF file or a TXDB R object can
#' be used for annotating PACs. Please refer to movAPA for details.
#' @param d distance to group nearby PACds, default is 24 nt.
#' @param mode The PAL comparison mode, "PD" refers to comparing the tail
#' length difference between the proximal and distal PA sites of each gene
#' with two PACs."2PA" represents a direct comparison of tail length
#' differences between the two PA loci of genes with two PACs, default for
#' "PD".
#' @param withViolinPlot Logical value variable, Whether to draw a violin
#' diagram, see function DSA_ViolinPlot() for details, default is true.
#' @param withUpsetPlot Logical value variable, Whether to draw a upset
#' diagram, see function DSA_UpsetPlot() for details, default is true.
#' @param SAoDMethod one of "KS", "MWU", "ME", "Wilcox" and "ALL".
#' @return By default, this function returns a list of two graphs and a
#' dataframe, or only a dataframe if no drawing operation has been performed.
#' The result dataframe contains 7 column contents, the first column is
#' geneID, and the second and third columns are the median tail lengths under
#' different conditions. The remaining four columns are p-values calculated by
#' different difference significance test methods, each column corresponds to
#' a difference significance test method, in which "lose" or "NAN" means that
#' for some reason (probably too little data) this method has not been able to
#' calculate the exact p-values.
#' @examples
#' library(movAPA)
#' library(TxDb.Mmusculus.UCSC.mm10.knownGene)
#' gff <- parseGenomeAnnotation(TxDb.Mmusculus.UCSC.mm10.knownGene)
#' data(AnnotedTails)
#' files = system.file("extdata", "./output/PAs/PAs.txt", package = "PolyAtailor", mustWork = TRUE)
#' PAs <- read.table(files,header=TRUE,sep=" ")
#' diffPAL2PAgenes <-
#' PALdsa(PAs,AnnotedTails,gff,mode="PD",SAoDMethod="ME",withViolinPlot=TRUE,withUpsetPlot=F)
#' @family Visualization functions
#' @seealso [DSA_ViolinPlot()] to draw a violin diagram of the result of
#' differential tail length analysis.
#' @export
PALdsa <- function(PAdf,PALdf,gff,d,mode,withViolinPlot,withUpsetPlot,SAoDMethod){
#check the paraments
if(missing(PAdf) | missing(PALdf)){
stop("Missing important parameters: PAdf or PALdf!")
}
if(missing(gff)){
stop("Missing the annotations file!please check!")
}
if(missing(d)){
d = 24
}
if(missing(SAoDMethod)){
SAoDMethod = "ALL"
}
if(!is.integer(d) & !is.numeric(d)){
stop("Parameter d must be an integer!")
}
if(missing(mode)){
mode = "PD"
}
if(!(mode %in% c("PD","2PA"))){
stop("The mode parameter must be one of c('PD','2PA')!")
}
if(missing(withViolinPlot)){
withViolinPlot = T
}
if(missing(withUpsetPlot)){
withUpsetPlot = T
}
#functions
if(mode == "PD"){
all_re = diffPALdpPA(PAdf,PALdf,gff,d=d)
}
if(mode == "2PA"){
all_re = diffPAL2PAgene(PAdf,PALdf,gff,d=d)
}
re <- list()
re$DSAresult = all_re$re2
if(withViolinPlot){
p1 = DSA_ViolinPlot(all_re,SAoDMethod)
re$ViolinPlot = p1
plot(p1)
}
if(withUpsetPlot){
p2 = DSA_UpsetPlot(all_re$re2)
re$UpsetPlot = p2
}
return(re)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.