#!/usr/bin/env Rscript
## TESTING SEGMENT LENGTH DIST AND AGAINST ANNOTATED GENES AND TRANSCRIPTS
library(segmenTools)
##segtools <- "~/programs/segmenTools/"
##source(file.path(segtools,"R/segmenTools.R")) # for segment analysis
##source(file.path(segtools,"R/coor2index.R")) # coor2index
library(cluster) # for pam clustering
library(optparse) # command-line options
## nicer timestamp
time <- function() format(Sys.time(), "%Y%m%d %H:%M:%S")
## cluster/segment colors; function derived from scale_colour_hue in ggplot2
color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
grDevices::hcl(h = hues, l = 65, c = 100)[1:n]
}
### OPTIONS
option_list <- list(
make_option(c("-i", "--infile"), type="character", default="",
help="chromosome coordinates of primary segments as produced by clusterSegments.R but without header ('allsegs.csv')"),
make_option(c("--chrfile"), type="character", default="",
help="chromosome index file, providing a sorted list of chromosomes and their lengths in column 3 [default %default]"),
make_option(c("--qtypes"), type="character", default="",
help="sub-set testset in column 'type'"),
make_option(c("--qtypcol"), type="character", default="type",
help="name of column with sub-set annotation"),
##chrfile = $YEASTDAT/chromosomes/sequenceIndex_R64-1-1_20110208.csv
## SEGMENT TEST SETTINGS
make_option(c("--fuse.segs"), action="store_true", default=FALSE,
help="use FUSE tag from clusterSegments to fuse adjacent segments"),
make_option(c("--target"), type="character", default="",
help="target set of chromosomal segments"),
make_option(c("--ttypes"), type="character", default="",
help="sub-set testset in column 'type'"),
make_option(c("--ttypcol"), type="character", default="type",
help="name of column with sub-set annotation"),
make_option(c("--tcolcol"), type="character", default="",
help="name of column with sub-set colors for plots"),
make_option("--ovlth", default=0.8,
help="overlap threshold (mutual coverage) to be counted as a direct hit; at least ovlth*length must be reached for both query (segments) and targets (test set) [default %default]"),
make_option("--minj", default=0.8,
help="minimal jaccard index, the fraction of tests above this threshold are reported and the cutoff line plotted [default %default]"),
make_option("--minf", default=0.2,
help="minimal fraction of testsets; the jaccard index at this threshold is reported and the cutoff line plotted [default %default]"),
make_option(c("--randomize"), action="store_true", default=FALSE,
help="additionally calculate overlaps for randomized target segments"),
## OUTPUT OPTIONS
make_option(c("--out.path"), type="character", default=".",
help="directory path for output data (figures, csv files)"),
make_option(c("--testid"), type="character", default="testset",
help="ID of the testset, used for folder and file names"),
make_option(c("-v", "--verb"), type="integer", default=1,
help="0: silent, 1: main messages, 2: warning messages"),
make_option(c("--fig.type"), type="character", default="png",
help="figure type, png or pdf [default %default]"),
make_option(c("--save"), action="store_true", default=FALSE,
help="save complete overlap data as one RData file (big!)"),
make_option(c("--save.rdata"), action="store_true", default=FALSE,
help="save overlap data as individual RData files for segment types"))
## get command line options
opt <- parse_args(OptionParser(option_list=option_list))
## process comma-separated list arguments
lst.args <- c(ttypes="character",qtypes="character")
for ( i in 1:length(lst.args) ) {
idx <- which(names(opt)==names(lst.args)[i])
opt[[idx]] <- unlist(strsplit(opt[[idx]], ","))
##for ( j in 1:length(opt[[idx]]) ) {
## tmp <- strsplit(opt[[idx]][j], ":")
##}
mode(opt[[idx]]) <- lst.args[i]
}
## promote options to main environment and print all arguments
if ( opt$verb>0 )
cat(paste("SETTINGS:\n"))
for ( i in 1:length(opt) ) {
if ( opt$verb>0 )
cat(paste("\t",names(opt)[i], ":", #typeof(opt[[i]]),
paste(opt[[i]],collapse=", "), "\n"))
arg <- names(opt)[i]
assign(arg, opt[[arg]])
}
if ( verb>0 )
cat(paste("\n"))
### START
## load chromosome index
if ( verb>0 )
cat(paste("Loading chromosome index file:", chrfile, "\n"))
cf <- read.table(chrfile,sep="\t",header=FALSE)
chrS <- c(0,cumsum(cf[,ncol(cf)])) ## index of chr/pos = chrS[chr] + pos
## READ SEGMENTS TO BE TESTED
if ( verb>0 )
cat(paste("LOADING SEGMENTS:", infile, "\n"))
segs <- read.table(infile,sep="\t",header=TRUE, comment.char="")
#' uses a logical column to fuse adjacent segments, i.e. the
#' lower segment i with segment i-1, where segments
#' is an ordered list of segments
fuseSegments <- function(segments, col="fuse", val=1) {
fuse <- segments[2:nrow(segments),col] == val
cat(paste("NOTE: FUSING", sum(fuse), "SEGMENTS, from segment types:\n",
paste(unique(segments[fuse,"type"]),collapse="; "),"\n"))
fsegs <- segments[c(TRUE,!fuse),]
fsegs[,"end"] <- segments[c(!fuse,TRUE),"end"]
fsegs
}
## use "fuse" column from clusterSegments.R fuse filter!
if ( fuse.segs ) {
if ( verb>0 )
cat(paste("fusing segments by fuse tag\n"))
segs <- fuseSegments(segs,col="fuse",val=1)
}
## replace genome coordinates by continuous index
segs <- coor2index(segs,chrS)
## filter types
qtypes <- qtypes[qtypes!=""]
if ( length(qtypes)==0 ) {
qtypes <- as.character(segs[,qtypcol])
qtypes[qtypes==""] <- "na"
qtypes[is.na(qtypes)] <- "na"
} else {
## filter to types given by cmdline arg qtypes
segs <- segs[as.character(segs[,qtypcol])%in%qtypes,]
qtypes <- as.character(segs[,qtypcol]) # get remaining
}
## split by type
lst <- split(segs,as.character(segs[,qtypcol]))
sgtypes <- names(lst)
debug <- FALSE
if ( debug ) { ## test only large number
sgnum <- unlist(lapply(lst,nrow))
sgtypes <- sgtypes[sgnum>1000] ### TODO: remove this and fuse types
## only selected!
best.selected <- c("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:1_M:75_nui:1",
##"D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:2_M:150_nui:2",
"D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:2_M:150_nui:3",
"D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:3_M:200_nui:3")
worst.selected <- c("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:1_M:200_nui:1",
"D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:3_M:75_nui:3")
## concatenate: numbered 1--5:
selected <- c(worst.selected[1],best.selected,worst.selected[2])
sgtypes <- selected
}
## define colors and pch for segment types
sgcols <- rainbow(length(sgtypes))
sgpchs <- rep(1:17, len=length(sgtypes)) #only 17 of 18 to avoid multiple of lty
sgltys <- rep(1:6, len=length(sgtypes))
names(sgcols) <- names(sgpchs) <- names(sgltys) <- sgtypes
## get segment classes
## TODO: rm sgclasses?
## this is only used for PAM clustering below and
## was specific for analysis of segmenTier results 2017
sgclasses <- getSegmentClasses(sgtypes, sep="_")
## col, pch and lty for classes
sgclcols <- rainbow(length(sgclasses))
sgclpchs <- rep(1:17, len=length(sgclasses)) #only 17 to avoid lty multiple
sgclltys <- rep(1:6, len=length(sgclasses))
names(sgclcols) <- names(sgclpchs) <-names(sgclltys) <- sgclasses
## new: segment class table, only used for PAM clustering
sgcltab <- getSegmentClassTable(sgtypes, sep="_")
### START ANALYSIS
## output dirs - generate locally in cwd
dir.create(out.path) # length distributions
## output dir
dir.create(file.path(out.path,testid),recursive=TRUE) # test set
### TEST AGAINST OTHER DATA SETS
if ( verb>0 )
cat(paste("LOADING TEST SETS:", testid, "-", target, "\n"))
trgs <- read.table(target,sep="\t",header=TRUE,
stringsAsFactors=FALSE, comment.char = "")
trgs <- coor2index(trgs,chrS) # map to continuous index!
tpcol <- ttypcol
cpcol <- tcolcol
## test types
ttypes <- ttypes[ttypes!=""]
if ( length(ttypes)==0 ) {
ttypes <- as.character(trgs[,tpcol])
ttypes[ttypes==""] <- "na"
ttypes[is.na(ttypes)] <- "na"
} else {
## filter to types given by cmdline arg ttypes
trgs <- trgs[as.character(trgs[,tpcol])%in%ttypes,]
ttypes <- as.character(trgs[,tpcol]) # get remaining
}
test.types <- sort(names(table(ttypes))) # sort by number
if ( cpcol=="" ) {
# TODO: rgb range1:length(test.types)
tcols <- sub("00$","",rainbow(length(test.types),alpha=0))
names(tcols) <- test.types
} else {
## just take first color for each type,
## user must ensure that these are unique
tcols <- sapply(test.types,
function(x)
as.character(trgs[which(trgs[,tpcol]==x)[1],cpcol]))
}
tcols["na"] <- "#939393"
## results
ovlstats <- rep(list(NA),length(test.types))
names(ovlstats) <- test.types
#randomize <- TRUE # FALSE #
if ( randomize )
rndstats <- ovlstats
## loop over targets (transcript and ORF data sets from SGD)
for ( test.type in test.types ) {
if ( verb> 0 )
cat(paste(test.type, "\t",time(),"\n"))
target <- trgs[ttypes==test.type,]
## TODO: integrate randomization in analysis?
if ( randomize )
target.rnd <- randomSegments(target) ## TODO: with total or not?
## result list: overlap statistics
ostat <- rep(list(NA),length(sgtypes))
names(ostat) <- sgtypes
ovlstats[[test.type]] <- ostat
if ( randomize )
rndstats[[test.type]] <- ostat
## loop over queries (segment types)
for ( type in sgtypes ) {
if ( verb>0 )
cat(paste("#",which(sgtypes==type), type,
"overlap with",test.type,"\t",time(),"\n"))
sgs <- lst[[type]]
## for testing
devel <- FALSE
if ( devel )
sgs <- sgs[1:1000,]
ovl <- segmentAnnotate(query=sgs, target=target,
add.na=TRUE, details=TRUE,
untie=FALSE, collapse=FALSE, sort=FALSE,
msgfile=stdout())
if ( !any(!is.na(ovl[,"query"])) ) next
sts <- getOverlapStats(ovl, ovlth=ovlth, minj=minj, minf=minf,
hrng=c(.8,1.2),
tnum=nrow(target),qnum=nrow(sgs),
qid=type, tid=test.type)
ovlstats[[test.type]][[type]] <- sts
if ( randomize ) {
rnd <- segmentAnnotate(query=sgs, target=target.rnd,
add.na=TRUE, details=TRUE,
untie=FALSE, collapse=FALSE, sort=FALSE,
msgfile=stdout())
#if ( !any(!is.na(ovl[,"query"])) ) next
sts.rnd <- getOverlapStats(rnd, ovlth=ovlth, minj=minj, minf=minf,
hrng=c(.8,1.2),
tnum=nrow(target.rnd),qnum=nrow(sgs),
qid=type, tid=test.type)
rndstats[[test.type]][[type]] <- sts.rnd
}
## TODO: save results as files
}
}
if ( save )
save.image(file.path(out.path,testid,"overlaps.RData"))
## REMOVE EMPTY TESTS
for ( test.type in test.types ) {
cat(paste("checking", test.type, "\n"))
lst <- ovlstats[[test.type]]
## rm empty
len <- unlist(lapply(lst, function(x) length(x) ))
if ( any(len==1) )
cat(paste("\tremoving",sum(len==1),"empty tests:",
paste(names(lst)[len==1],collapse="; "),"\n"))
lst <- lst[len>1]
## re-attach
ovlstats[[test.type]] <- lst
if ( randomize ) {
cat(paste("\trandomized\n"))
lst <- rndstats[[test.type]]
## rm empty
len <- unlist(lapply(lst, function(x) length(x) ))
if ( any(len==1) )
cat(paste("\tremoving",sum(len==1),"empty tests:",
paste(names(lst)[len==1],collapse="; "),"\n"))
lst <- lst[len>1]
## re-attach
rndstats[[test.type]] <- lst
}
}
## COLLATE DATA AND WRITE OUT RESULTS
## export hitnum, jaccard, numhit and ratio thresholds
for ( test.type in test.types ) {
#if ( is.null(ovlstats[[test.type]]) ) next
covlStats <- collectOvlStats(ovlstats, type=test.type)
ids <- covlStats$nms
jaccard <- covlStats$jaccard # jaccard of best hits
j.prcnt<- covlStats$j.prcnt# percent of targets covered with J>threshold
j.cutoff<- covlStats$j.cutoff# jaccard index at CDF(threshold)
height <- covlStats$height # target recovery fraction within threshold
hitnum <- covlStats$hitnum # total number of 'good' hits
numhit <- covlStats$numhit # average number of hits per target
tnum <- covlStats$tnum # number of tested targets
tnum <- covlStats$tnum # number of tested targets
result <- data.frame(ID=ids, tnum=tnum, hits=hitnum, Jaccard=jaccard,
JaccPrcnt=j.prcnt, JaccCutoff=j.cutoff,
hits.per.target=numhit,
ratio.low=height[,1], ratio.high=height[,2])
## write out table of segmentation characteristics
file.name <- file.path(out.path,testid,paste("segmentRecovery_",test.type,
".csv",sep=""))
write.table(result,file=file.name, sep="\t",
col.names=TRUE,row.names=FALSE,quote=FALSE)
}
## directly compare real vs. random targets
if ( randomize ) {
rnd.path <- file.path(out.path,testid,"randomize")
dir.create(rnd.path)
x <- seq(0,4,.01)
totN <- length(test.types)*length(sgtypes)
relative <- rep(list(NA), totN)
for ( test.type in test.types ) {
for ( type in sgtypes ) {
rcdf.real <- rcdf.rand <- NULL
if ( length(ovlstats[[test.type]][[type]])>1 )
rcdf.real <- ovlstats[[test.type]][[type]]$CDF$rcdf
if ( length(rndstats[[test.type]][[type]])>1 )
rcdf.rand <- rndstats[[test.type]][[type]]$CDF$rcdf
file.name <- file.path(rnd.path,paste0(test.type,"_",type))
plotdev(file.name,width=4,height=3.5,type=fig.type)
par(mai=c(.5,.5,.5,.1),mgp=c(1.3,.4,0),tcl=-.2)
plot(1,col=NA,xlim=range(x),ylim=c(0,1),
main=paste(test.type, "recovery"),
xlab="segment length/target length",ylab="ECDF")
abline(h=0:1, lty=2, lwd=.5, col="gray70")
abline(h=c(minf,.8), lty=2, lwd=.5, col="gray70")
abline(v=c(ovlth,2-ovlth), lty=2, lwd=.5, col="gray70")
if ( !is.null(rcdf.real) )
lines(x, rcdf.real(x), col=1,lty=1,lwd=2)
if ( !is.null(rcdf.rand) )
lines(x, rcdf.rand(x), col=2,lty=2,lwd=2)
legend("topleft",legend=c("real","random"),lty=1:2,col=1:2,lwd=2,
bty="n")
dev.off()
}
}
}
#### PLOT OF OVERLAP STATISTICS
sets <- c("ovl")
if ( randomize )
sets <- c("ovl","rnd")
for ( set in sets ) {
stats <- get(paste0(set,"stats"))
fname <- ifelse(set=="rnd","_random","")
## remove empty results!
rm <- unlist(lapply(stats, function (x)
!any(unlist(lapply(x, function(y) y$hitnum))>0)))
stats <- stats[!rm]
test.types <- test.types[!rm]
### PLOT BY TEST TYPES
for ( test.type in test.types ) {
if ( !test.type%in%names(stats) ) {
warning(test.type, " not found in stats object for set: ", set)
next
}
cstats <- collectOvlStats(stats, type=test.type)
CDF <- cstats$CDF
jaccard <- cstats$jaccard
height <- cstats$height
hitnum <- cstats$hitnum
numhit <- cstats$numhit
qnum <- cstats$qnum
tnum <- cstats$tnum
nms <- cstats$nms
sgnum <- length(sgtypes) ## skip clustering if only one!
## TODO: mv this to getOverlapStats
## MAX vs. MIN CLUSTERING cluster jaccard vs. numhits vs segment classes
## TODO: also include height in clustering?
pm <- NULL
if ( sgnum>7 & ncol(sgcltab)>1) { # TODO: pmcol used below skip if
K <- 7
dat <- cbind((jaccard-min(jaccard))/(max(jaccard)-min(jaccard)),
(numhit-min(numhit))/(max(numhit)-min(numhit)),
height)
pm <- pam(dat, K)
cllst <- apply(sgcltab,2, unique)
## TODO: sort clustering by increasing height!
pmsrt <- split(dat[,3],pm$clustering)
pmmn <- unlist(lapply(pmsrt, mean))
pmsrt <- order(as.numeric(names(pmmn)[order(pmmn)]))
## sorted clustering
pmcls <- pmsrt[pm$clustering]
pmcol <- rainbow(length(pmsrt))
names(pmcol) <- 1:length(pmsrt)
## write out table of segmentation characteristics
result <- data.frame(ID=nms, CL=pmcls)
file.name <- file.path(out.path,testid,
paste("segmentRecovery_",test.type,
"_clusters",fname,".csv",sep=""))
write.table(result,file=file.name, sep="\t",
col.names=TRUE,row.names=FALSE,quote=FALSE)
## strange bug: 75 in list of 75,100,150 gets a leading space
## TODO: names lost for $rundir/test.csv !?
## trim all:
cllst <- lapply(cllst,trimws)
## sor numeric! K, S, E, M, nui
cllst <- lapply(cllst, function(x) {
if ( suppressWarnings(all(!is.na(as.numeric(x)))) )
x <- as.character(sort(as.numeric(x)))
x})
allcl <- unlist(sapply(1:length(cllst),
function(x) paste(names(cllst)[x],
cllst[[x]],sep=".")))
enum <- matrix(NA, nrow=K, ncol=length(allcl))
colnames(enum) <- allcl
rownames(enum) <- 1:K
pval <- enum
## get cumulative hypergeometric distribution of clustering vs. segments
for ( i in 1:K )
for ( j in 1:ncol(sgcltab) ) {
clcl <- clusterCluster(pmcls==i,sgcltab[,j])
cln <- colnames(sgcltab)[j]
cln <- paste(cln,colnames(clcl$overlap),sep=".")
if ( "TRUE" %in% rownames(clcl$overlap) ) {
enum[i,cln] <- clcl$overlap["TRUE",]
pval[i,cln] <- clcl$p.value["TRUE",]
}
}
file.name <- file.path(out.path,testid,
paste(test.type,"_segmentationClusters",
fname,sep=""))
plotdev(file.name,width=4.5,height=4.5,type=fig.type)
par(mai=c(.7,.5,.1,.1),mgp=c(1.7,.5,0))
image_matrix(-log2(pval) ,text=enum, axis=1:2,
col=c("#FFFFFF",rev(grey.colors(20))),
axis2.col=pmcol[1:nrow(pval)],
xlab=NA,ylab=NA)
abline(v=cumsum(unlist(lapply(cllst, length)))+.5)
axis(1, at=cumsum(unlist(lapply(cllst, length)))+.5, tck=-.1,labels=NA)
dev.off()
## TODO: plot by segment classes as length dist
## new lower and upper threshold of ratio
## OPT: uppler left,
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal_lh_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
plot(height,xlab=expression(R["short"]),ylab=expression(R["long"]),
col=NA)
legend("topleft","good",bty="n",text.font=2)
points(height,col=pmcol[pmcls],pch=20) #sgpchs[nms])
abline(v=.2,lty=2)
abline(h=.8,lty=2)
dev.off()
}
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal_lh",fname,sep=""))
plotdev(file.name,width=10,height=5,type=fig.type)
par(mfcol=c(1,2),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
plot(height,xlab="fraction: ratio < 0.8",ylab="fraction: ratio < 1.2",
col=sgcols[nms],pch=sgpchs[nms])
abline(v=.2,lty=2)
abline(h=.8,lty=2)
par(mai=c(.1,.1,.1,.1))
plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
cex=.5,bty="n",ncol=2)
dev.off()
## CDF of absolute best hit CDF (rcdf)
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf", col=sgcols, lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="ratio: query length/target length")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
## CONSSEG FIGURE: consensus segmentation vs. segmenTier input
if ( Sys.Date() < as.Date("2021-02-01") ) {
ccols <- sgcols
ccols["consensus"] <- "#000000"
ccols[names(ccols)!="consensus"] <- c(2:6)
clwd <- rep(1, length(sgcols))
names(clwd) <- names(sgcols)
clwd["consensus"] <- 2
clty <- rep(2, length(sgcols))
names(clty) <- names(sgcols)
clty["consensus"] <- 1
## CDF of absolute best hit CDF (rcdf)
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal_consseg",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type,res=300)
par(mai=c(.5,.5,.125,.125),mgp=c(1.3,.4,0),xaxs="i")
plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf",
lwd=clwd,
col=ccols, lty=clty,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="ratio: query length/target length")
legend("topleft",paste("target:", test.type))#,"-",tnum))
nms <- gsub("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_","",names(clwd))
legend("right", nms, lwd=clwd, col=ccols, lty=clty,
bg="white",box.col=NA,cex=.7)
dev.off()
}
## CDF of absolute best hit CDF (rcdf) - cluster colors
if ( !is.null(pm) ) {
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf",
col=pmcol[pmcls], lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="R: query length/target length",range="lines")
legend("topleft",paste(test.type,"-",tnum))
rect(1.4,.26,1.9,.34,col="#FFFFFFBB",border=NA)
text(1.65,.3,"too long",font=2, col=pmcol[1])
rect(.1,.56,0.6,.64,col="#FFFFFFBB",border=NA)
text(.35,.6,"too short",font=2, col=pmcol[length(pmcol)])
points(1.2,.1,pch=4)
text(1.2,.1,expression(R["long"]),pos=4,font=2)
points(.8,.7,pch=4)
text(.8,.7,expression(R["short"]),pos=2,font=2)
dev.off()
}
## MAX vs. MIN: jaccard of best hits vs. num hits per target
file.name <- file.path(out.path,testid,
paste(test.type,"_jaccard_fragmentation",
fname,sep=""))
plotdev(file.name,width=10,height=4,type=fig.type)
par(mfcol=c(1,2),mai=c(1,1,.1,.1))
plot(jaccard,numhit,
col=sgcols[nms],pch=sgpchs[nms],
ylab="average hits per target sequence",
xlab="jaccard: intersect/union")#,
par(mai=c(.1,.1,.1,.1))
plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
cex=.5,bty="n",ncol=2)
dev.off()
## MAX vs. MIN CLUSTERS - JACCARD
if ( !is.null(pm) ) {
file.name <- file.path(out.path,testid,
paste(test.type,
"_jaccard_fragmentation_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
#par(mfcol=c(1,1),mai=c(1,1,.1,.1))
plot(jaccard,numhit,col=NA,
ylab="average hits per target sequence",
xlab="jaccard: intersect/union")#,
mxx <- max(jaccard)
mny <- min(numhit)
points(jaccard,numhit,col=pmcol[pmcls],pch=20) #sgpchs[nms])
#rect(.1,.01,0.6,.09,col="#FFFFFFBB",border=NA)
#text(.35,.05,"good",font=2)
legend("bottomright","good",bty="n", text.font=2) #bg="#FFFFFFBB",box.col=NA, text.font=2)
##par(mai=c(1,.7,.1,.1))
##image_matrix(-log2(pval) ,text=enum, axis=1:2,
## col=c("#FFFFFF",rev(grey.colors(20))),
## axis2.col=1:nrow(pval),
## xlab=NA,ylab=NA)
##axis(1, at=cumsum(unlist(lapply(cllst,length)))+.5, tck=-1,labels=NA)
dev.off()
}
## MAX vs. MIN: ratio heights vs. num hits per target
## TODO: plot name legend separately and adaptive!
file.name <- file.path(out.path,testid,
paste(test.type,"_ratio_fragmentation",fname,sep=""))
plotdev(file.name,width=10,height=4,type=fig.type)
par(mfcol=c(1,2),mai=c(1,1,.1,.1))
plot(apply(height,1,diff), numhit,
col=sgcols[nms],pch=sgpchs[nms],
ylab="average hits per target sequence",
xlab="fraction: 0.8 < ratio < 1.2")#,
par(mai=c(.1,.1,.1,.1))
plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
cex=.5,bty="n",ncol=2)
dev.off()
## MAX vs. MIN CLUSTERS - good hits per target vs. num hits per target
if ( !is.null(pm) ) {
file.name <-file.path(out.path,testid,
paste(test.type,"_ratio_fragmentation_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
plot(apply(height,1,diff), numhit,
col=pmcol[pmcls],pch=sgpchs[nms],
ylab="average hits per target sequence",
xlab="fraction: 0.8 < ratio < 1.2")#,
imgdat <- t(apply(-log2(pval), 2, rev))
dev.off()
}
### BELOW PERHAPS NOT REQUIRED
## CDF PLOTS
## CDF of jaccard (jcdf)
if ( !is.null(pm) ) {
file.name <- file.path(out.path,testid,
paste(test.type,"_jaccard_cdf_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="jcdf",
col=pmcol[pmcls], lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="J: intersect/union",range="lines")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
}
file.name <- file.path(out.path,testid,
paste(test.type,"_jaccard_rcdf",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="jcdf",
col=sgcols, lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="jaccard: intersect/union")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
## CDF of relative best hit CDF (rrcdf)
file.name <- file.path(out.path,testid,
paste(test.type,"_ratio",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rrcdf", col=sgcols, lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="relative ratio: query length/target length")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
## CDF of best hit target coverage
file.name <-file.path(out.path,testid,
paste(test.type,"_totalCoverage",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="tcdf", col=sgcols, lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="coverage of test set")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
## CDF of best hit target coverage - cluster colors
if ( !is.null(pm) ) {
file.name <-file.path(out.path,testid,
paste(test.type,"_totalCoverage_clustered",
fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="tcdf",
col=pmcol[pmcls], lty=sgltys,
h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
xlab="coverage of test set")
legend("topleft",paste(test.type,"-",tnum))
dev.off()
}
## SUMMARY OF CDF of absolute best hit ratio
## fraction of mutual coverage between 0.8 and 1.2
if ( !is.null(pm) ) {
file.name <- file.path(out.path,testid,
paste(test.type,"_ratioTotal_rng_clustered",
fname,sep=""))
plotdev(file.name,width=2+.2*length(CDF),height=6,type=fig.type)
par(mai=c(3.3,.75,.1,.1))
plot(0,col=NA,ylim=c(0,1.2),xlim=c(0,length(CDF)+1),
axes=FALSE,xlab=NA,ylab=NA)
axis(2)
abline(h=c(minf,.8),lty=2)
leg <- NULL
for ( i in 1:length(CDF) ) {
if ( "rcdf" %in% names(CDF[[i]]) ) {
nm <- CDF[[i]]$qid
if ( diff(height[i,]) > 0 )
arrows(x0=i,x1=i,y0=height[i,1],y1=height[i,2],
col=pmcol[pmcls[i]],lty=sgltys[nm],
length=0.1, angle=90, code=3)
text(i, 1.1, round(diff(height[i,]),3),srt=90)
}
}
axis(1,at=1:length(CDF),labels=nms,las=2)
dev.off()
}
## MAX vs. MIN: good hits per target vs. num hits per target
file.name <- file.path(out.path,testid,
paste(test.type,"_coverage_fragmentation",
fname,sep=""))
plotdev(file.name,width=10,height=4,type=fig.type)
par(mfcol=c(1,2),mai=c(1,1,.1,.1))
plot(hitnum/tnum, numhit,
col=sgcols[nms],pch=sgpchs[nms],
ylab="average hits per target sequence",
xlab="total number of 'good' hits")#,
par(mai=c(.1,.1,.1,.1))
plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
cex=.5,bty="n",ncol=2)
dev.off()
## MAXIMIZE COVERAGE: good hits per target
cvg <- hitnum/tnum
file.name <- file.path(out.path,testid,
paste(test.type,"_coverage",fname,sep=""))
plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
plot(1:length(cvg),cvg,type="b",col=sgcols[nms],pch=sgpchs[nms],axes=FALSE,
xlab=NA,ylab=paste("number of 'good' hits",ovlth))
axis(2)
axis(1,at=1:length(cvg),labels=nms,las=2)
dev.off()
## MAXIMIZE JACCARD: good hits per target
file.name <- file.path(out.path,testid,
paste(test.type,"_jaccard",fname,sep=""))
plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
plot(1:length(jaccard),jaccard,type="b",col=sgcols[nms],pch=sgpchs[nms],
axes=FALSE,xlab=NA,ylab="jaccard: intersect/union")
axis(2)
axis(1,at=1:length(jaccard),labels=nms,las=2)
dev.off()
## MINIMIZE FRAGMENTATION: num hits per target
nmh <- numhit
file.name <-file.path(out.path,testid,
paste(test.type,"_fragmentation",fname,sep=""))
plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
plot(1:length(nmh),nmh,type="b",col=sgcols[nms],pch=sgpchs[nms],axes=FALSE,
xlab=NA,ylab=paste("avg segments per target"))
axis(2)
axis(1,at=1:length(nmh),labels=nms,las=2)
dev.off()
}
dir.create(file.path(out.path,testid,"segtypes"))
### PLOT BY SEGMENT TYPES & SAVE RDATA!
for ( type in sgtypes ) {
## re-order result list
hitnum <- rep(NA,length(test.types))
CDF <- rep(list(NA),length(test.types))
height <- matrix(NA, ncol=2, nrow=length(test.types))
rownames(height) <- names(hitnum) <- names(CDF) <- test.types
numhit <- tnum <- hitnum
for ( test.type in test.types ) {
if ( is.null(stats[[test.type]][[type]]) ) next
CDF[[test.type]] <- stats[[test.type]][[type]]$CDF
CDF[[test.type]]$name <- test.type
numhit[test.type] <- stats[[test.type]][[type]]$numhit
hitnum[test.type] <- stats[[test.type]][[type]]$hitnum
tnum[test.type] <- stats[[test.type]][[type]]$tnum
height[test.type,] <- stats[[test.type]][[type]]$height
}
## save as RData file for each segmentation
if ( save.rdata ) {
file.name <- file.path(out.path,testid,"segtypes",
paste0(type,fname,".RData"))
overlaps <- list(CDF=CDF,
numhit=numhit,
hitnum=hitnum,
tnum=tnum,
height=height)
save(overlaps,file=file.name)
}
## plot
file.name <- file.path(out.path,testid,"segtypes",
paste(type,"_ratioTotal",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
leg <- NULL
xmax <- 3
x <- seq(0,xmax,.05)
plot(1,type="l",main=NA,col=NA,lty=1,
xlab="ratio: query length/target length",xlim=c(0,xmax),
ylab="cum.dist.func.",ylim=c(0,1.1))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
for ( i in 1:length(CDF) ) {
if ( length(CDF[[i]])<2 ) next
col <- tcols[CDF[[i]]$name] # todo: tid
if ( !is.null(CDF[[i]]$rcdf) ) {
lines(x,CDF[[i]]$rcdf(x),col=paste(col,"AA",sep=""),lty=1,lwd=2)
leg <- c(leg, CDF[[i]]$name) # todo: tid
}
}
mtext(paste(testid,"recovery by segments"),3,0,cex=1.5)
if ( !is.null(leg) )
legend("bottomright",leg,col=tcols[leg],lty=1,lwd=2,bty="n")
legend("topleft",paste(type))
dev.off()
## CONSSEG
if ( Sys.Date() < as.Date("2021-02-01") ) {
## plot
file.name <- file.path(out.path,testid,"segtypes",
paste(type,"_ratioTotal_consseg",fname,sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type,res=300)
par(mai=c(.5,.5,.125,.125),mgp=c(1.3,.4,0),xaxs="i")
leg <- NULL
xmax <- 3
x <- seq(0,xmax,.05)
plot(1,type="l",main=NA,col=NA,lty=1,
xlab="ratio: query length/target length",xlim=c(0,xmax),
ylab="cum.dist.func.",ylim=c(0,1.05))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
for ( i in 1:length(CDF) ) {
if ( length(CDF[[i]])<2 ) next
col <- tcols[CDF[[i]]$name] # todo: tid
if ( !is.null(CDF[[i]]$rcdf) ) {
lines(x,CDF[[i]]$rcdf(x),col=i,lty=1,lwd=2)
leg <- c(leg, CDF[[i]]$name) # todo: tid
}
}
#mtext(paste(testid,"recovery by segments"),3,0,cex=1.5)
if ( !is.null(leg) ) {
leg <- gsub("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_","",leg)
legend("bottomright",leg,col=1:length(CDF),cex=.7,lty=1,lwd=2,
bty="o",bg="#FFFFFF",box.col=NA)
}
legend("topleft",paste("query:",type),seg.len=0,
bg="#FFFFFFFF",y.intersp=0.1, x.intersp=0)
dev.off()
}
file.name <- file.path(out.path,testid,"segtypes",
paste(type,"_ratioTotal_rng",fname,sep=""))
plotdev(file.name,width=2+.2*length(CDF),height=6,type=fig.type)
par(mai=c(3.3,.75,.1,.1))
plot(0,col=NA,ylim=c(0,1.2),xlim=c(0,length(CDF)+1),
axes=FALSE,xlab=NA,ylab=NA)
axis(2)
abline(h=c(minf,.8),lty=2)
leg <- NULL
for ( i in 1:length(CDF) ) {
if ( "rcdf" %in% names(CDF[[i]]) ) {
if ( diff(height[i,]) > 0 )
arrows(x0=i,x1=i,y0=height[i,1],y1=height[i,2],
col=tcols[names(CDF)[i]],lty=1,length=0.1,
angle=90,code=3)
text(i, 1.1, round(diff(height[i,]),3),srt=90)
}
}
axis(1,at=1:length(CDF),labels=names(CDF),las=2)
dev.off()
}
} # end of loop over real vs. random!
if ( !interactive() ) quit(save="no")
### TEST-SET ANALYSES
#### TODO; allow segmentRevery to work with transcripts vs features
## ORF recovery by transcripts!
trcl <- segmentAnnotate(query=clorf,target=genes,
add.na=TRUE,details=TRUE,sort=TRUE)
## plot cluster gene recovery by published transcripts
trst <- getOverlapStats(trcl,tnum=nrow(genes),
qnum=nrow(clorf),qid="clorf", ovlth=ovlth)
file.name <- file.path(paste("orfXtranscripts_ratioTotal",sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
xmax <- 2
x <- seq(0,xmax,.05)
plot(x, trst$CDF$rcdf(x),type="l",col=1,lty=1,xlim=c(0,xmax),xlab="transcript length/ORF length",ylab="cum.dist.func.",main=NA,ylim=c(0,1))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
mtext(paste("ORF recovery by ORF transcripts"),3,0,cex=1.5)
legend("bottomright","ORF recovery by ORF transcripts",bty="n")
legend("topleft","")
dev.off()
## cluster ORF recovery by transcripts
## CLUSTER ORFS
test <- "clgenes"
trgs <- genes
#trgs[trgs[,"CL_rdx"]=="","CL_rdx"] <- "na"
tpcol <- "CL_rdx"
cpcol <- "CL_rdx_col"
## output dir
dir.create(file.path(out.path,test),recursive=TRUE)
## test types
ttypes <- as.character(trgs[,tpcol])
ttypes[ttypes==""] <- "na"
ttypes[is.na(ttypes)] <- "na"
test.types <- sort(names(table(ttypes)))
tcols <- sapply(test.types,
function(x)
as.character(trgs[which(trgs[,tpcol]==x)[1],cpcol]))
tcols["na"] <- "#939393"
## results
ovlstats <- rep(list(NA),length(test.types))
names(ovlstats) <- test.types
query <- transcripts[transcripts[,"type"]=="ORF",]
for ( test.type in test.types ) {
target <- trgs[ttypes==test.type,,drop=FALSE]
cat(paste("transcript overlap with",test.type,"\t",time(),"\n"))
ovl <- segmentAnnotate(query=query,target=target, add.na=TRUE,details=TRUE)
sts <- list(CDF=NA,hitnum=0,qid=test.type,tid="ORF")
if ( any(!is.na(ovl[,"query"])))
sts <- getOverlapStats(ovl,ovlth=.8,qid=test.type,tid="ORF",
tnum=nrow(target),qnum=nrow(query))
ovlstats[[test.type]] <- sts
}
rm <- unlist(lapply(ovlstats, function (x)
!any(x$hitnum>0)))
ovlstats <- ovlstats[!rm]
test.types <- test.types[!rm]
## fuse results to primary result list - TODO: not fused?
CDF <- rep(list(NA),length(test.types))
names(CDF) <- test.types
for ( test.type in test.types ) {
CDF[[test.type]] <- ovlstats[[test.type]]$CDF
CDF[[test.type]]$name <- test.type
}
## plot
file.name <- file.path(paste("clusterXtranscripts_ratioTotal",sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
leg <- NULL
xmax <- 3
x <- seq(0,xmax,.05)
plot(x, CDF[[1]]$rcdf(x),type="l",col=NA,lty=1,xlim=c(0,xmax),xlab="ratio: query length/target length",ylab="cum.dist.func.",main=NA,ylim=c(0,1))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
for ( i in 1:length(CDF) ) {
col <- tcols[CDF[[i]]$name]
if ( "rcdf" %in% names(CDF[[i]]) )
lines(x,CDF[[i]]$rcdf(x),col=paste(col,"AA",sep=""),lty=1,lwd=2)
leg <- c(leg, CDF[[i]]$name)
}
mtext(paste(test,"recovery by ORF transcripts"),3,0,cex=1.5)
legend("topleft","",bty="n")
legend("bottomright",leg,col=tcols[leg],lty=1,lwd=2,bty="n")
dev.off()
if ( !interactive() ) quit(save="no")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.