scripts/segmentRecovery.R

#!/usr/bin/env Rscript

## TESTING SEGMENT LENGTH DIST AND AGAINST ANNOTATED GENES AND TRANSCRIPTS

library(segmenTools)
##segtools <- "~/programs/segmenTools/"
##source(file.path(segtools,"R/segmenTools.R")) # for segment analysis
##source(file.path(segtools,"R/coor2index.R")) # coor2index

library(cluster) # for pam clustering
library(optparse) # command-line options

## nicer timestamp
time <- function() format(Sys.time(), "%Y%m%d %H:%M:%S")
## cluster/segment colors; function derived from scale_colour_hue in ggplot2
color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  grDevices::hcl(h = hues, l = 65, c = 100)[1:n]
}

### OPTIONS
option_list <- list(
    make_option(c("-i", "--infile"), type="character", default="", 
                help="chromosome coordinates of primary segments as produced by clusterSegments.R but without header ('allsegs.csv')"),    
    make_option(c("--chrfile"), type="character", default="",
                help="chromosome index file, providing a sorted list of chromosomes and their lengths in column 3 [default %default]"),
    make_option(c("--qtypes"), type="character", default="", 
                help="sub-set testset in column 'type'"),
    make_option(c("--qtypcol"), type="character", default="type", 
              help="name of column with sub-set annotation"),
    ##chrfile = $YEASTDAT/chromosomes/sequenceIndex_R64-1-1_20110208.csv
    ## SEGMENT TEST SETTINGS
    make_option(c("--fuse.segs"), action="store_true", default=FALSE,
                help="use FUSE tag from clusterSegments to fuse adjacent segments"),
    make_option(c("--target"), type="character", default="", 
              help="target set of chromosomal segments"),    
    make_option(c("--ttypes"), type="character", default="", 
                help="sub-set testset in column 'type'"),
    make_option(c("--ttypcol"), type="character", default="type", 
              help="name of column with sub-set annotation"),
    make_option(c("--tcolcol"), type="character", default="", 
              help="name of column with sub-set colors for plots"),
    make_option("--ovlth", default=0.8,
                help="overlap threshold (mutual coverage) to be counted as a direct hit; at least ovlth*length must be reached for both query (segments) and targets (test set) [default %default]"),
    make_option("--minj", default=0.8,
                help="minimal jaccard index, the fraction of tests above this threshold are reported and the cutoff line plotted [default %default]"),
    make_option("--minf", default=0.2,
                help="minimal fraction of testsets; the jaccard index at this threshold is reported and the cutoff line plotted [default %default]"),
    make_option(c("--randomize"), action="store_true", default=FALSE,
                help="additionally calculate overlaps for randomized target segments"),
    ## OUTPUT OPTIONS
    make_option(c("--out.path"), type="character", default=".", 
                help="directory path for output data (figures, csv files)"),
    make_option(c("--testid"), type="character", default="testset", 
                help="ID of the testset, used for folder and file names"),
    make_option(c("-v", "--verb"), type="integer", default=1, 
                help="0: silent, 1: main messages, 2: warning messages"),
    make_option(c("--fig.type"), type="character", default="png",
                help="figure type, png or pdf [default %default]"),
    make_option(c("--save"), action="store_true", default=FALSE,
                help="save complete overlap data as one RData file (big!)"),
    make_option(c("--save.rdata"), action="store_true", default=FALSE,
                help="save overlap data as individual RData files for segment types"))

## get command line options
opt <- parse_args(OptionParser(option_list=option_list))

## process comma-separated list arguments
lst.args <- c(ttypes="character",qtypes="character")
for ( i in 1:length(lst.args) ) {
    idx <- which(names(opt)==names(lst.args)[i])
    opt[[idx]] <- unlist(strsplit(opt[[idx]], ","))
    ##for ( j in 1:length(opt[[idx]]) ) {
    ##    tmp <- strsplit(opt[[idx]][j], ":")
    ##}
    mode(opt[[idx]]) <- lst.args[i]
}

## promote options to main environment and print all arguments
if ( opt$verb>0 )
    cat(paste("SETTINGS:\n"))
for ( i in 1:length(opt) ) {
    if ( opt$verb>0 )
        cat(paste("\t",names(opt)[i], ":", #typeof(opt[[i]]),
                  paste(opt[[i]],collapse=", "), "\n"))
    arg <- names(opt)[i]
    assign(arg, opt[[arg]])
}
if ( verb>0 )
    cat(paste("\n"))


### START

## load chromosome index
if ( verb>0 )
    cat(paste("Loading chromosome index file:", chrfile, "\n"))
cf <- read.table(chrfile,sep="\t",header=FALSE)
chrS <- c(0,cumsum(cf[,ncol(cf)])) ## index of chr/pos = chrS[chr] + pos

## READ SEGMENTS TO BE TESTED 
if ( verb>0 )
    cat(paste("LOADING SEGMENTS:", infile, "\n"))
segs <- read.table(infile,sep="\t",header=TRUE, comment.char="")



#' uses a logical column to fuse adjacent segments, i.e. the
#' lower segment i with segment i-1, where segments
#' is an ordered list of segments
fuseSegments <- function(segments, col="fuse", val=1) {
    fuse <- segments[2:nrow(segments),col] == val
    cat(paste("NOTE: FUSING", sum(fuse), "SEGMENTS, from segment types:\n",
              paste(unique(segments[fuse,"type"]),collapse="; "),"\n"))
    fsegs <- segments[c(TRUE,!fuse),]
    
    fsegs[,"end"] <- segments[c(!fuse,TRUE),"end"]
    fsegs
}
## use "fuse" column from clusterSegments.R fuse filter!
if ( fuse.segs ) {
    if ( verb>0 )
        cat(paste("fusing segments by fuse tag\n"))
    segs <- fuseSegments(segs,col="fuse",val=1)
}

## replace genome coordinates by continuous index
segs <- coor2index(segs,chrS)

## filter types
qtypes <- qtypes[qtypes!=""]
if ( length(qtypes)==0 ) {
    qtypes <- as.character(segs[,qtypcol])
    qtypes[qtypes==""] <- "na"
    qtypes[is.na(qtypes)] <- "na"
} else {
    ## filter to types given by cmdline arg qtypes
    segs <- segs[as.character(segs[,qtypcol])%in%qtypes,]
    qtypes <- as.character(segs[,qtypcol]) # get remaining
}

## split by type
lst <- split(segs,as.character(segs[,qtypcol]))
sgtypes <- names(lst)



debug <- FALSE
if ( debug ) { ## test only large number
    sgnum <- unlist(lapply(lst,nrow))
    sgtypes <- sgtypes[sgnum>1000] ### TODO: remove this and fuse types

    ## only selected!
    best.selected <- c("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:1_M:75_nui:1",
                       ##"D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:2_M:150_nui:2",
                       "D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:2_M:150_nui:3",
                       "D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:3_M:200_nui:3")
    worst.selected <- c("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:1_M:200_nui:1",
                        "D:dft1-7.dcash.snr_T:raw_K:12_S:icor_E:3_M:75_nui:3")
    ## concatenate: numbered 1--5:
    selected <- c(worst.selected[1],best.selected,worst.selected[2])
    sgtypes <- selected

}

## define colors and pch for segment types
sgcols <- rainbow(length(sgtypes))
sgpchs <- rep(1:17, len=length(sgtypes)) #only 17 of 18 to avoid multiple of lty
sgltys <- rep(1:6, len=length(sgtypes))
names(sgcols) <- names(sgpchs) <- names(sgltys) <- sgtypes

## get segment classes
## TODO: rm sgclasses?
## this is only used for PAM clustering below and
## was specific for analysis of segmenTier results 2017
sgclasses <- getSegmentClasses(sgtypes, sep="_")

## col, pch and lty for classes
sgclcols <- rainbow(length(sgclasses))
sgclpchs <- rep(1:17, len=length(sgclasses)) #only 17 to avoid lty multiple
sgclltys <- rep(1:6, len=length(sgclasses))
names(sgclcols) <- names(sgclpchs) <-names(sgclltys) <- sgclasses

## new: segment class table, only used for PAM clustering 
sgcltab <- getSegmentClassTable(sgtypes, sep="_")

### START ANALYSIS

## output dirs - generate locally in cwd
dir.create(out.path) # length distributions
## output dir
dir.create(file.path(out.path,testid),recursive=TRUE) # test set


### TEST AGAINST OTHER DATA SETS

if ( verb>0 )
    cat(paste("LOADING TEST SETS:", testid, "-", target, "\n"))

trgs <- read.table(target,sep="\t",header=TRUE,
                   stringsAsFactors=FALSE, comment.char = "")
trgs <- coor2index(trgs,chrS) # map to continuous index!

tpcol <- ttypcol
cpcol <- tcolcol

## test types
ttypes <- ttypes[ttypes!=""]
if ( length(ttypes)==0 ) {
    ttypes <- as.character(trgs[,tpcol])
    ttypes[ttypes==""] <- "na"
    ttypes[is.na(ttypes)] <- "na"
} else {
    ## filter to types given by cmdline arg ttypes
    trgs <- trgs[as.character(trgs[,tpcol])%in%ttypes,]
    ttypes <- as.character(trgs[,tpcol]) # get remaining
}
test.types <- sort(names(table(ttypes))) # sort by number

if ( cpcol=="" ) {
    # TODO: rgb range1:length(test.types)
    tcols <- sub("00$","",rainbow(length(test.types),alpha=0))
    names(tcols) <- test.types
} else {
    ## just take first color for each type,
    ## user must ensure that these are unique
    tcols <- sapply(test.types,
                    function(x)
                        as.character(trgs[which(trgs[,tpcol]==x)[1],cpcol]))
}
tcols["na"] <- "#939393"


## results
ovlstats <- rep(list(NA),length(test.types))
names(ovlstats) <- test.types

#randomize <- TRUE # FALSE # 
if ( randomize )
    rndstats <- ovlstats

## loop over targets (transcript and ORF data sets from SGD)
for ( test.type in test.types ) {

    if ( verb> 0 )
        cat(paste(test.type, "\t",time(),"\n"))
    
    target <- trgs[ttypes==test.type,]

    ## TODO: integrate randomization in analysis?
    if ( randomize ) 
        target.rnd <- randomSegments(target) ## TODO: with total or not?

    ## result list: overlap statistics
    ostat <- rep(list(NA),length(sgtypes))
    names(ostat) <- sgtypes
    ovlstats[[test.type]] <- ostat

    if ( randomize )
        rndstats[[test.type]] <- ostat

    ## loop over queries (segment types)
    for ( type in sgtypes ) {
        if ( verb>0 )
            cat(paste("#",which(sgtypes==type), type,
                      "overlap with",test.type,"\t",time(),"\n"))
        sgs <- lst[[type]]
        
        ## for testing
        devel <- FALSE
        if ( devel )
            sgs <- sgs[1:1000,]

        
        ovl <- segmentAnnotate(query=sgs, target=target,
                               add.na=TRUE, details=TRUE,
                               untie=FALSE, collapse=FALSE, sort=FALSE,
                               msgfile=stdout())
        if ( !any(!is.na(ovl[,"query"])) ) next
        sts <- getOverlapStats(ovl, ovlth=ovlth, minj=minj, minf=minf,
                               hrng=c(.8,1.2),
                               tnum=nrow(target),qnum=nrow(sgs),
                               qid=type, tid=test.type)
        
        ovlstats[[test.type]][[type]] <- sts

        if ( randomize ) {
            
            rnd <- segmentAnnotate(query=sgs, target=target.rnd,
                                   add.na=TRUE, details=TRUE,
                                   untie=FALSE, collapse=FALSE, sort=FALSE,
                                   msgfile=stdout())
            #if ( !any(!is.na(ovl[,"query"])) ) next
            sts.rnd <- getOverlapStats(rnd, ovlth=ovlth, minj=minj, minf=minf,
                                       hrng=c(.8,1.2),
                                       tnum=nrow(target.rnd),qnum=nrow(sgs),
                                       qid=type, tid=test.type)
            
            rndstats[[test.type]][[type]] <- sts.rnd
        }

        ## TODO: save results as files 
    }
}

if ( save ) 
  save.image(file.path(out.path,testid,"overlaps.RData"))

## REMOVE EMPTY TESTS

for ( test.type in test.types ) {

    cat(paste("checking", test.type, "\n"))
    lst <- ovlstats[[test.type]]

    ## rm empty
    len <- unlist(lapply(lst, function(x) length(x) ))
    if ( any(len==1) )
      cat(paste("\tremoving",sum(len==1),"empty tests:",
                paste(names(lst)[len==1],collapse="; "),"\n"))
    lst <- lst[len>1]

    ## re-attach
    ovlstats[[test.type]] <- lst

    if ( randomize ) {
        
        cat(paste("\trandomized\n"))
        lst <- rndstats[[test.type]]
        
        ## rm empty
        len <- unlist(lapply(lst, function(x) length(x) ))
        if ( any(len==1) )
          cat(paste("\tremoving",sum(len==1),"empty tests:",
                    paste(names(lst)[len==1],collapse="; "),"\n"))
        lst <- lst[len>1]
        
        ## re-attach
        rndstats[[test.type]] <- lst
    }
}


## COLLATE DATA AND WRITE OUT RESULTS
## export hitnum, jaccard, numhit and ratio thresholds 
for ( test.type in test.types ) {

    #if ( is.null(ovlstats[[test.type]]) ) next
    
    covlStats <- collectOvlStats(ovlstats, type=test.type)

    ids <- covlStats$nms
    jaccard <- covlStats$jaccard # jaccard of best hits
    j.prcnt<- covlStats$j.prcnt# percent of targets covered with J>threshold
    j.cutoff<- covlStats$j.cutoff# jaccard index at CDF(threshold)
    height <- covlStats$height # target recovery fraction within threshold
    hitnum <- covlStats$hitnum # total number of 'good' hits 
    numhit <- covlStats$numhit # average number of hits per target
    tnum <- covlStats$tnum # number of tested targets
    tnum <- covlStats$tnum # number of tested targets

    result <- data.frame(ID=ids, tnum=tnum, hits=hitnum, Jaccard=jaccard,
                         JaccPrcnt=j.prcnt, JaccCutoff=j.cutoff,
                         hits.per.target=numhit,
                         ratio.low=height[,1], ratio.high=height[,2])
    
    ## write out table of segmentation characteristics
    file.name <- file.path(out.path,testid,paste("segmentRecovery_",test.type,
                                          ".csv",sep=""))
    write.table(result,file=file.name, sep="\t",
                col.names=TRUE,row.names=FALSE,quote=FALSE)
}

## directly compare real vs. random targets
if ( randomize ) {

    rnd.path <- file.path(out.path,testid,"randomize")
    dir.create(rnd.path)

    x <- seq(0,4,.01)

    totN <- length(test.types)*length(sgtypes)
    relative <- rep(list(NA), totN)
    for ( test.type in test.types ) {
        for ( type in sgtypes ) {
            rcdf.real <- rcdf.rand <- NULL
            if ( length(ovlstats[[test.type]][[type]])>1 ) 
                rcdf.real <- ovlstats[[test.type]][[type]]$CDF$rcdf
            if ( length(rndstats[[test.type]][[type]])>1 ) 
                rcdf.rand <- rndstats[[test.type]][[type]]$CDF$rcdf
            
            file.name <- file.path(rnd.path,paste0(test.type,"_",type))
            plotdev(file.name,width=4,height=3.5,type=fig.type)
            par(mai=c(.5,.5,.5,.1),mgp=c(1.3,.4,0),tcl=-.2)
            plot(1,col=NA,xlim=range(x),ylim=c(0,1),
                 main=paste(test.type, "recovery"),
                 xlab="segment length/target length",ylab="ECDF")
            abline(h=0:1, lty=2, lwd=.5, col="gray70")
            abline(h=c(minf,.8), lty=2, lwd=.5, col="gray70")
            abline(v=c(ovlth,2-ovlth), lty=2, lwd=.5, col="gray70")
            if ( !is.null(rcdf.real) )
              lines(x, rcdf.real(x), col=1,lty=1,lwd=2)
            if ( !is.null(rcdf.rand) )
              lines(x, rcdf.rand(x), col=2,lty=2,lwd=2)
            legend("topleft",legend=c("real","random"),lty=1:2,col=1:2,lwd=2,
                   bty="n")
            dev.off()

        }
    }
}


#### PLOT OF OVERLAP STATISTICS

sets <- c("ovl")
if ( randomize )
    sets <- c("ovl","rnd")

for ( set in sets ) {

    stats <- get(paste0(set,"stats"))
    fname <- ifelse(set=="rnd","_random","")
    
    ## remove empty results!
    rm <- unlist(lapply(stats, function (x)
        !any(unlist(lapply(x, function(y) y$hitnum))>0)))

    
    
    stats <- stats[!rm]
    test.types <- test.types[!rm]

### PLOT BY TEST TYPES
for ( test.type in test.types ) {


    if ( !test.type%in%names(stats) ) {
        warning(test.type, " not found in stats object for set: ", set)
        next
    }
    
    cstats <- collectOvlStats(stats, type=test.type)

    CDF <- cstats$CDF
    jaccard <- cstats$jaccard
    height <- cstats$height
    hitnum <- cstats$hitnum
    numhit <- cstats$numhit
    qnum <- cstats$qnum
    tnum <- cstats$tnum
    nms <- cstats$nms

    sgnum <- length(sgtypes) ## skip clustering if only one!
    
    ## TODO: mv this to getOverlapStats
    
    ## MAX vs. MIN CLUSTERING  cluster jaccard vs. numhits vs segment classes
    ## TODO: also include height in clustering?
    pm <- NULL
    if ( sgnum>7 & ncol(sgcltab)>1) { # TODO: pmcol used below skip if 
        K <- 7
        dat <- cbind((jaccard-min(jaccard))/(max(jaccard)-min(jaccard)),
                     (numhit-min(numhit))/(max(numhit)-min(numhit)),
                     height)
        pm <- pam(dat, K)
        cllst <- apply(sgcltab,2, unique) 

        ## TODO: sort clustering by increasing height!
        pmsrt <- split(dat[,3],pm$clustering)
        pmmn <- unlist(lapply(pmsrt, mean))
        
        pmsrt <- order(as.numeric(names(pmmn)[order(pmmn)]))
        ## sorted clustering
        pmcls <- pmsrt[pm$clustering]

        pmcol <- rainbow(length(pmsrt))
        names(pmcol) <- 1:length(pmsrt)

        ## write out table of segmentation characteristics
        result <- data.frame(ID=nms, CL=pmcls)
        file.name <- file.path(out.path,testid,
                               paste("segmentRecovery_",test.type,
                                     "_clusters",fname,".csv",sep=""))
        write.table(result,file=file.name, sep="\t",
                    col.names=TRUE,row.names=FALSE,quote=FALSE)
        
        
        ## strange bug: 75 in list of 75,100,150 gets a leading space
        ## TODO: names lost for $rundir/test.csv !? 
        ## trim all:
        cllst <- lapply(cllst,trimws)
        
        ## sor numeric! K, S, E, M, nui
        cllst <- lapply(cllst, function(x) {
            if ( suppressWarnings(all(!is.na(as.numeric(x)))) )
                x <- as.character(sort(as.numeric(x)))
            x})
        allcl <- unlist(sapply(1:length(cllst),
                               function(x) paste(names(cllst)[x],
                                                 cllst[[x]],sep=".")))
        enum <- matrix(NA, nrow=K, ncol=length(allcl))
        colnames(enum) <- allcl
        rownames(enum) <- 1:K
        pval <- enum
        ## get cumulative hypergeometric distribution of clustering vs. segments
        for ( i in 1:K ) 
            for ( j in 1:ncol(sgcltab) ) {
                clcl <-  clusterCluster(pmcls==i,sgcltab[,j])
                cln <- colnames(sgcltab)[j]
                cln <- paste(cln,colnames(clcl$overlap),sep=".")
                if ( "TRUE" %in% rownames(clcl$overlap) ) {
                    enum[i,cln] <- clcl$overlap["TRUE",]
                    pval[i,cln] <- clcl$p.value["TRUE",]
                }
            }
        
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_segmentationClusters",
                                     fname,sep=""))
        plotdev(file.name,width=4.5,height=4.5,type=fig.type)
        par(mai=c(.7,.5,.1,.1),mgp=c(1.7,.5,0))
        image_matrix(-log2(pval) ,text=enum, axis=1:2,
                     col=c("#FFFFFF",rev(grey.colors(20))),
                     axis2.col=pmcol[1:nrow(pval)],
                     xlab=NA,ylab=NA)
        abline(v=cumsum(unlist(lapply(cllst, length)))+.5)
        axis(1, at=cumsum(unlist(lapply(cllst, length)))+.5, tck=-.1,labels=NA) 
        dev.off()
        ## TODO: plot by segment classes as length dist
        
        ## new lower and upper threshold of ratio
        ## OPT: uppler left, 
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_ratioTotal_lh_clustered",
                                     fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
        plot(height,xlab=expression(R["short"]),ylab=expression(R["long"]),
         col=NA)
        legend("topleft","good",bty="n",text.font=2)
        points(height,col=pmcol[pmcls],pch=20) #sgpchs[nms])
        abline(v=.2,lty=2)
        abline(h=.8,lty=2)
        dev.off()
    }
    
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_ratioTotal_lh",fname,sep=""))
    plotdev(file.name,width=10,height=5,type=fig.type)
    par(mfcol=c(1,2),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
    plot(height,xlab="fraction: ratio < 0.8",ylab="fraction: ratio < 1.2",
         col=sgcols[nms],pch=sgpchs[nms])
    abline(v=.2,lty=2)
    abline(h=.8,lty=2)
    par(mai=c(.1,.1,.1,.1))
    plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
    legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
           cex=.5,bty="n",ncol=2)
    dev.off()
    
    ## CDF of absolute best hit CDF (rcdf)
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_ratioTotal",fname,sep=""))
    plotdev(file.name,width=3.5,height=3.5,type=fig.type)
    par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
    plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf", col=sgcols, lty=sgltys,
                h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                xlab="ratio: query length/target length")
    legend("topleft",paste(test.type,"-",tnum))
    dev.off()

    ## CONSSEG FIGURE: consensus segmentation vs. segmenTier input
    if ( Sys.Date() < as.Date("2021-02-01") ) {

        ccols <- sgcols
        ccols["consensus"] <- "#000000"
        ccols[names(ccols)!="consensus"] <- c(2:6)
        clwd <- rep(1, length(sgcols))
        names(clwd) <- names(sgcols)
        clwd["consensus"] <- 2
        clty <- rep(2, length(sgcols))
        names(clty) <- names(sgcols)
        clty["consensus"] <- 1
                    
        ## CDF of absolute best hit CDF (rcdf)
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_ratioTotal_consseg",
                                     fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type,res=300)
        par(mai=c(.5,.5,.125,.125),mgp=c(1.3,.4,0),xaxs="i")
        plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf",
                    lwd=clwd,
                    col=ccols, lty=clty,
                    h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                    xlab="ratio: query length/target length")
        legend("topleft",paste("target:", test.type))#,"-",tnum))
        nms <- gsub("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_","",names(clwd))
        legend("right", nms, lwd=clwd, col=ccols, lty=clty,
               bg="white",box.col=NA,cex=.7)
        dev.off()
    }

    ## CDF of absolute best hit CDF (rcdf)  - cluster colors
    if ( !is.null(pm) ) {
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_ratioTotal_clustered",
                                     fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
        plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rcdf",
                    col=pmcol[pmcls], lty=sgltys,
                    h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                    xlab="R: query length/target length",range="lines")
        legend("topleft",paste(test.type,"-",tnum))
        rect(1.4,.26,1.9,.34,col="#FFFFFFBB",border=NA)
        text(1.65,.3,"too long",font=2, col=pmcol[1])
        rect(.1,.56,0.6,.64,col="#FFFFFFBB",border=NA)
        text(.35,.6,"too short",font=2, col=pmcol[length(pmcol)])
        points(1.2,.1,pch=4)
        text(1.2,.1,expression(R["long"]),pos=4,font=2)
        points(.8,.7,pch=4)
        text(.8,.7,expression(R["short"]),pos=2,font=2)
        dev.off()
    }
      
    ## MAX vs. MIN: jaccard of best hits vs. num hits per target 
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_jaccard_fragmentation",
                                 fname,sep=""))
    plotdev(file.name,width=10,height=4,type=fig.type)
    par(mfcol=c(1,2),mai=c(1,1,.1,.1))
    plot(jaccard,numhit,
         col=sgcols[nms],pch=sgpchs[nms],
         ylab="average hits per target sequence",
         xlab="jaccard: intersect/union")#,
    par(mai=c(.1,.1,.1,.1))
    plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
    legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
           cex=.5,bty="n",ncol=2)
    dev.off()

    ## MAX vs. MIN CLUSTERS - JACCARD
    if ( !is.null(pm) ) {
        file.name <- file.path(out.path,testid,
                               paste(test.type,
                                     "_jaccard_fragmentation_clustered",
                                     fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
        #par(mfcol=c(1,1),mai=c(1,1,.1,.1))
        plot(jaccard,numhit,col=NA,         
             ylab="average hits per target sequence",
             xlab="jaccard: intersect/union")#,
        mxx <- max(jaccard)
        mny <- min(numhit)
        points(jaccard,numhit,col=pmcol[pmcls],pch=20) #sgpchs[nms])
        #rect(.1,.01,0.6,.09,col="#FFFFFFBB",border=NA)
        #text(.35,.05,"good",font=2)
        legend("bottomright","good",bty="n", text.font=2) #bg="#FFFFFFBB",box.col=NA, text.font=2) 
        ##par(mai=c(1,.7,.1,.1))
        ##image_matrix(-log2(pval) ,text=enum, axis=1:2,
        ##             col=c("#FFFFFF",rev(grey.colors(20))),
        ##             axis2.col=1:nrow(pval),
        ##             xlab=NA,ylab=NA)
        ##axis(1, at=cumsum(unlist(lapply(cllst,length)))+.5, tck=-1,labels=NA) 
        dev.off()
    }
    
    ## MAX vs. MIN: ratio heights vs. num hits per target
    ## TODO: plot name legend separately and adaptive!
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_ratio_fragmentation",fname,sep=""))
    plotdev(file.name,width=10,height=4,type=fig.type)
    par(mfcol=c(1,2),mai=c(1,1,.1,.1))
    plot(apply(height,1,diff), numhit,
         col=sgcols[nms],pch=sgpchs[nms],
         ylab="average hits per target sequence",
         xlab="fraction: 0.8 < ratio < 1.2")#,
    par(mai=c(.1,.1,.1,.1))
    plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
    legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
           cex=.5,bty="n",ncol=2)
    dev.off()
    
    ## MAX vs. MIN CLUSTERS - good hits per target vs. num hits per target 
    if ( !is.null(pm) ) {
        file.name <-file.path(out.path,testid,
                              paste(test.type,"_ratio_fragmentation_clustered",
                                    fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mfcol=c(1,1),mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0))
        plot(apply(height,1,diff), numhit,
             col=pmcol[pmcls],pch=sgpchs[nms],
             ylab="average hits per target sequence",
             xlab="fraction: 0.8 < ratio < 1.2")#,
        imgdat <- t(apply(-log2(pval), 2, rev))
        dev.off()
    }
    
### BELOW PERHAPS NOT REQUIRED

    ## CDF PLOTS

    ## CDF of jaccard (jcdf)
    if ( !is.null(pm) ) {
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_jaccard_cdf_clustered",
                                     fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
        plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="jcdf",
                    col=pmcol[pmcls], lty=sgltys,
                    h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                    xlab="J: intersect/union",range="lines")
        legend("topleft",paste(test.type,"-",tnum))
        dev.off()
    }
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_jaccard_rcdf",fname,sep=""))
    plotdev(file.name,width=3.5,height=3.5,type=fig.type)
    par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
    plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="jcdf",
                col=sgcols, lty=sgltys,
                h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                xlab="jaccard: intersect/union")
    legend("topleft",paste(test.type,"-",tnum))
    dev.off()
    
    ## CDF of relative best hit CDF (rrcdf)
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_ratio",fname,sep=""))
    plotdev(file.name,width=3.5,height=3.5,type=fig.type)
    par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
    plot_cdfLst(x=seq(0,2,.05), CDF=CDF, type="rrcdf", col=sgcols, lty=sgltys,
                h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                xlab="relative ratio: query length/target length")
    legend("topleft",paste(test.type,"-",tnum))
    dev.off()


    ## CDF of best hit target coverage
    file.name <-file.path(out.path,testid,
                          paste(test.type,"_totalCoverage",fname,sep=""))
    plotdev(file.name,width=3.5,height=3.5,type=fig.type)
    par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
    plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="tcdf", col=sgcols, lty=sgltys,
                h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                xlab="coverage of test set")
    legend("topleft",paste(test.type,"-",tnum))
    dev.off()
   
     ## CDF of best hit target coverage - cluster colors
    if ( !is.null(pm) ) {
        file.name <-file.path(out.path,testid,
                              paste(test.type,"_totalCoverage_clustered",
                                    fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type)
        par(mai=c(.75,.75,.1,.1),mgp=c(1.75,.5,0),xaxs="i")
        plot_cdfLst(x=seq(0,1.1,.05), CDF=CDF, type="tcdf",
                    col=pmcol[pmcls], lty=sgltys,
                    h=c(minf,.8), v=c(ovlth,2-ovlth), #c(0.8,1.2),
                    xlab="coverage of test set")
        legend("topleft",paste(test.type,"-",tnum))
        dev.off()
    }
    
    ## SUMMARY OF CDF of absolute best hit ratio
    ## fraction of mutual coverage between 0.8 and 1.2
    if ( !is.null(pm) ) {
        file.name <- file.path(out.path,testid,
                               paste(test.type,"_ratioTotal_rng_clustered",
                                     fname,sep=""))
        plotdev(file.name,width=2+.2*length(CDF),height=6,type=fig.type)
        par(mai=c(3.3,.75,.1,.1))
        plot(0,col=NA,ylim=c(0,1.2),xlim=c(0,length(CDF)+1),
             axes=FALSE,xlab=NA,ylab=NA)
        axis(2)
        abline(h=c(minf,.8),lty=2)
        leg <- NULL
        for ( i in 1:length(CDF) ) {
            if ( "rcdf" %in% names(CDF[[i]]) ) {
                nm <- CDF[[i]]$qid
                if ( diff(height[i,]) > 0 )
                    arrows(x0=i,x1=i,y0=height[i,1],y1=height[i,2],
                           col=pmcol[pmcls[i]],lty=sgltys[nm],
                           length=0.1, angle=90, code=3)
                text(i, 1.1, round(diff(height[i,]),3),srt=90)
            }
        }
        axis(1,at=1:length(CDF),labels=nms,las=2)
        dev.off()
    }
   
    ## MAX vs. MIN: good hits per target vs. num hits per target
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_coverage_fragmentation",
                                 fname,sep=""))
    plotdev(file.name,width=10,height=4,type=fig.type)
    par(mfcol=c(1,2),mai=c(1,1,.1,.1))
    plot(hitnum/tnum, numhit,
         col=sgcols[nms],pch=sgpchs[nms],
         ylab="average hits per target sequence",
         xlab="total number of 'good' hits")#,
    par(mai=c(.1,.1,.1,.1))
    plot(0, col=NA,axes=FALSE,ylab=NA,xlab=NA)
    legend("topleft", legend=nms, col=sgcols[nms],pch=sgpchs[nms],
           cex=.5,bty="n",ncol=2)
    dev.off()

    ## MAXIMIZE COVERAGE: good hits per target
    cvg <- hitnum/tnum
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_coverage",fname,sep=""))
    plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
    par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
    plot(1:length(cvg),cvg,type="b",col=sgcols[nms],pch=sgpchs[nms],axes=FALSE,
         xlab=NA,ylab=paste("number of 'good' hits",ovlth))
    axis(2)
    axis(1,at=1:length(cvg),labels=nms,las=2)
    dev.off()
    
    ## MAXIMIZE JACCARD: good hits per target
    file.name <- file.path(out.path,testid,
                           paste(test.type,"_jaccard",fname,sep=""))
    plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
    par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
    plot(1:length(jaccard),jaccard,type="b",col=sgcols[nms],pch=sgpchs[nms],
         axes=FALSE,xlab=NA,ylab="jaccard: intersect/union")
    axis(2)
    axis(1,at=1:length(jaccard),labels=nms,las=2)
    dev.off()

    ## MINIMIZE FRAGMENTATION: num hits per target 
    nmh <- numhit
    file.name <-file.path(out.path,testid,
                          paste(test.type,"_fragmentation",fname,sep=""))
    plotdev(file.name,,width=2+.2*length(CDF),height=6,type=fig.type)
    par(mai=c(3.3,.75,.1,.1),mgp=c(1.75,.5,0))
    plot(1:length(nmh),nmh,type="b",col=sgcols[nms],pch=sgpchs[nms],axes=FALSE,
         xlab=NA,ylab=paste("avg segments per target"))
    axis(2)
    axis(1,at=1:length(nmh),labels=nms,las=2)
    dev.off()
}


dir.create(file.path(out.path,testid,"segtypes"))
### PLOT BY SEGMENT TYPES & SAVE RDATA!

for ( type in sgtypes ) {

    ## re-order result list
    hitnum <- rep(NA,length(test.types))
    CDF <- rep(list(NA),length(test.types))
    height <- matrix(NA, ncol=2, nrow=length(test.types))
    rownames(height) <- names(hitnum) <- names(CDF) <- test.types
    numhit <- tnum <- hitnum
    for ( test.type in test.types ) {
        if ( is.null(stats[[test.type]][[type]]) ) next
        CDF[[test.type]] <- stats[[test.type]][[type]]$CDF
        CDF[[test.type]]$name <- test.type
        numhit[test.type] <-  stats[[test.type]][[type]]$numhit
        hitnum[test.type] <-  stats[[test.type]][[type]]$hitnum
        tnum[test.type] <-  stats[[test.type]][[type]]$tnum
        height[test.type,] <- stats[[test.type]][[type]]$height
    }
    ## save as RData file for each segmentation
    if ( save.rdata ) {
        file.name <- file.path(out.path,testid,"segtypes",
                               paste0(type,fname,".RData"))
        overlaps <- list(CDF=CDF,
                         numhit=numhit,
                         hitnum=hitnum,
                         tnum=tnum,
                         height=height)
        save(overlaps,file=file.name)
    }
    
    ## plot
    file.name <- file.path(out.path,testid,"segtypes",
                           paste(type,"_ratioTotal",fname,sep=""))
    plotdev(file.name,width=3.5,height=3.5,type=fig.type)
    par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
    leg <- NULL
    xmax <- 3
    x <- seq(0,xmax,.05)
    plot(1,type="l",main=NA,col=NA,lty=1,
         xlab="ratio: query length/target length",xlim=c(0,xmax),
         ylab="cum.dist.func.",ylim=c(0,1.1))
    abline(v=c(ovlth,2-ovlth),lty=2)
    abline(h=c(minf,.8),lty=2)
    abline(h=0:1, lty=2, col="gray",lwd=.75)
    for ( i in 1:length(CDF) ) {
        if ( length(CDF[[i]])<2 ) next
        col <- tcols[CDF[[i]]$name] # todo: tid
        if ( !is.null(CDF[[i]]$rcdf) ) {
            lines(x,CDF[[i]]$rcdf(x),col=paste(col,"AA",sep=""),lty=1,lwd=2) 
            leg <- c(leg, CDF[[i]]$name) # todo: tid
        }
    }
    mtext(paste(testid,"recovery by segments"),3,0,cex=1.5)
    if ( !is.null(leg) )
        legend("bottomright",leg,col=tcols[leg],lty=1,lwd=2,bty="n")
    legend("topleft",paste(type))
    dev.off()

    ## CONSSEG
    if ( Sys.Date() < as.Date("2021-02-01") ) {
        ## plot
        file.name <- file.path(out.path,testid,"segtypes",
                               paste(type,"_ratioTotal_consseg",fname,sep=""))
        plotdev(file.name,width=3.5,height=3.5,type=fig.type,res=300)
        par(mai=c(.5,.5,.125,.125),mgp=c(1.3,.4,0),xaxs="i")
        leg <- NULL
        xmax <- 3
        x <- seq(0,xmax,.05)
        plot(1,type="l",main=NA,col=NA,lty=1,
             xlab="ratio: query length/target length",xlim=c(0,xmax),
             ylab="cum.dist.func.",ylim=c(0,1.05))
        abline(v=c(ovlth,2-ovlth),lty=2)
        abline(h=c(minf,.8),lty=2)
        abline(h=0:1, lty=2, col="gray",lwd=.75)
        for ( i in 1:length(CDF) ) {
            if ( length(CDF[[i]])<2 ) next
            col <- tcols[CDF[[i]]$name] # todo: tid
            if ( !is.null(CDF[[i]]$rcdf) ) {
                lines(x,CDF[[i]]$rcdf(x),col=i,lty=1,lwd=2)
                leg <- c(leg, CDF[[i]]$name) # todo: tid
            }
        }
        #mtext(paste(testid,"recovery by segments"),3,0,cex=1.5)
        if ( !is.null(leg) ) {
            leg <- gsub("D:dft1-7.dcash.snr_T:raw_K:12_S:icor_","",leg)
            legend("bottomright",leg,col=1:length(CDF),cex=.7,lty=1,lwd=2,
                   bty="o",bg="#FFFFFF",box.col=NA)
        }
        legend("topleft",paste("query:",type),seg.len=0,
              bg="#FFFFFFFF",y.intersp=0.1, x.intersp=0)
        dev.off()
    }

    
    file.name <- file.path(out.path,testid,"segtypes",
                           paste(type,"_ratioTotal_rng",fname,sep=""))
    plotdev(file.name,width=2+.2*length(CDF),height=6,type=fig.type)
    par(mai=c(3.3,.75,.1,.1))
    plot(0,col=NA,ylim=c(0,1.2),xlim=c(0,length(CDF)+1),
         axes=FALSE,xlab=NA,ylab=NA)
    axis(2)
    abline(h=c(minf,.8),lty=2)
    leg <- NULL
    for ( i in 1:length(CDF) ) {
        if ( "rcdf" %in% names(CDF[[i]]) ) {
            if ( diff(height[i,]) > 0 )
                arrows(x0=i,x1=i,y0=height[i,1],y1=height[i,2],
                       col=tcols[names(CDF)[i]],lty=1,length=0.1,
                       angle=90,code=3)
            text(i, 1.1, round(diff(height[i,]),3),srt=90)
        }
    }
    axis(1,at=1:length(CDF),labels=names(CDF),las=2)
    dev.off()
}
} # end of loop over real vs. random!

if ( !interactive() ) quit(save="no")


### TEST-SET ANALYSES
#### TODO; allow segmentRevery to work with transcripts vs features

## ORF recovery by transcripts!
trcl <- segmentAnnotate(query=clorf,target=genes,
                        add.na=TRUE,details=TRUE,sort=TRUE)
## plot cluster gene recovery by published transcripts
trst <- getOverlapStats(trcl,tnum=nrow(genes),
                        qnum=nrow(clorf),qid="clorf", ovlth=ovlth)
file.name <- file.path(paste("orfXtranscripts_ratioTotal",sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
xmax <- 2
x <- seq(0,xmax,.05)
plot(x, trst$CDF$rcdf(x),type="l",col=1,lty=1,xlim=c(0,xmax),xlab="transcript length/ORF length",ylab="cum.dist.func.",main=NA,ylim=c(0,1))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
mtext(paste("ORF recovery by ORF transcripts"),3,0,cex=1.5)
legend("bottomright","ORF recovery by ORF transcripts",bty="n")
legend("topleft","")
dev.off()

## cluster ORF recovery by transcripts
## CLUSTER ORFS
test <- "clgenes"
trgs <- genes
#trgs[trgs[,"CL_rdx"]=="","CL_rdx"] <- "na"
tpcol <- "CL_rdx"
cpcol <- "CL_rdx_col"
## output dir
dir.create(file.path(out.path,test),recursive=TRUE)

## test types
ttypes <- as.character(trgs[,tpcol])
ttypes[ttypes==""] <- "na"
ttypes[is.na(ttypes)] <- "na"
test.types <- sort(names(table(ttypes)))
tcols <- sapply(test.types,
                function(x)
                    as.character(trgs[which(trgs[,tpcol]==x)[1],cpcol]))

tcols["na"] <- "#939393"
                
## results
ovlstats <- rep(list(NA),length(test.types))
names(ovlstats) <- test.types

query <- transcripts[transcripts[,"type"]=="ORF",]
for ( test.type in test.types ) {

    target <- trgs[ttypes==test.type,,drop=FALSE]
     
    cat(paste("transcript overlap with",test.type,"\t",time(),"\n"))
    
    ovl <- segmentAnnotate(query=query,target=target, add.na=TRUE,details=TRUE)
    sts <- list(CDF=NA,hitnum=0,qid=test.type,tid="ORF")

    if ( any(!is.na(ovl[,"query"])))
        sts <- getOverlapStats(ovl,ovlth=.8,qid=test.type,tid="ORF",
                               tnum=nrow(target),qnum=nrow(query))
    
    ovlstats[[test.type]] <- sts
}
rm <- unlist(lapply(ovlstats, function (x)
                    !any(x$hitnum>0)))
ovlstats <- ovlstats[!rm]
test.types <- test.types[!rm]

## fuse results to primary result list - TODO: not fused?
CDF <- rep(list(NA),length(test.types))
names(CDF) <- test.types
for ( test.type in test.types ) {
    CDF[[test.type]] <- ovlstats[[test.type]]$CDF
    CDF[[test.type]]$name <- test.type
}
 
## plot
file.name <- file.path(paste("clusterXtranscripts_ratioTotal",sep=""))
plotdev(file.name,width=3.5,height=3.5,type=fig.type)
par(mai=c(.75,.75,.5,.1),mgp=c(1.75,.5,0),xaxs="i")
leg <- NULL
xmax <- 3
x <- seq(0,xmax,.05)
plot(x, CDF[[1]]$rcdf(x),type="l",col=NA,lty=1,xlim=c(0,xmax),xlab="ratio: query length/target length",ylab="cum.dist.func.",main=NA,ylim=c(0,1))
abline(v=c(ovlth,2-ovlth),lty=2)
abline(h=c(minf,.8),lty=2)
abline(h=0:1, lty=2, col="gray",lwd=.75)
for ( i in 1:length(CDF) ) {
    col <- tcols[CDF[[i]]$name]
    if ( "rcdf" %in% names(CDF[[i]]) )
        lines(x,CDF[[i]]$rcdf(x),col=paste(col,"AA",sep=""),lty=1,lwd=2) 
    leg <- c(leg, CDF[[i]]$name)
}
mtext(paste(test,"recovery by ORF transcripts"),3,0,cex=1.5)
legend("topleft","",bty="n")
legend("bottomright",leg,col=tcols[leg],lty=1,lwd=2,bty="n")
dev.off()


if ( !interactive() ) quit(save="no")
raim/segmenTools documentation built on April 30, 2024, 4:53 a.m.