#!/usr/bin/env Rscript
library("segmenTools") # coor2index
library("segmenTier") # main segmentation algorithm
suppressPackageStartupMessages(library(optparse))
#library("Rcpp")
#source("~/programs/segmenTier/R/cluster.R")
#source("~/programs/segmenTier/R/segment.R")
#sourceCpp("~/programs/segmenTier/src/cluster.cpp")
#sourceCpp("~/programs/segmenTier/src/segment.cpp")
## nicer timestamp
time <- function() format(Sys.time(), "%Y%m%d %H:%M:%S")
## messages
msg <- function(x) cat(x, file=stdout()) # until piping is implemented
### PARSE OPTIONS
option_list <- list(
## INPUT OPTIONS
make_option(c("-i", "--infile"), type="character", default="",
help="chromosome coordinates of primary segments"),
make_option(c("--chrfile"), type="character", default="",
help="chromosome index file, providing a sorted list of chromosomes and their lengths in column 3 [default %default]"),
make_option(c("--datafile"), type="character", default="",
help="full data set, RData that contains the time series in matrix 'ts' and its genomic coordinates in matrix 'coor' [default %default]"),
make_option(c("--primdir"), type="character", default="primarysegments",
help="individual data sets for primary segments [default %default]"),
make_option(c("--primfiles"), type="character", default="primseg_",
help="individual data sets for primary segments [default %default]"),
make_option(c("--use.data"), action="store_true", default=FALSE,
help="use the full data set (helper, will be obsolete)"),
## OUTPUT OPTIONS
make_option(c("-v", "--verb"), type="integer", default=1,
help="0: silent, 1: main messages, 2: warning messages"),
make_option(c("--fig.type"), type="character", default="png",
help="figure type, png or pdf [default %default]"),
make_option(c("--do.test"), action="store_true", default=FALSE,
help="only do testsets"),
make_option(c("--only.plot"), action="store_true", default=FALSE,
help="only plot existing segmentations; useful to check ongoing results during longer runs"),
make_option(c("--redo"), action="store_true", default=FALSE,
help="overwrite existing segmentations; leave FALSE if you merely want to plot existing segmentations"),
make_option(c("--write.empty"), action="store_true", default=FALSE,
help="write result files for also for pre-segments that were skipped (length<min.size, length>max.size), didnt comprise enough data for clustering ('not enough data', 'not enough data diversity'), or no segments where found ('no segments'); the csv file of segments will be empty), the RData file will contain the call arguments (list 'opt') and all objects created before abortion or NULL ('tset', 'cset', 'sset')"),
make_option(c("--save.matrix"), action="store_true", default=FALSE,
help="NOT IMPLEMENTED: store the total score matrix S(i,c) and the backtracing matrix K(i,c) in the RData file"),
make_option(c("--plot"), action="store_true", default=FALSE,
help="plot segment figures"),
make_option(c("--genbro"), type="character", default="",
help="path for the R genome browser sources"),
make_option(c("--gendat"), type="character", default="",
help="path for the R genome data sources"),
make_option(c("--idsuffix"), type="character", default="",
help="suffix for segment IDs [default %default]"),
make_option(c("--type.name"), type="character", default=c("T,D"),
help="parameters to use for IDs [default %default]"),
make_option(c("--short.name"), action="store_true", default=FALSE,
help="strip down segment types to varied values [default %default]"),
make_option(c("--genome"), type="character", default="yeast_R64-1-1",
help="in plots, additionally show annoted features and reported transcripts; this requires installation of the `R Genome Browser' and data sets and is currently only available for `yeast_R64-1-1"),
make_option(c("-n", "--outname"), type="character", default="primseg",
help="output filename prefix"),
make_option(c("-o", "--outdir"), type="character", default=".",
help="path to output directory"),
## PRIMARY SEGMENT SELECTION
make_option(c("--segs"), type="character", default="",
help="primary segment numbers, all or test if empty, comma-separated list [default: %default]"),
make_option(c("--use.empty"), action="store_true", default=FALSE,
help="cluster inter-segment regions instead"),
make_option(c("--min.sz"), type="integer", default=2,
help="minimal size of segments (non cluster-0 values) [default: %default]"),
make_option(c("--max.sz"), type="integer", default=3e6,
help="maximal size of segments (non cluster-0 values) [default: %default]"),
## TIMESERIES PROCESSING
make_option(c("--trafo"), type="character", default="raw",
help="time-series transformation function, R base functions like 'log', and 'ash' for asinh is available [default %default]"),
make_option("--low.thresh", default=-Inf,
help="cut-off to set low values to 0 [default %default]"),
## DISCRETE FOURIER TRANSFORM
make_option(c("--use.fft"), action="store_false", default=TRUE,
help="do DFT of time-series [default %default]"),
make_option(c("--dft.range"), type="character", default="1,2,3,4,5,6,7",
help="DFT components to use, comma-separated list of integers [default %default]"),
make_option(c("--use.snr"), action="store_false", default=TRUE,
help="do SNR of the DFT [default %default]"),
make_option(c("--dc.trafo"), type="character", default="raw",
help="DC component transformation function, see --trafo [default %default]"),
## CLUSTERING K-MEANS
make_option(c("--K"), type="character", default="12,16,20",
help="number of clusters to use in k-means, comma-separated [default %default]"),
make_option(c("--iter.max"), type="integer", default=100000,
help="max. iterations in k-means [default %default]"),
make_option(c("--nstart"), type="integer", default=100,
help="initial k-means configurations [default %default]"),
make_option("--nui.thresh", default=0.6,
help="cluster correlation threshold to re-assign points to nuissance [default %default]"),
## SEGMENTATION PARAMETERS
make_option("--nui.cr", type="character", default="2",
help="correlation of nuissance cluster with others (-) and itself (+), comma-separated characters [default %default]"),
make_option(c("--scores"), type="character", default="ccor,icor",
help="scoring functions to use, comma-separated characters [default %default]"),
make_option(c("--scales"), type="character", default="1,3,5",
help="scale exponent for similarity matrices, comma-separated doubles [default %default]"),
make_option(c("--M"), type="character", default="175",
help="minimal length penalty, comma-separated integers [default %default]"),
make_option(c("--Mn"), type="character", default="15",
help="minimal length penalty [default %default]"),
make_option(c("--multi"), type="character", default="max",
help="handling of multiple max. score k in scoring, min or max [default %default]"),
make_option(c("--multib"), type="character", default="max",
help="handling of multiple max. score k in back-tracing, min or max [default %default]"),
make_option(c("--nextmax"), type="character", default="T",
help="in back-tracing, search for the next non-decreasing S(i,c) before initializing a new segment [default %default]"),
## POST-PROCESSING
make_option("--fuse.thresh", default=0.2,
help = "correlation threshold between clusters to fuse adjacent segments [default \"%default\"]")
)
## get command line options
opt <- parse_args(OptionParser(option_list=option_list))
## process comma-separated list arguments
lst.args <- c(segs="integer",
type.name="character",
dft.range="integer",
K="integer",
scores="character",
scales="numeric",
M="integer", Mn="integer", nui.cr="integer",
nextmax="logical",
multi="character",multib="character")
for ( i in 1:length(lst.args) ) {
idx <- which(names(opt)==names(lst.args)[i])
## get individual values
tmp <- as.list(unlist(strsplit(opt[[idx]], ",")))
## expand ranges
if ( length(tmp)>0 )
if ( lst.args[i]=="numeric" | lst.args[i]=="integer" )
for ( j in 1:length(tmp) ) { # only for numeric modes
tmp2 <- unlist(strsplit(tmp[[j]], ":"))
if ( length(tmp2)>1 ) {
tmp2 <- as.numeric(tmp2)
tmp[[j]] <- tmp2[1]:tmp2[2]
}
}
if ( length(tmp)>0 )
opt[[idx]] <- unlist(tmp)
mode(opt[[idx]]) <- lst.args[i]
}
## promote options to main environment and print all arguments
if ( opt$verb>0 )
cat(paste("SETTINGS:\n"))
for ( i in 1:length(opt) ) {
if ( opt$verb>0 )
cat(paste(names(opt)[i], "\t", #typeof(opt[[i]]),
paste(opt[[i]],collapse=", "), "\n",sep=""))
arg <- names(opt)[i]
assign(arg, opt[[arg]])
}
## clear NAs - this happens for missing segment arg (due to char-int mapping)
segs <- segs[!is.na(segs)]
### OUTPUT FILENAME PREFIX
outname <- file.path(outdir,"primseg")
### START
cat(paste("LOADING SEQUENCING DATA\t", time(), "\n",sep=""))
## TODO: move this out and pre-write primary segment data
## via a separate function writing out primarySegments
## or use the infile approach?
## load time-series and oscillation data: coor, ts, osc
## only "ts", the original time-series and "coor", the corresponding
## chromosome coordinates are used here!
if ( chrfile != "" ) {
cat(paste("Using chromosome index file:",chrfile, "\n"))
cf <- read.table(chrfile, sep="\t",header=FALSE)
chrS <- c(0,cumsum(cf[,ncol(cf)])) ## index of chr/pos = chrS[chr] + pos
} else {
##datafile <- file.path("/data/yeast/RNAseq/results/genomeData/Sacchromyces.cerevisiae.clean.S288C_oscillation.RData")
cat(paste("Using full data file:", datafile, "\n"))
load(datafile)
chr <- unique(coor[,"chr"])
chrS <- rep(0,length(chr)+1)
for ( i in 2:length(chrS) ) {
idx <- which(coor[,"chr"]==chr[i-1])
chrS[i] <- idx[length(idx)] - nrow(coor)/2
}
}
## NOT USED:
#rawfile <- file.path(data.path,"genomeData",
# "Sacchromyces.cerevisiae.clean.S288C_timeseries.csv")
## LOAD SELECTED PRIMARY SEGMENTS
cat(paste("Loading primary segments:",infile,"\n"))
primseg <- read.table(infile, sep="\t",header=TRUE)
primseg <- coor2index(primseg, chrS)[,c("ID","start","end")]
prdf <- primseg[,"end"] - primseg[,"start"] + 1
## set segment name (used in segment IDs and file names)
## TESTSETS
## primseg3 tests
tests <- list(known=c(srg1=436,gdh3=4),
nonsplit=c(3133,16,39,100,2360,2427),
long5utr=c(34),
short3utr=c(38),
splitgene=c(37),
fiveprimedeg=c(4,14,21,25))
alltests <- unique(unlist(tests)) # new primseg3 20161115
### SELECT SEGMENTS
if ( length(segs)==0 ) {
## smallest segments first
sets <- rev(order(prdf)) # order(prdf) #
if ( do.test ) {
cat(paste("DEVEL OPTION: CALCULATING TESTSETS\n"))
sets <- alltests
}
} else { ## command-line
sets <- segs
}
if ( !only.plot )
cat(paste("CALCULATING SEGMENTATIONS\t", time(), "\n",sep=""))
cat(paste("TESTSETS\t", length(sets), "\n",sep=""))
do.sets <- sets
if ( only.plot ) { # skip segmentation; continue at plots
do.sets <- c()
plot <- TRUE
}
## write empty result files
## handy for checking completeness of results when stdout/stderr
## are redirected on clusters and may be lost (e.g. on PBS without
## option -k oe)
writeEmpty <- function(outname, segid, opt, tset, cset, sset) {
file.name <- file.path(paste(outname,"_",segid,sep=""))
## segments.csv: empty file
file.create(paste(file.name,"_segments.csv",sep=""))
## segments.RData: all available objects
save(opt, segid, tset, cset, sset,
file=paste(file.name,"_segments.RData",sep=""))
}
### RUN SEGMENTATION
for ( i in do.sets ) {
## generate segment id
segf <- as.character(primseg[i,"ID"])
##segid <- str_pad(i,5,pad="0")
## TODO: add back number to segid/file.name
## for easer maintenance of parameter scan results
if ( idsuffix!="" )
segid <- paste(segf, idsuffix, sep="_")
cat(paste("PRIMARY SEGMENT\t", segid, "\t", which(sets==i), "of",
length(sets),"\n",sep=""))
rng <- primseg[i,"start"]:primseg[i,"end"]
strand <- idx2str(primseg[i,"start"],chrS)
## already done?
file.name <- file.path(paste(outname,"_",segid,"_segments.csv",sep=""))
if ( file.exists(file.name) & !redo ) {
cat(paste("\talready done\n"))
next
}
## minimial size of segment in terms of expressed points!
if ( length(rng) < 2 ) {
cat(paste("\tinvalid primary segment of length <2\n"))
if ( write.empty )
writeEmpty(outname, segid, opt, NULL, NULL, NULL)
next
}
## minimial size of segment in terms of expressed points!
if ( length(rng) < min.sz ) {
cat(paste("\ttoo short:",length(rng),"\n"))
if ( write.empty )
writeEmpty(outname, segid, opt, NULL, NULL, NULL)
next
}
## maximal size of segment in terms of expressed points!
if ( length(rng) > max.sz ) {
cat(paste("\ttoo long:",length(rng),"\n"))
if ( write.empty )
writeEmpty(outname, segid, opt, NULL, NULL, NULL)
next
}
## load time-series
#ts <- read.table(rawfile,sep="\t",header=FALSE,colClasses="numeric",
# skip=primseg[i,"start"],
# nrows=primseg[i,"end"]-primseg[i,"start"]+1)
if ( !use.data ) {
segdata <- file.path(primdir, paste(primfiles,segf,".csv",sep=""))
tsd <- read.table(segdata, sep="\t", header=TRUE)
} else
tsd <- ts[rng,]
## reverse data from reverse strand!
## both strands are handled in direction of transcription!
if ( strand==-1 )
tsd <- tsd[nrow(tsd):1,]
cat(paste("CLUSTERING\t",time(),"\n",sep=""))
## process time series, get DFT etc.
## TODO: add processing info to tset
## allow multiple processing, and take over IDs in cluster
tset <- processTimeseries(ts=tsd, na2zero=TRUE, use.fft=use.fft,
trafo=trafo, dc.trafo=dc.trafo,
dft.range=dft.range,
use.snr=use.snr, low.thresh=low.thresh)
## cluster time series
## TODO: use global clustering and calculate icor locally
## TODO: calculate global clustering from "training set"
## e.g. previous clustering of ORF (from microarrays)
## * get cluster gene ranges
## * calculate avg DFT (or DFT of timeseries avg?)
## by segmentAvg code from segmentDynamics.R
## * pass clustering to clusterTimeseries and calculate Pci
cset <- clusterTimeseries(tset,K=K, iter.max=iter.max, nstart=nstart,
nui.thresh=nui.thresh, verb=verb)
## segment all clusterings for different scoring functions
allsegs <- NULL
if ( is.null(cset) ) {
cat(paste("WARNING", i, segid, "no clustering\n"))
if ( write.empty )
writeEmpty(outname, segid, opt, tset, NULL, NULL)
} else {
## collect varysettings
vary <- list(## scoring function
E=scales, S=scores,
## scoring params
M=M, Mn=Mn, a=2, nui=nui.cr,
## backtrace params
nextmax=nextmax, multi=multi,multib=multib)
## note: we add time-series processing to the segment
## type names, allowing to externally vary (multiple
## distinct calls to this script) and later co-analyse
## them.
sset <- segmentCluster.batch(cset, varySettings=vary,
verb=verb,
fuse.threshold=fuse.thresh,
id=segid, type.name=type.name,
#short.name=short.name,
save.matrix=save.matrix)
allsegs <- sset$segments # SEGMENTS!
## plotSegmentation(tset,cset,sset)
if ( is.null(allsegs) ) {
cat(paste("no segments\n"))
if ( write.empty )
writeEmpty(outname, segid, opt, tset, cset, sset)
} else {
## CHROMOSOMAL COORDINATES
## 1) add/subtract to primseg indexed coordinates
if ( strand==-1 ) {
segcoors <- cbind(chr = rep(NA, nrow(allsegs)),
primseg[i,"end"] + 1 -
allsegs[, c("start","end"), drop=FALSE],
strand=rep(NA,nrow(allsegs)))
} else {
segcoors <- cbind(chr = rep(NA, nrow(allsegs)),
allsegs[, c("start","end"), drop=FALSE] +
primseg[i,"start"] - 1,
strand=rep(NA,nrow(allsegs)))
}
## 2) map back to chromosomes
segcoors <- index2coor(segcoors,chrS)
allsegs <- cbind(allsegs[,c("ID","type")],
segcoors,
allsegs[,c("CL","color","fuse"),drop=FALSE])
## store clustering data
centers <- cset$centers
clusters <- cset$clusters
if ( strand == -1 )
clusters <- clusters[nrow(clusters):1,]
## save data!
file.name <- file.path(paste(outname,"_",segid,sep=""))
## as table
write.table(allsegs,sep="\t",
file=paste(file.name,"_segments.csv",sep=""),
quote=FALSE,row.names=FALSE,col.names=TRUE)
## as RData, mainly for plots below
sset$segments <- allsegs
save(opt, segid, tset, cset, sset,
file=paste(file.name,"_segments.RData",sep=""))
}
}
}
if ( !plot ) {
cat(paste("DONE SEGMENTATION AT\t",time(),"\n",sep=""))
quit(save="no")
}
cat(paste("PLOTTING\t",time(),"\n",sep=""))
## TODO: load plotting specific parameters
## and align with opt from saved RData file
columns <- c(name="name", chr="chr", strand="strand",
start="start", end="end", type="type", color="color")
## LOAD YEAST GENOME BROWSER
##genome <- "yeast_R64-1-1"
if ( genome=="yeast_R64-1-1" ) {
### REQUIRES ENVIRONMENT VARIABLES SET TO YEAST GENOME BROWSER & DATA!
### TODO: make genomeBrowser a package; introduce official IDs for genomes
### see https://gitlab.com/raim/genomeBrowser and raim@tbi.univie.ac.at
### for the required data files
cat(paste("genome data\t", genome, "\n",sep=""))
## for genome data and plots - todo - skip this!
if ( genbro=="" )
genbro <- Sys.getenv("GENBRO")
source(file.path(genbro,"src/genomeBrowser.R")) ## for loadData
source(file.path(genbro,"src/genomeBrowser_utils.R")) ## plotFeature
if ( gendat=="" )
gendat <- Sys.getenv("GENDAT")
gendat <- file.path(gendat,"yeast")
## TODO: load data from files to make independent of
## genomeBrowser/genomeData
## PLOT SETTINGS FOR TESTSETS
fcolumns <- columns
fcolumns["color"] <- "CL_rdx_col"
ftypes <- c( "gene_cassette" , "gene",
"five_prime_UTR_intron", "intron",
"dubious" , "pseudogene",
"rRNA" , "tRNA",
"tRNA.intron" , "ncRNA",
"snRNA" , "snoRNA",
"telomere" , "centromere",
"ARS" , "transposable_element_gene",
"LTR_retrotransposon" , "repeat_region")
## LOAD DATA SETS
cat(paste("Loading annotation data\n"))
testIDs <- c("transcripts","annotation")
dataSets <- loadData(testIDs, data.path=gendat)
dataSets[["annotation"]]$settings$names <- FALSE
}
### PLOTTING
for ( i in sets ) {
## generate segment id
segf <- primseg[i,"ID"]
##segid <- str_pad(i,5,pad="0")
if ( idsuffix!="" )
segid <- paste(segf, idsuffix, sep="_")
cat(paste("Primary segment\t", segid, "\t", which(sets==i), "of",
length(sets),"\n",sep=""))
file.name <- file.path(paste(outname,"_",segid,sep=""))
## already plotted?
if ( file.exists(paste(file.name,fig.type,sep=".")) & !redo ) {
cat(paste("\talready plotted.\n"))
next
}
## load data
dfile <- paste(file.name,"_segments.RData",sep="")
if ( !file.exists(dfile) ) {
cat(paste("\tnot yet calculated.\n"))
next
}
load(dfile)
## record settings of plotted data
## TODO: write setting file before segmentation run and
## use to load into analysis scripts?
sink(paste(file.name,"_settings.dat",sep=""))
if ( opt$verb>0 )
cat(paste("LOADED SETTINGS:\n"))
for ( j in 1:length(opt) ) {
if ( opt$verb>0 )
cat(paste(names(opt)[j], "\t", #typeof(opt[[i]]),
paste(opt[[j]],collapse=", "), "\n",sep=""))
##arg <- names(opt)[i]
##assign(arg, opt[[arg]])
}
sink()
## plot
## TODO:
## consolidate to plot functions, aligned with segment_data.R
## plot.tset(tset)
## plot.cset(cset, k)
## plot.sset(sset, k)
## plotAll(tset,cset,sset) # plot by k in fitting order
coors <- index2coor(t(c(chr=1,unlist(primseg[i,c("start","end")]))),chrS)
strand <- ifelse(coors[,"strand"]==-1, "-", "+")
xaxis <- coors[,"start"]:coors[,"end"]
N <- nrow(tset$ts)
## revert back timeseries!
## TODO: use function or move to plot
if ( strand=="-" ) {
tset$tot <- rev(tset$tot)
tset$zero.vals <- rev(tset$zero.vals)
tset$ts <- tset$ts[nrow(tset$ts):1,]
}
## adapt width to segment length!
## TODO: adapt left mai according to longest type name!
width <- 2.5 + N/1e3 # 1 kb per inch; plut left margin
if ( fig.type=="png" )
width <- min(c(width, 326)) # APPARENTLY THE MAX. WIDTH ALLOWED
nsg <- length(sset$ids)
nrows <- 3 + ncol(cset$clusters) + ifelse(genome=="yeast_R64-1-1",2,0)
if ( save.matrix )
nrows <- nrows + length(sset$SK)
heights <- rep(0.7,nrows)
heights[4] <- max(c(0.7, nsg * 0.7/10))
height <- sum(heights)
plotdev(file.name,width=width,height=height,type=fig.type)
## TODO: replace mfcol by layout and adjust height
## with segment number
par(mai=c(.01,1.25,.01,.01),mgp=c(1.7,.5,0),xaxs="i")
layout(matrix(1:nrows),widths=1,heights=heights)
plot(tset,plot=c("total","timeseries"),xaxis=xaxis)
for ( k in 1:ncol(cset$clusters) )
plot(cset, k=k)
plot(sset,plot="segments",xaxis=xaxis, lwd=2)
axis(1)
if ( genome=="yeast_R64-1-1" ) {
tmp <- plotFeatures(dataSets[["annotation"]]$data,
coors=coors, strand=strand,
typord=TRUE, cuttypes=TRUE,
axis1=TRUE, ylab=NA, names=TRUE,
columns=fcolumns, types=ftypes)
#axis(1)
tpy <- plotFeatures(dataSets[["transcripts"]]$data,
coors=coors, strand=strand,
typord=TRUE, cuttypes=TRUE, ylab=NA)
}
## PLOT SCORING MATRICES
if ( save.matrix )
plot(sset,plot="S",xaxis=xaxis)
dev.off()
}
cat(paste("DONE PLOTS AT\t",time(),"\n",sep=""))
quit(save="no")
## OLD TODO - keep until checked
segmentData <- function(segment=primseg[1,],data,kmax=30) {
### cluster DFT
## get data
## flowClust, Kmax=30, find best K via BIC, or use cluster DAG/TREE
## do on command-line or convert genomeOscillation.R to functions
## see clusterSegments.sh
clsk <- kmeans(data,centers,1:kmax)
## calculate cluster similarity
## run SegmentDP - use clustering or directly
## use similarity to cluster medians!?
## or BEST: cluster probability matrix
## back-trace winning clusters for each sub-segment
## calculate mean for winning clusters
## return: <sub-segment, cluster, segmentmedian>
## <cluster, clustermedian>
}
### 3) global clustering
clusterSegments <- function(secseg) {
## collect all secseg clustermedians
## cluster all clustermedians
## re-assign each cluster to global cluster
## OR simpler
## cluster segmentmedians
## return: <sub-segment, cluster, segmentmedian>
## <cluster, datamedian>
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.