Nothing
##################################################################
### MAIN FUNCTIONS
##################################################################
runSeqsumm = function(summ.win=50, file=NULL, folder=NULL, output.file="seqsumm_out.txt", samtools.path="samtools") {
if (! is.null(file)) {
if (! file.exists(file)) {
stop("File does not exist.")
} else {
folder = dirname(file)
filename = basename(file)
file.type = toupper(gsub(".*[.]","",filename))
if (! file.type%in%c("SAM","BAM"))
stop("File should have either SAM or BAM extension.")
}
} else {
if (! .folderExists(folder))
stop("Folder does not exist.")
}
wd = getwd()
setwd(folder)
if (length(list.files(pattern="^seqsumm_out.txt$")) > 0)
warning("A seqsumm output file already exists in the folder and will be overwritten.", immediate.=TRUE)
if (is.null(file)) {
filename = list.files(pattern=".sam")[1]
if (is.na(filename)) {
file.type = "BAM"
filename = list.files(pattern=".bam")[1]
if (is.na(filename)) {
setwd(wd)
stop("There are not any SAM or BAM files in the specified folder.")
}
} else {
file.type = "SAM"
}
}
cmd = ifelse(file.type=="BAM", paste(samtools.path," view ",filename,sep=""), paste("cat",filename))
message(paste("Reading records from", filename))
.C('seqsumm', as.integer(summ.win*1000), cmd, output.file)
setwd(wd)
}
readSeqsumm = function(build="", tumour.data=NULL, normal.data=NULL, folder=NULL, normal.folder=NULL, resample.win=NULL, sex=TRUE, nproc=2) {
.readSeqsumm = function(tab, columns=c(5:ncol(tab))) {
if (ncol(tab) < 5)
stop("The seqsumm file has a wrong format or is corrupt.")
rco = list()
m = min(columns)
for (chr in chrs) {
chrtab = tab[tab$chrom==chr,]
for (i in columns) {
chrtabti = chrtab[,i]
chrtabti = chrtabti[1:lengths[chr]] # make them of same length as the chromosome
chrtabti[is.na(chrtabti)] = 0 # fill sample windows with 0s if necessary
if (length(rco) < i-m+1)
rco[[i-m+1]] = chrtabti
else
rco[[i-m+1]] = c(rco[[i-m+1]], chrtabti)
}
}
rco
}
# check if build is present and supported by annotation
if (length(build)==0)
build = ""
supported = FALSE
if (! is.null(build))
if (! is.na(build) && nchar(build)>0)
if (build %in% supported.builds())
supported = TRUE
if (! supported)
message("Note: chromosome lengths will be estimated from sample.")
message("Reading summarized data...")
flush.console()
# read tumoural data
if (! is.null(tumour.data)) {
tab.tumour = as.data.frame(tumour.data)
} else {
if (.folderExists(folder)) {
if (.outputExists(folder))
tab.tumour = read.table(file.path(folder, "seqsumm_out.txt"), header=TRUE, comment.char="")
else
stop("No seqsumm output exists in the folder for the tumoural sample.")
} else {
stop("The folder for the tumoural sample does not exist.")
}
}
# read normal data if applicable
if (! is.null(normal.data)) {
tab.normal = as.data.frame(normal.data)
} else {
tab.normal = NULL
if (length(normal.folder) > 0 && !.folderExists(normal.folder))
warning("The folder for the normal sample does not exist. Only the tumoural sample will be read.")
if (.folderExists(normal.folder))
if (.outputExists(normal.folder))
tab.normal = read.table(file.path(normal.folder, "seqsumm_out.txt"), header=TRUE, comment.char="")
}
# perform downsampling if requested
if (!is.null(resample.win)) {
message("Resampling summarized data...")
flush.console()
tab.tumour = .resample(tab.tumour, resample.win, nproc)
if(!is.null(tab.normal))
tab.normal = .resample(tab.normal, resample.win, nproc)
}
# calculate window size
win = diff(tab.tumour$win.start[1:2])/1000
if (win == 0)
stop("Window size is zero.")
tab.tumour$chrom = as.character(tab.tumour$chrom)
chrs = as.character(rle(tab.tumour$chrom)$values)
# remove sex chromosomes if requested
if (! sex) {
if (build %in% c("hg17","hg18","hg19","hg20"))
sex.chrs = c("X","chrX","Y","chrY","23","chr23","24","chr24")
else
sex.chrs = c("X","chrX","Y","chrY")
wh = which(chrs%in%sex.chrs)
if (length(wh) > 0) {
chrs = chrs[-wh]
tab.tumour = tab.tumour[-which(tab.tumour$chrom%in%sex.chrs),]
if (! is.null(tab.normal))
tab.normal = tab.normal[-which(tab.normal$chrom%in%sex.chrs),]
}
}
# generate chromosome window indices
if (supported) {
info = paste(build,"_len",sep="")
data(list=info)
info.chrs = as.character(as.vector(get(info)[,1]))
info.lengths = as.integer(as.vector(get(info)[,2]))
} else {
info.chrs = chrs
info.lengths = sapply(info.chrs, function(i) max(tab.tumour$win.start[tab.tumour$chrom==i])+win*1000)
}
lengths = sapply(chrs, function(i) ceiling(info.lengths[info.chrs==i]/(win*1000)))
names(lengths) = chrs
all.chrs = rep(chrs, lengths)
all.pos = unlist(lapply(chrs, function(i) c(1:lengths[i])))
# create SeqCNAInfo object from input and calculated information
rcount.tumour = .readSeqsumm(tab.tumour)
if (!is.null(tab.normal)) {
rcount.normal = .readSeqsumm(tab.normal)
reads.gc = .readSeqsumm(tab.normal,3)[[1]]
reads.mapq = .readSeqsumm(tab.normal,4)[[1]]
} else {
rcount.normal = list()
reads.gc = .readSeqsumm(tab.tumour,3)[[1]]
reads.mapq = .readSeqsumm(tab.tumour,4)[[1]]
}
rco = new("SeqCNAInfo", tumour=rcount.tumour, normal=rcount.normal, gc=reads.gc, mapq=reads.mapq,
seq=all.chrs, pos=all.pos, build=build, win=win)
}
applyFilters = function(rco, pem.filter=0, trim.filter=0, mapp.filter=0, mapq.filter=0, cnv.filter=FALSE, plots=TRUE, folder=NULL, nproc=2) {
stop.text = "Run readSeqsumm first!"
if (length(rco@tumour) == 0) stop(stop.text)
if (length(rco@seq) == 0) stop(stop.text)
if (length(rco@pos) == 0) stop(stop.text)
if (length(rco@win) == 0) stop(stop.text)
use.normal = length(rco@normal)!=0
chrs = rle(rco@seq)$values
if (is.null(rco@build)) {
supported = FALSE
} else {
if (is.na(rco@build) || nchar(rco@build)==0)
supported = FALSE
else
supported = rco@build%in%supported.builds()
}
which.cnv = which.mapp = which.mapq = c()
which.filter = which.imp = which.q = which.trim = c()
if (supported) {
if (!use.normal || mapp.filter>0 || cnv.filter) {
message("Loading genome build information...")
data(list=rco@build)
data(list=paste(rco@build,"_len",sep=""))
genome.data = .buildGenomeInfo(get(rco@build), get(paste(rco@build,"_len",sep="")), chrs, rco@win, nproc=nproc)
}
if (use.normal) {
x = rco@normal[[1]]
} else {
x = c()
for (chr in chrs)
x = c(x, as.numeric(as.vector(genome.data$GC[which(genome.data$Chr==chr)])))
x[x==-1] = NA # if necessary, GC windows with code -1 recoded to NAs
}
# FILTER WINDOWS WITH CNVs or low mappability
if (cnv.filter) {
if (use.normal) {
cnv.filter = FALSE
message("Note: CNV filter not meaningful with paired-normal. Disabled.")
} else {
which.cnv = which(genome.data$CNV>0.95)
}
}
if (mapp.filter)
which.mapp = which(genome.data$Mapp<mapp.filter)
} else { # avoid CNV filter, and estimate GC content and mappability
if (use.normal) {
x = rco@normal[[1]]
} else {
x = rco@gc
x[x<=0] = NA
x[x>=1] = NA
message("Note: GC content will be estimated from sample.")
}
if (cnv.filter) {
message("Note: the genome build does not support CNV-based filtering.")
cnv.filter = FALSE
}
if (mapp.filter) {
message("Note: the genome build does not support mappability-based filtering.")
mapp.filter = 0
}
}
y = rco@tumour[[1]]
# (PRE-)NORMALIZE THE PROFILE FOR BETTER FILTER ASSESSMENT AND IMPROVED TRIM-BASED FILTERING
if (! use.normal) { # pre-normalize tumoural against GC
pre.x = x
pre.y = y
} else { # or normal against GC
pre.x = rco@gc
pre.y = x
}
method = "quadratic"
reg.method = ifelse(method=="loess", "loess", "lm")
which.ok = intersect(intersect(which(pre.y>0),which(!is.na(pre.y))),which(!is.na(pre.x)))
x.pos = pre.x[which.ok]
y.pos = pre.y[which.ok]
lomo = eval(call(reg.method, .makeFormula("x.pos","y.pos",method,use.normal,environment())))
y.norm = rep(NA, length(y))
z.pos = lomo$residuals / lomo$fitted
y.norm[which.ok] = z.pos
# FILTER WINDOWS WITH LOW MEAN MAPPING QUALITY
if (mapq.filter)
which.mapq = intersect(which(rco@mapq<mapq.filter), which.ok)
which.filter = unique(c(which.mapq, which.cnv, which.mapp))
# FILTER WINDOWS WITH HIGH RATIO OF IMPROPER PAIR-END MAPPINGS IN THE NORMAL (TUMOURAL) SAMPLE,
# A SIGN OF REPETITIVE SEQUENCES, MOSTLY HAPPENING AROUND THE CENTROMERIC REGIONS
pem.allowed = (use.normal && length(rco@normal) > 1) || length(rco@tumour) > 1
if (pem.filter && !pem.allowed)
message("Note: PEM filter will not be applied due to lack of PEM information.")
message("Applying filters...")
do.pem.filter = pem.filter && pem.allowed
if (do.pem.filter) {
types = c(2,4,5) # type 3 reads are non-existent in some alignments, so better not use them
if (use.normal)
improper = Reduce('+',(lapply(types,function(i)rco@normal[[i]])))/rco@normal[[1]]
else
improper = Reduce('+',(lapply(types,function(i)rco@tumour[[i]])))/rco@tumour[[1]]
thr = quantile(improper[is.finite(improper)], probs=1-pem.filter)
which.imp = which(improper > thr)
which.filter = union(which.filter, which.imp)
} else {
pem.filter = 0
}
# FILTER WINDOWS WITH EXTREME COUNTS, PRONE TO RESULT IN GREATER NOISE IN THE RATIO PROFILE
if (trim.filter[1]) {
if (use.normal && trim.filter[1]==1) {
trim.filter[1] = 0
message("Automatic trimming is only available without normal sample. Disabled.")
}
trim.val = ifelse (!use.normal&&trim.filter[1]==1, .autoTrim(z.pos,nproc=nproc), 1-trim.filter[1])
trim.val[2] = ifelse (length(trim.filter) > 1, trim.filter[2], 1-trim.val[1])
quants = quantile(z.pos, rev(trim.val))
which.trim = which(y.norm<quants[1] | y.norm>quants[2])
if (length(which.ok)>0)
which.trim = union(which.trim, c(1:length(y))[-which.ok])
which.q = intersect(which.trim, which.ok)
which.filter = union(which.filter, which.trim)
}
which.nok = c(1:length(y))[-which.ok]
# OUTPUT FILTERING RESULTS
zeroes.len = length(y)-length(z.pos)
message(" Windows without reads or info: ",appendLF=FALSE)
message(length(which.nok))
message(" Total filtered windows: ",appendLF=FALSE)
message(length(union(which.filter,which.nok)))
message(" Remaining windows: ",appendLF=FALSE)
message(length(y)-length(union(which.filter,which.nok)))
flush.console()
if (plots) {
mask = c(trim.filter[1]>0, mapp.filter>0, mapq.filter>0, pem.filter>0, cnv.filter)
l = list(which.trim, which.mapp, which.mapq, which.imp, which.cnv)[mask]
ll = length(unlist(l))
lf = length(which(mask==TRUE))
nrows = ifelse(lf>3,2,1)
ncols = ceiling(lf/nrows)
if (ll > 0) {
message("Plotting filtering...")
flush.console()
all.cols = c("#9E2862FF","#FF7400FF","#FF00BBFF","#006388FF","#92EC00FF")
cols = all.cols[mask]
pchs = c(1:5)[mask]
if (.folderExists(folder)) {
jpeg(file.path(folder,"seqCNA_filters.jpg"),1920,1080)
} else {
if (lf<4) m = t(matrix(c(1,1,rep(1,ncols),2,2,c(3:(lf+2))),ncol=2))
else m = t(matrix(c(1,1,1,1,2,2,3,4,2,2,5,6),ncol=3))
layout(m)
}
# profile with filtered windows
par(oma=c(0,0,0,0))
old.par = par(mar=c(5,4,4,0))
if (trim.filter[1]) {
z = y.norm
yl = quantile(z.pos[is.finite(z.pos)], c(trim.val[2]/2, 1-(1-trim.val[1])/2))
if (!use.normal) ylab = "Pre-normalized tumoural profile"
else ylab = "Normalized normal profile"
} else {
z = y
yl = c(0,quantile(y, 0.9995))
ylab = "Raw counts"
}
binsX = 500
bandX = ceiling(length(z) / binsX)
binsY = 200
bandY = diff(quantile(z[is.finite(z)],c(0.01,0.99))) / binsY
suppressMessages(suppressWarnings(
smoothScatter(postPlotHook=NULL, colramp=colorRampPalette(c("white","black")),
bandwidth=c(bandX,bandY), nbin=c(binsX,binsY),
z, cex=0.5, pch=19, col="#00000033", ylim=yl, cex.axis=1.5,cex.lab=1.5, ylab=ylab, frame.plot=FALSE)
))
for (i in 1:length(l))
points(l[[i]], z[l[[i]]], cex=1, pch=pchs[i], col=cols[i])
if (.folderExists(folder))
dev.off()
par(mar=old.par)
# Venn diagrams
if (lf > 1) {
venn.mask = c(pem.filter>0, mapp.filter>0, mapq.filter>0, cnv.filter, trim.filter[1]>0)
venn.l = list(which.imp,intersect(which.mapp,which.ok),intersect(which.mapq,which.ok),which.cnv,which.q)[venn.mask]
names(venn.l) = c("PEM","mapp.","quality","CNV","trimmed")[venn.mask]
venn.cols = c("#006388FF","#FF7400FF","#FF00BBFF","#92EC00FF","#9E2862FF")[venn.mask]
if (.folderExists(folder))
jpeg(file.path(folder,"seqCNA_filters_overlap.jpg"),1920,1080)
counts = list(sapply(.overLapper(setlist=venn.l, sep="_", type="vennsets")$Venn_List, length))
old.par = par(mar=rep(0,4))
dummy = .vennPlot(counts=counts, lines=venn.cols, lcol=venn.cols, mymain="", mysub="", ccex=1.25)
par(mar=old.par)
if (.folderExists(folder))
dev.off()
if (.folderExists(folder)) {
jpeg(file.path(folder,"seqCNA_filters_thresholds.jpg"),1920,1080)
par(mfrow=c(nrows,ncols))
}
}
# per-filter plots and thresholds
if (mask[1]) {
d = density(y.norm[y.norm<quants[2]*1.5 & y.norm>quants[1]*1.5],na.rm=TRUE)
plot(Inf, xlim=c(min(d$x),max(d$x)), ylim=c(0,max(d$y)),
xlab=ylab,ylab="Density",main="Trimming filter",cex.lab=1.5,cex.axis=1.5)
rect(quants[1], -max(d$y)*2, quants[2], max(d$y)*2, col=gsub("FF$","44",all.cols[1]), lty=0)
abline(h=0, col="grey")
lines(d, lwd=2)
abline(v=quants, col=all.cols[1], lwd=2)
text(quants[1],max(d$y),round(quants[1],2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[1])
text(quants[2],max(d$y),round(quants[2],2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[1])
}
if (mask[2]) {
d = density(genome.data$Mapp,na.rm=TRUE)
plot(Inf, xlim=c(0,1), ylim=c(0,max(d$y)),
xlab="Mappability",ylab="Density",main="Mappability filter",cex.lab=1.5,cex.axis=1.5)
rect(mapp.filter, -max(d$y)*2, 2, max(d$y)*2, col=gsub("FF$","44",all.cols[2]), lty=0)
abline(h=0, col="grey")
lines(d, lwd=2)
abline(v=mapp.filter, col=all.cols[2], lwd=2)
text(mapp.filter,max(d$y),mapp.filter,adj=c(-0.25,0.75), cex=1.5, col=all.cols[2])
}
if (mask[3]) {
d = density(rco@mapq,na.rm=TRUE)
plot(Inf, xlim=c(0,max(d$x)), ylim=c(0,max(d$y)),
xlab="Mean mapping quality",ylab="Density",main="Mapping quality filter",cex.lab=1.5,cex.axis=1.5)
rect(mapq.filter, -max(d$y)*2, max(d$x)*2, max(d$y)*2, col=gsub("FF$","44",all.cols[3]), lty=0)
abline(h=0, col="grey")
lines(d, lwd=2)
abline(v=mapq.filter, col=all.cols[3], lwd=2)
text(mapq.filter,max(d$y),mapq.filter,adj=c(-0.25,0.75), cex=1.5, col=all.cols[3])
}
if (mask[4]) {
d = density(improper[improper<thr*1.5],na.rm=TRUE)
plot(Inf, xlim=c(0,max(d$x)), ylim=c(0,max(d$y)),
xlab="Improper/proper read ratio",ylab="Density",main="PEM filter",cex.lab=1.5,cex.axis=1.5)
rect(-1, -max(d$y)*2, thr, max(d$y)*2, col=gsub("FF$","44",all.cols[4]), lty=0)
abline(h=0, col="grey")
lines(d, lwd=2)
abline(v=thr, col=all.cols[4], lwd=2)
text(thr,max(d$y),round(thr,2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[4])
}
if (.folderExists(folder)) dev.off()
} else {
message("There are no filtered windows to plot.")
}
}
rco@y = y
rco@x = x
rco@skip = as.numeric(which.filter)
rco
}
runSeqnorm = function(rco, norm.win=NULL, method="quadratic", lambdabreak=8, minSeg=7, maxSeg=35, nproc=2, plots=TRUE, folder=NULL) {
if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
if (length(rco@y) == 0) stop("Run applyFilters first!")
if (length(rco@x) == 0) stop("Run applyFilters first!")
if (!is.null(norm.win)) {
if ((norm.win/rco@win) %% 1 != 0)
message("Note: normalization window will be approximated to the closest multiple of the summarization window.")
} else {
if (length(rco@y) > 200000) {
f = round(length(rco@y) / 50000)
norm.win = rco@win * f
} else {
norm.win = rco@win
}
}
rcount.prof = .seqnorm(rco@x, rco@y, chr=rco@seq, skip=rco@skip, use.normal=length(rco@normal)!=0,
resolution=rco@win/norm.win, nproc=nproc,method=method, out.dir=folder,
out.jpg=file.path(folder,"seqCNA_normalization.jpg"), plots=plots,
lambdabreak=lambdabreak, minRegions=minSeg, maxRegions=maxSeg)
profiles = cbind(rco@seq, (rco@pos-1)*rco@win, round(rcount.prof,4))
colnames(profiles) = c("chrom", "win.start", "normalized")
rco@output = data.frame(profiles)
rco@output$normalized = as.numeric(as.vector(rco@output$normalized))
rco
}
runGLAD = function(rco, lambdabreak=8, nproc=2) {
if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
if (length(rco@y) == 0) stop("Run applyFilters first!")
if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
message("Running segmentation...")
flush.console()
which.ok = which(!is.na(rco@output$normalized))
chrs = rle(as.vector(rco@output$chrom))$values
ch = 0
all.prof = rco@output[, c(3,1)]
cluster = makeCluster(rep("localhost",nproc), type="SOCK")
registerDoSNOW(cluster)
res = foreach(ch=chrs, .packages="GLAD") %dopar% {
prof = all.prof[which(all.prof$chrom==ch), ]
input = cbind(c(1:nrow(prof)),prof)
colnames(input) = c("PosOrder","LogRatio","Chromosome")
profileCGH = as.profileCGH(data.frame(input))
res = glad(profileCGH, verbose=FALSE, bandwidth=1, lambdabreak=lambdabreak)$profileValues$Smoothing
}
stopCluster(cluster)
res = unlist(res)
rcount.prof.segm = rep(NA, nrow(rco@output))
rcount.prof.segm[which.ok] = res
rco@output$segmented = rcount.prof.segm
rco
}
applyThresholds = function(rco, thresholds, min.CN) {
if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
if (length(rco@y) == 0) stop("Run applyFilters first!")
if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
if (ncol(rco@output) < 4) stop("Run runGLAD first!")
cu = cut(rco@output$segmented, breaks=c(-Inf,thresholds,Inf), labels=min.CN+c(0:length(thresholds)))
rco@output$CN = cu
rco@thr = thresholds
rco@minCN = min.CN
rco
}
writeCNProfile = function(rco, folder) {
if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
write.table(rco@output, file=paste(folder,"/seqCNA_out.txt",sep=""), quote=FALSE,row.names=FALSE,sep="\t")
}
plotCNProfile = function(rco, folder=NULL) {
if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
profiles = rco@output
if (.folderExists(folder)) {
jpeg(file.path(folder,"seqCNA_out.jpg"),1920,1080)
cex = 2.5
} else {
cex = 1.4
}
par(mfrow=c(1,1))
par(mar=c(5,4,4,0.5))
par(oma=c(0,0,0,0))
binsX = 500
bandX = ceiling(length(profiles$normalized) / binsX)
binsY = 200
bandY = diff(quantile(profiles$normalized[is.finite(profiles$normalized)],c(0.01,0.99))) / binsY
suppressMessages(suppressWarnings(
smoothScatter(postPlotHook=NULL, colramp=colorRampPalette(c("white","black")),
bandwidth=c(bandX,bandY), nbin=c(binsX,binsY),
profiles$normalized, cex=0.5, pch=19, col="#00000055",
ylim=quantile(profiles$normalized[is.finite(profiles$normalized)],probs=c(0.005,0.995)),
xlab="Genomic window index", ylab="Normalized ratio", frame.plot=FALSE,cex.axis=1.5,cex.lab=1.5)
))
abline(v=which(diff(as.numeric(as.factor(rco@seq)))!=0), lwd=2)
minh = floor(min(profiles$normalized,na.rm=T))
maxh = ceiling(max(profiles$normalized,na.rm=T))
rang = maxh-minh
if (rang>5) byh = 1
else if (rang>1) byh = 0.2
else if (rang>0.2) byh = 0.05
else byh = 0.01
abline(h=seq(minh,maxh,by=byh),lty=2,col="grey",lwd=2)
if ("segmented"%in%colnames(profiles)) {
wh.ok = which(!is.na(profiles$segmented))
wh.ch = wh.ok[diff(profiles$segmented[wh.ok])!=0]
en = c(wh.ch, tail(wh.ok,1))
st = c(1, en[1:(length(en)-1)]+1)
vals = profiles$segmented[en]
dummy = sapply(1:length(en), function(o) lines(c(st[o],en[o]),rep(vals[o],2),lwd=4,col="green"))
}
if ("CN"%in%colnames(profiles)) {
for (cn in 0:length(rco@thr)) {
ypos = median(profiles$segmented[profiles$CN==rco@minCN+cn], na.rm=T)
text(0, ypos, paste("CN",rco@minCN+cn,sep=""), cex=cex, col="#ff0000", adj=1)
}
for (i in 1:length(rco@thr))
abline(h=rco@thr[i], col="#ff0000", lwd=3)
}
if (.folderExists(folder))
dev.off()
}
##################################################################
### SEQNORM FUNCTION
### this is the normalization function, which admits GC-based and
### paired normal modes, with different regression strategies
##################################################################
.seqnorm = function(x,y, nproc=1, resolution=1, use.normal=FALSE, skip=NULL, chr=rep(1,length(x)),
method=c("loess","cubic","quadratic")[3], out.dir=getwd(), out.jpg, plots=TRUE,
lambdabreak=8, minRegions=7, maxRegions=35) {
if (length(x) != length(y)) {
if (! use.normal)
stop("Tumour sample and GC content lengths differ.")
else
stop("Tumour and normal sample lengths differ.")
}
if (any(x[is.finite(x)]<0)) {
if (! use.normal)
stop("It seems that some GC content values are negative. Try recoding from -1 to NA in the GC file.")
else
stop("Hmmm... Something is not right, some read counts are negative!")
}
if (any(y[is.finite(y)]<0))
stop("Hmmm... Something is not right, some read counts are negative!")
if (nproc < 1)
stop("Try setting nproc to a positive number. At least one processor core is needed!")
if (resolution <= 0 || resolution >1)
stop("resolution should be greater than 0 and as big as 1.")
if (nproc > 64)
warning("Warning: unless you really have a computer with more than 64 processor cores, try setting nproc to a lower number.", immediate.=TRUE)
# work only with positive read counts and restore the rest after processing
ret = rep(NA, length(x))
xy.a = intersect(which(!is.na(x)), which(!is.na(y)))
xy.a = intersect(xy.a, intersect(which(x>0),which(y>0)))
if (! is.null(skip))
xy.a = setdiff(xy.a, skip) # filter certain windows expected to misbehave
x.a = x
y.a = y
chr.a = chr
x = x[xy.a]
y = y[xy.a]
chr = chr[xy.a]
chrtab = rle(as.vector(chr))$values
chrlist = lapply(chrtab, function(chri) which(chr==chri))
chrlist = chrlist[order(sapply(chrlist,head,1))]
# work with bigger summarized windows if resolution < 1
summ = 1
if (resolution < 1) {
chrlist.full = chrlist
chrlist = list()
max.ind = 0
x.full = x
y.full = y
summ = floor(1/resolution)
x = y = c()
for (l in chrlist.full) {
xchr = x.full[l]
ychr = y.full[l]
len = ceiling(length(xchr)/summ)
x = c(x, sapply(split(xchr, rep(c(1:len),rep(summ,len))[1:length(xchr)]), mean, na.rm=T))
y = c(y, sapply(split(ychr, rep(c(1:len),rep(summ,len))[1:length(ychr)]), mean, na.rm=T))
chrlist[[length(chrlist)+1]] = c(1:len) + max.ind
max.ind = tail(chrlist[[length(chrlist)]], 1)
}
}
message("Creating segments...")
flush.console()
mins = sapply(chrlist,head,1)
maxs = sapply(chrlist,tail,1)
reg.method = ifelse(method=="loess", "loess", "lm")
classicLomo = eval(call(reg.method, .makeFormula("x","y",method,use.normal,environment())))
classic.rcount.prof = classicLomo$residuals/classicLomo$fitted
cluster = makeCluster(rep("localhost",nproc), type="SOCK")
registerDoSNOW(cluster)
i = 0
bps = foreach(i=1:length(chrlist), .packages="GLAD") %dopar% {
prof = classic.rcount.prof[chrlist[[i]]]
input = cbind(c(1:length(prof)), prof, rep(1,length(prof)))
colnames(input) = c("PosOrder","LogRatio","Chromosome")
profileCGH = as.profileCGH(data.frame(input))
res = glad(profileCGH, verbose=FALSE, bandwidth=1)$profileValues$Smoothing
which(diff(res)!=0) + mins[i]-1
}
breaks = round(sort(c(1, unlist(bps), maxs[-length(maxs)], length(y)+1)))
segmLength = diff(breaks)
message(" Segments total: ", appendLF=FALSE)
message(length(segmLength))
flush.console()
segmDomain = sapply(c(1:(length(breaks)-1)), function(i) {
Reduce("-",quantile(x[breaks[i]:(breaks[i+1]-1)],c(0.95,0.05)))
} )
segmRange = sapply(c(1:(length(breaks)-1)), function(i) {
Reduce("-",quantile(classic.rcount.prof[breaks[i]:(breaks[i+1]-1)],c(0.95,0.05)))
} )
# filter regions based on three different metrics
minDomain = 0.95 # minimum span over X of a segment's middle 90% quantile (in quantiles)
maxRange = 0.85 # maximum span over Y of a segment's middle 90% quantile (in quantiles)
minLength = 0.85 # minimum segment length
stepp = 0.025
.getSegmLm = function() {
segmLm = intersect(which(segmLength>quantile(segmLength,minLength)), which(segmRange<quantile(segmRange,maxRange)))
intersect(segmLm, which(segmDomain>quantile(segmDomain,minDomain)))
}
segmLm = .getSegmLm()
while (length(segmLm) < min(minRegions, length(breaks)-1) && maxRange < 1-stepp) {
minDomain = minDomain - stepp
maxRange = maxRange + stepp
minLength = minLength - stepp
segmLm = .getSegmLm()
}
while (length(segmLm) > maxRegions && minDomain < 1-stepp) {
minDomain = minDomain + stepp
maxRange = maxRange - stepp
minLength = minLength + stepp
segmLm = .getSegmLm()
}
message(" Segments passing QC: ", appendLF=FALSE)
message(length(segmLm))
flush.console()
classic.rcount.prof = scale(classicLomo$residuals/classicLomo$fitted)
if (length(segmLm) < min(minRegions, length(breaks)-1)) {
message(" Not enough segments, falling back to typical regression.")
flush.console()
if (resolution < 1)
ret[xy.a] = y / predict(classicLomo, data.frame(x=x))
else
ret[xy.a] = classic.rcount.prof
return(ret)
}
message("Fitting regression models...")
flush.console()
if (resolution < 1) sel = seq(1,length(x.full),l=100000)
xminwh = which.min(x)
xmin = x[xminwh]
ymin = y[xminwh]
xmaxwh = which.max(x)
xmax = x[xmaxwh]
ymax = y[xmaxwh]
fit = foreach(i=c(1:(length(breaks)-1))[segmLm], .export=c(".makeFormula")) %dopar% {
rang = breaks[i]:(breaks[i+1]-1)
xLm = x[rang]
yLm = y[rang]
if (reg.method=="loess") {
xLm = c(xLm, xmin,xmax)
yLm = c(yLm, ymin,ymax)
}
lomo = eval(call(reg.method, .makeFormula("xLm","yLm",method,use.normal,environment())))
if (resolution < 1) p = x.full[sel]
else p = x
predict(lomo, data.frame("xLm"=p))
}
fit = do.call(rbind, fit)
# subtract shift to ref regression
baseLomo = median(classicLomo$fitted)
fit = t(apply(fit, 1, function(i) i / (median(i)/baseLomo)))
# for each point, merge sub-regression estimations
chunk.len = ceiling(ncol(fit)/nproc)
n = 0
medianFit = foreach(n=1:nproc) %dopar% {
fits = c()
for (i in c(1:chunk.len)+(n-1)*chunk.len)
if (i <= ncol(fit))
fits = c(fits, median(fit[,i]))
fits
}
stopCluster(cluster)
medianFit = unlist(medianFit)
if (resolution < 1) {
xSel = x.full[sel]
lomo = eval(call(reg.method, .makeFormula("xSel","medianFit",method,use.normal,environment())))
lomoFitted = predict(lomo, data.frame(xSel=x.full))
rcount.prof = y.full / lomoFitted
} else {
lomo = eval(call(reg.method, .makeFormula("x","medianFit",method,use.normal,environment())))
lomoFitted = lomo$fitted
rcount.prof = y / lomoFitted
}
rcount.prof = scale(rcount.prof)
if (plots) {
message("Plotting normalization...")
flush.console()
if (.folderExists(out.dir)) {
jpeg(file.path(out.dir,"seqCNA_segment_QC.jpg"),1000,700)
segmLengthCut = cut(segmLength, breaks=c(-Inf,quantile(segmLength,0.5),Inf), labels=c(1,2))
layout(matrix(c(1:4),ncol=2), widths=c(1,4), heights=c(4,1))
par(oma=c(0,0,0,0))
par(mar=c(4,0,1,1))
d = density(segmRange[which(segmLengthCut==2)], from=min(segmRange), to=max(segmRange), n=10000)
plot(-d$y, d$x, frame.plot=FALSE, xaxt="n",yaxt="n", type="p", col="black", xlab="",ylab="", pch=19, cex=0.25)
d = density(segmRange[which(segmLengthCut==1)], from=min(segmRange), to=max(segmRange), n=10000)
points(-d$y, d$x, col="grey", pch=19, cex=0.25)
par(mar=c(1,0,1,1))
plot.new()
legend("topleft",c("Short segments","","Long segments"), col=c("Grey","White","Black"),lwd=4, box.col="white")
par(mar=c(4,4,1,1))
plot(segmDomain, segmRange, col=c("#88888888","#000000aa")[segmLengthCut],
frame.plot=FALSE, xlab="Segment domain", ylab="Segment range", cex=1, pch=19)
abline(h=quantile(segmRange,0.75))
abline(v=quantile(segmDomain,0.75))
par(mar=c(0,4,1,1))
d = density(segmDomain[which(segmLengthCut==1)], from=min(segmDomain), to=max(segmDomain), n=10000)
plot(d$x, d$y, frame.plot=FALSE, xaxt="n",yaxt="n", type="p", xlab="",ylab="", col="grey", pch=19, cex=0.25)
d = density(segmDomain[which(segmLengthCut==2)], from=min(segmDomain), to=max(segmDomain), n=10000)
points(d, col="black", pch=19, cex=0.25)
dev.off()
}
if (.folderExists(out.dir)) {
jpeg(out.jpg,2000,1600)
cex = 2
} else {
cex = 1
}
par(mfrow=c(1,2))
par(oma=c(0,0,0,0))
par(mar=c(4,5,1,1))
qu = quantile(rcount.prof, c(0.001,0.999))
wh = which(rcount.prof>qu[1] & rcount.prof<qu[2])
plot(density(rcount.prof[wh],n=5000,adjust=1/summ), col="#ff6600ff", xlim=quantile(rcount.prof, c(0.01,0.99)), main="",
xlab="Normalized ratio", frame.plot=FALSE,cex.axis=cex,cex.lab=cex, lwd=2)
if (resolution == 1) {
lines(density(classic.rcount.prof[wh],n=5000), col="#0000FFFF", lwd=2)
}
xlab = ifelse(use.normal, "Window counts (normal)", "GC content")
suppressMessages(suppressWarnings(
smoothScatter(colramp=colorRampPalette(c("white","black")),
x,y, col="#00000022", cex=1, pch=19, xlab=xlab, ylab="Window counts (tumour)",
xlim=quantile(x, c(0.001,0.999)), ylim=quantile(y, c(0.01,0.99)), cex.axis=cex, cex.lab=cex)
))
if (resolution < 1) {
ord = order(x.full)[sel]
lines(x.full[ord], lomoFitted[ord], pch=19, col="#FF6600FF", lwd=3)
} else {
ord = order(x)
lines(x[ord], lomoFitted[ord], pch=19, col="#FF6600FF", lwd=3)
lines(x[ord], classicLomo$fitted[ord], pch=19, col="#0000FFFF", lwd=3)
legend("topright",c("Standard regression","","seqnorm regression"),
col=c("#0000FFFF","#FFFFFF00","#FF6600FF"),lwd=4, box.col="#00000000", cex=cex*1.2)
}
if (.folderExists(out.dir)) dev.off()
}
ret[xy.a] = rcount.prof
ret
}
##################################################################
### AUXILIARY FUNCTIONS
##################################################################
.folderExists = function(x) {
if (is.null(x)) return(0)
if (is.na(x)) return(0)
x = gsub("[\\/]*$","",x)
if (!file.exists(x)) return(0)
ret = 1
wd = getwd()
try.wd = try(setwd(wd), silent=TRUE)
if (class(try.wd) == "try-error")
ret = 0
else
setwd(wd)
ret
}
.outputExists = function(x) length(list.files(path=x, pattern="^seqsumm_out.txt$")) > 0
.resample = function(info, new.win, nproc=2) {
new.win = new.win*1000
win = diff(info$win.start[1:2])
if (new.win %% win != 0 || new.win < win)
stop(paste("New window size should be an exact multiple of ",win/1000,".",sep=""))
m.factor = new.win / win
info$reads.gc[info$reads.gc==0] = NA
info$reads.mapq[info$reads.mapq==0] = NA
chrs = rle(as.character(info$chrom))$values
cluster = makeCluster(rep("localhost",nproc), type="SOCK")
registerDoSNOW(cluster)
chr = 0
ss = foreach(chr=chrs) %dopar% {
s = list()
chr.info = info[info$chrom==chr, ]
l = ceiling(nrow(chr.info)/m.factor)
spls = lapply(c(3:ncol(info)), function(i) split(chr.info[,i], rep(c(1:l),rep(m.factor,l))[1:nrow(chr.info)]))
for (i in 1:2) # weighted average for GC content and mapping quality
s[[i]] = sapply(c(1:length(spls[[i]])), function(j)
sum(spls[[i]][[j]] * spls[[3]][[j]], na.rm=TRUE) / sum(spls[[3]][[j]], na.rm=TRUE))
for (i in 3:(ncol(info)-2)) # sum for reads
s[[i]] = sapply(c(1:length(spls[[i]])), function(j)
sum(spls[[i]][[j]], na.rm=TRUE))
do.call(cbind, s)
}
stopCluster(cluster)
dat = do.call(rbind, ss)
l = sapply(ss, nrow)
ch = rep(chrs, l)
win.st = unlist(lapply(l, function(i) c(0:(i-1))*new.win))
newinfo = data.frame(chrom=ch, win.start=win.st, reads.gc=dat[,1], reads.mapq=dat[,2], stringsAsFactors=FALSE)
for (i in 3:ncol(dat))
newinfo = cbind(newinfo, dat[,i])
colnames(newinfo) = colnames(info)
newinfo
}
.makeFormula = function(xn, yn, method, use.normal, env) {
formulae = paste(yn,"~",xn)
if (method != "loess") formulae = paste(formulae, "+I(",xn,"^2)+I(",xn,"^3)")
if (method == "quadratic") formulae = paste(formulae, "+I(",xn,"^4)")
if (use.normal) formulae = paste(formulae, "-1")
as.formula(formulae, env=env)
}
.buildGenomeInfo = function(info, info_len, chrs, win, nproc=2) {
csum = cumsum(ceiling(info_len$length/1000))
names(csum) = info_len$chr
en = csum[chrs]
st = c(1, en[1:(length(en)-1)]+1)
names(st) = names(en)
cluster = makeCluster(rep("localhost",nproc), type="SOCK")
registerDoSNOW(cluster)
chr = 0
ss = foreach(chr=chrs) %dopar% {
s = list()
chr.info = info[st[chr]:en[chr], ]
l = ceiling(nrow(chr.info)/win)
for (i in 1:3) {
if (win == 1) {
s[[i]] = chr.info[,i]
} else {
spl = split(chr.info[,i], rep(c(1:l),rep(win,l))[1:nrow(chr.info)])
s[[i]] = sapply(spl, mean, na.rm=T)
}
}
do.call(cbind, s)
}
stopCluster(cluster)
dat = do.call(rbind, ss)
l = sapply(ss, nrow)
ch = rep(chrs, l)
win.st = unlist(lapply(l, function(i) c(0:(i-1))*win))
newinfo = cbind(ch, win.st, dat)
colnames(newinfo) = c("Chr", "Start", colnames(info))
newinfo = as.data.frame(newinfo)
for (i in 3:5)
newinfo[,i] = as.numeric(as.character(newinfo[,i]))
newinfo
}
.autoTrim = function(rc, from=0.98, by=0.0005, nproc=2) {
to = 1 - 10 / length(rc)
quants = seq(from,to,by)
cluster = makeCluster(rep("localhost",nproc), type="SOCK")
registerDoSNOW(cluster)
chunk.len = ceiling(length(quants)/nproc)
n = 0
ws = foreach(n=1:nproc, .packages="adehabitatLT") %dopar% {
w = c()
for (i in c(1:chunk.len)+(n-1)*chunk.len)
if (i <= length(quants))
w = c(w, wawotest(rc > quantile(rc, quants[i]))[1])
w
}
stopCluster(cluster)
ws = unlist(ws)
names(ws) = quants
ws.smoothed = loess(ws~quants, span=0.5)$fitted
whm = which.max(ws.smoothed)
rang = whm:length(ws.smoothed)
ws.smoothed = ws.smoothed[rang]
y = ws.smoothed - min(ws.smoothed)
y = y / max(y)
x = quantile(rc,quants)[rang]
x = x - min(x)
x = x / max(x)
last.valley = length(y) - tail(rle(diff(y)<0)$lengths,1)
sel = last.valley : length(y)
if (all(diff(y[sel])>=0))
sep = length(quants)
else
sep = which.min(x[sel]+y[sel]) + whm-1 + last.valley-1
trim = quants[sep]
trim
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.