R/seqCNA_functions.r
In seqCNA: Copy number analysis of high-throughput sequencing cancer data

Documented in applyFilters applyThresholds plotCNProfile readSeqsumm runGLAD runSeqnorm runSeqsumm writeCNProfile

##################################################################
### MAIN FUNCTIONS
##################################################################
runSeqsumm = function(summ.win=50, file=NULL, folder=NULL, output.file="seqsumm_out.txt", samtools.path="samtools") {
	if (! is.null(file)) {
		if (! file.exists(file)) {
			stop("File does not exist.")
		} else {
			folder = dirname(file)
			filename = basename(file)
			file.type = toupper(gsub(".*[.]","",filename))
			if (! file.type%in%c("SAM","BAM"))
				stop("File should have either SAM or BAM extension.")
		}
	} else {
		if (! .folderExists(folder)) 
			stop("Folder does not exist.")
	}
	wd = getwd()
	setwd(folder)
	if (length(list.files(pattern="^seqsumm_out.txt$")) > 0)
		warning("A seqsumm output file already exists in the folder and will be overwritten.", immediate.=TRUE)
	if (is.null(file)) {
		filename = list.files(pattern=".sam")[1]
		if (is.na(filename)) {
			file.type = "BAM"
			filename = list.files(pattern=".bam")[1]
			if (is.na(filename)) {
				setwd(wd)
				stop("There are not any SAM or BAM files in the specified folder.")
			}
		} else {
			file.type = "SAM"
		}
	}
	cmd = ifelse(file.type=="BAM", paste(samtools.path," view ",filename,sep=""), paste("cat",filename))
	message(paste("Reading records from", filename))
	.C('seqsumm', as.integer(summ.win*1000), cmd, output.file)
	setwd(wd)
}

readSeqsumm = function(build="", tumour.data=NULL, normal.data=NULL, folder=NULL, normal.folder=NULL, resample.win=NULL, sex=TRUE, nproc=2) {
	.readSeqsumm = function(tab, columns=c(5:ncol(tab))) {
		if (ncol(tab) < 5)
			stop("The seqsumm file has a wrong format or is corrupt.")
		rco = list()
		m = min(columns)
		for (chr in chrs) {
			chrtab = tab[tab$chrom==chr,]
			for (i in columns) {
				chrtabti = chrtab[,i]
				chrtabti = chrtabti[1:lengths[chr]]  # make them of same length as the chromosome
				chrtabti[is.na(chrtabti)] = 0  # fill sample windows with 0s if necessary
				if (length(rco) < i-m+1)
					rco[[i-m+1]] = chrtabti
				else
					rco[[i-m+1]] = c(rco[[i-m+1]], chrtabti)
			}
		}
		rco
	}
	# check if build is present and supported by annotation
	if (length(build)==0)
		build = ""
	supported = FALSE
	if (! is.null(build))
		if (! is.na(build) && nchar(build)>0)
			if (build %in% supported.builds())
				supported = TRUE
	if (! supported)
		message("Note: chromosome lengths will be estimated from sample.")
	message("Reading summarized data...")
	flush.console()
	# read tumoural data
	if (! is.null(tumour.data)) {
		tab.tumour = as.data.frame(tumour.data)
	} else {
		if (.folderExists(folder)) {
			if (.outputExists(folder))
				tab.tumour = read.table(file.path(folder, "seqsumm_out.txt"), header=TRUE, comment.char="")
			else
				stop("No seqsumm output exists in the folder for the tumoural sample.")
		} else {
			stop("The folder for the tumoural sample does not exist.")
		}
	}
	# read normal data if applicable
	if (! is.null(normal.data)) {
		tab.normal = as.data.frame(normal.data)
	} else {
		tab.normal = NULL
		if (length(normal.folder) > 0 && !.folderExists(normal.folder))
			warning("The folder for the normal sample does not exist. Only the tumoural sample will be read.")
		if (.folderExists(normal.folder)) 
			if (.outputExists(normal.folder))
				tab.normal = read.table(file.path(normal.folder, "seqsumm_out.txt"), header=TRUE, comment.char="")
	}
	# perform downsampling if requested
	if (!is.null(resample.win)) {
		message("Resampling summarized data...")
		flush.console()
		tab.tumour = .resample(tab.tumour, resample.win, nproc)
		if(!is.null(tab.normal))
			tab.normal = .resample(tab.normal, resample.win, nproc)
	}
	# calculate window size
	win = diff(tab.tumour$win.start[1:2])/1000
	if (win == 0)
		stop("Window size is zero.")
	tab.tumour$chrom = as.character(tab.tumour$chrom)
	chrs = as.character(rle(tab.tumour$chrom)$values)
	# remove sex chromosomes if requested
	if (! sex) {
		if (build %in% c("hg17","hg18","hg19","hg20"))
			sex.chrs = c("X","chrX","Y","chrY","23","chr23","24","chr24")
		else
			sex.chrs = c("X","chrX","Y","chrY")
		wh = which(chrs%in%sex.chrs)
		if (length(wh) > 0) {
			chrs = chrs[-wh]
			tab.tumour = tab.tumour[-which(tab.tumour$chrom%in%sex.chrs),]
			if (! is.null(tab.normal))
				tab.normal = tab.normal[-which(tab.normal$chrom%in%sex.chrs),]
		}
	}
	# generate chromosome window indices
	if (supported) {
		info = paste(build,"_len",sep="")
		data(list=info)
		info.chrs = as.character(as.vector(get(info)[,1]))
		info.lengths = as.integer(as.vector(get(info)[,2]))
	} else {
		info.chrs = chrs
		info.lengths = sapply(info.chrs, function(i) max(tab.tumour$win.start[tab.tumour$chrom==i])+win*1000)
	}
	lengths = sapply(chrs, function(i) ceiling(info.lengths[info.chrs==i]/(win*1000)))
	names(lengths) = chrs
	all.chrs = rep(chrs, lengths)
	all.pos = unlist(lapply(chrs, function(i) c(1:lengths[i])))
	# create SeqCNAInfo object from input and calculated information
	rcount.tumour = .readSeqsumm(tab.tumour)
	if (!is.null(tab.normal)) {
		rcount.normal = .readSeqsumm(tab.normal)
		reads.gc = .readSeqsumm(tab.normal,3)[[1]]
		reads.mapq = .readSeqsumm(tab.normal,4)[[1]]
	} else {
		rcount.normal = list()
		reads.gc = .readSeqsumm(tab.tumour,3)[[1]]
		reads.mapq = .readSeqsumm(tab.tumour,4)[[1]]
	}
	rco = new("SeqCNAInfo", tumour=rcount.tumour, normal=rcount.normal, gc=reads.gc, mapq=reads.mapq,
		seq=all.chrs, pos=all.pos, build=build, win=win)
}

applyFilters = function(rco, pem.filter=0, trim.filter=0, mapp.filter=0, mapq.filter=0, cnv.filter=FALSE, plots=TRUE, folder=NULL, nproc=2) {
	stop.text = "Run readSeqsumm first!"
	if (length(rco@tumour) == 0) stop(stop.text)
	if (length(rco@seq) == 0) stop(stop.text)
	if (length(rco@pos) == 0) stop(stop.text)
	if (length(rco@win) == 0) stop(stop.text)
	use.normal = length(rco@normal)!=0
	chrs = rle(rco@seq)$values
	if (is.null(rco@build)) {
		supported = FALSE
	} else {
		if (is.na(rco@build) || nchar(rco@build)==0)
			supported = FALSE
		else
			supported = rco@build%in%supported.builds()
	}
	which.cnv = which.mapp = which.mapq = c()
	which.filter = which.imp = which.q = which.trim = c()
	if (supported) {
		if (!use.normal || mapp.filter>0 || cnv.filter) {
			message("Loading genome build information...")
			data(list=rco@build)
			data(list=paste(rco@build,"_len",sep=""))
			genome.data = .buildGenomeInfo(get(rco@build), get(paste(rco@build,"_len",sep="")), chrs, rco@win, nproc=nproc)
		}
		if (use.normal) {
			x = rco@normal[[1]]
		} else {
			x = c()
			for (chr in chrs)
				x = c(x, as.numeric(as.vector(genome.data$GC[which(genome.data$Chr==chr)])))
			x[x==-1] = NA # if necessary, GC windows with code -1 recoded to NAs
		}
		# FILTER WINDOWS WITH CNVs or low mappability
		if (cnv.filter) {
			if (use.normal) {
				cnv.filter = FALSE
				message("Note: CNV filter not meaningful with paired-normal. Disabled.")
			} else {
				which.cnv = which(genome.data$CNV>0.95)
			}
		}
		if (mapp.filter)
			which.mapp = which(genome.data$Mapp<mapp.filter)
	} else { # avoid CNV filter, and estimate GC content and mappability
		if (use.normal) {
			x = rco@normal[[1]]
		} else {
			x = rco@gc
			x[x<=0] = NA
			x[x>=1] = NA
			message("Note: GC content will be estimated from sample.")
		}
		if (cnv.filter) {
			message("Note: the genome build does not support CNV-based filtering.")
			cnv.filter = FALSE
		}
		if (mapp.filter) {
			message("Note: the genome build does not support mappability-based filtering.")
			mapp.filter = 0
		}
	}
	y = rco@tumour[[1]]

	# (PRE-)NORMALIZE THE PROFILE FOR BETTER FILTER ASSESSMENT AND IMPROVED TRIM-BASED FILTERING
	if (! use.normal) { # pre-normalize tumoural against GC
		pre.x = x
		pre.y = y
	} else { # or normal against GC
		pre.x = rco@gc
		pre.y = x
	}
	method = "quadratic"
	reg.method = ifelse(method=="loess", "loess", "lm")
	which.ok = intersect(intersect(which(pre.y>0),which(!is.na(pre.y))),which(!is.na(pre.x)))
	x.pos = pre.x[which.ok]
	y.pos = pre.y[which.ok]
	lomo = eval(call(reg.method, .makeFormula("x.pos","y.pos",method,use.normal,environment())))
	y.norm = rep(NA, length(y))
	z.pos = lomo$residuals / lomo$fitted
	y.norm[which.ok] = z.pos
	# FILTER WINDOWS WITH LOW MEAN MAPPING QUALITY
	if (mapq.filter)
		which.mapq = intersect(which(rco@mapq<mapq.filter), which.ok)
	which.filter = unique(c(which.mapq, which.cnv, which.mapp))
	# FILTER WINDOWS WITH HIGH RATIO OF IMPROPER PAIR-END MAPPINGS IN THE NORMAL (TUMOURAL) SAMPLE,
	# A SIGN OF REPETITIVE SEQUENCES, MOSTLY HAPPENING AROUND THE CENTROMERIC REGIONS
	pem.allowed = (use.normal && length(rco@normal) > 1) || length(rco@tumour) > 1
	if (pem.filter && !pem.allowed)
		message("Note: PEM filter will not be applied due to lack of PEM information.")
	message("Applying filters...")
	do.pem.filter = pem.filter && pem.allowed
	if (do.pem.filter) {
		types = c(2,4,5) # type 3 reads are non-existent in some alignments, so better not use them 
		if (use.normal)
			improper = Reduce('+',(lapply(types,function(i)rco@normal[[i]])))/rco@normal[[1]]
		else
			improper = Reduce('+',(lapply(types,function(i)rco@tumour[[i]])))/rco@tumour[[1]]
		thr = quantile(improper[is.finite(improper)], probs=1-pem.filter)
		which.imp = which(improper > thr)
		which.filter = union(which.filter, which.imp)
	} else {
		pem.filter = 0
	}
	# FILTER WINDOWS WITH EXTREME COUNTS, PRONE TO RESULT IN GREATER NOISE IN THE RATIO PROFILE
	if (trim.filter[1]) {
		if (use.normal && trim.filter[1]==1) {
			trim.filter[1] = 0
			message("Automatic trimming is only available without normal sample. Disabled.")
		}
		trim.val = ifelse (!use.normal&&trim.filter[1]==1, .autoTrim(z.pos,nproc=nproc), 1-trim.filter[1])
		trim.val[2] = ifelse (length(trim.filter) > 1, trim.filter[2], 1-trim.val[1])
		quants = quantile(z.pos, rev(trim.val))
		which.trim = which(y.norm<quants[1] | y.norm>quants[2])
		if (length(which.ok)>0)
			which.trim = union(which.trim, c(1:length(y))[-which.ok])
		which.q = intersect(which.trim, which.ok)
		which.filter = union(which.filter, which.trim)
	}
	which.nok = c(1:length(y))[-which.ok]
	# OUTPUT FILTERING RESULTS
	zeroes.len = length(y)-length(z.pos)
	message("  Windows without reads or info: ",appendLF=FALSE)
	message(length(which.nok))
	message("  Total filtered windows:        ",appendLF=FALSE)
	message(length(union(which.filter,which.nok)))
	message("  Remaining windows:             ",appendLF=FALSE)
	message(length(y)-length(union(which.filter,which.nok)))
	flush.console()
	if (plots) {
		mask  = c(trim.filter[1]>0, mapp.filter>0, mapq.filter>0, pem.filter>0, cnv.filter)
		l = list(which.trim, which.mapp, which.mapq, which.imp, which.cnv)[mask]
		ll = length(unlist(l))
		lf = length(which(mask==TRUE))
		nrows = ifelse(lf>3,2,1)
		ncols = ceiling(lf/nrows)
		if (ll > 0) {
			message("Plotting filtering...")
			flush.console()
			all.cols = c("#9E2862FF","#FF7400FF","#FF00BBFF","#006388FF","#92EC00FF")
			cols = all.cols[mask]
			pchs = c(1:5)[mask]
			if (.folderExists(folder)) {
				jpeg(file.path(folder,"seqCNA_filters.jpg"),1920,1080)
			} else {
				if (lf<4) m = t(matrix(c(1,1,rep(1,ncols),2,2,c(3:(lf+2))),ncol=2))
				else m = t(matrix(c(1,1,1,1,2,2,3,4,2,2,5,6),ncol=3))
				layout(m)
			}
			# profile with filtered windows
			par(oma=c(0,0,0,0))
			old.par = par(mar=c(5,4,4,0))
			if (trim.filter[1]) {
				z = y.norm
				yl = quantile(z.pos[is.finite(z.pos)], c(trim.val[2]/2, 1-(1-trim.val[1])/2))
				if (!use.normal) ylab = "Pre-normalized tumoural profile"
				else ylab = "Normalized normal profile"
			} else {
				z = y
				yl = c(0,quantile(y, 0.9995))
				ylab = "Raw counts"
			}
			binsX = 500
			bandX = ceiling(length(z) / binsX)
			binsY = 200
			bandY = diff(quantile(z[is.finite(z)],c(0.01,0.99))) / binsY
			suppressMessages(suppressWarnings(
			smoothScatter(postPlotHook=NULL, colramp=colorRampPalette(c("white","black")),
				bandwidth=c(bandX,bandY), nbin=c(binsX,binsY),
				z, cex=0.5, pch=19, col="#00000033", ylim=yl, cex.axis=1.5,cex.lab=1.5, ylab=ylab, frame.plot=FALSE)
			))
			for (i in 1:length(l))
				points(l[[i]], z[l[[i]]], cex=1, pch=pchs[i], col=cols[i])
			if (.folderExists(folder))
				dev.off()
			par(mar=old.par)
			# Venn diagrams
			if (lf > 1) {
				venn.mask = c(pem.filter>0, mapp.filter>0, mapq.filter>0, cnv.filter, trim.filter[1]>0)
				venn.l = list(which.imp,intersect(which.mapp,which.ok),intersect(which.mapq,which.ok),which.cnv,which.q)[venn.mask]
				names(venn.l) = c("PEM","mapp.","quality","CNV","trimmed")[venn.mask]
				venn.cols = c("#006388FF","#FF7400FF","#FF00BBFF","#92EC00FF","#9E2862FF")[venn.mask]
				if (.folderExists(folder))
					jpeg(file.path(folder,"seqCNA_filters_overlap.jpg"),1920,1080)
				counts = list(sapply(.overLapper(setlist=venn.l, sep="_", type="vennsets")$Venn_List, length))
				old.par = par(mar=rep(0,4))
				dummy = .vennPlot(counts=counts, lines=venn.cols, lcol=venn.cols, mymain="", mysub="", ccex=1.25)
				par(mar=old.par)
				if (.folderExists(folder))
					dev.off()
				if (.folderExists(folder)) {
					jpeg(file.path(folder,"seqCNA_filters_thresholds.jpg"),1920,1080)
					par(mfrow=c(nrows,ncols))
				}
			}
			# per-filter plots and thresholds
			if (mask[1]) {
				d = density(y.norm[y.norm<quants[2]*1.5 & y.norm>quants[1]*1.5],na.rm=TRUE)
				plot(Inf, xlim=c(min(d$x),max(d$x)), ylim=c(0,max(d$y)),
					xlab=ylab,ylab="Density",main="Trimming filter",cex.lab=1.5,cex.axis=1.5)
				rect(quants[1], -max(d$y)*2, quants[2], max(d$y)*2, col=gsub("FF$","44",all.cols[1]), lty=0)
				abline(h=0, col="grey")
				lines(d, lwd=2)
				abline(v=quants, col=all.cols[1], lwd=2)
				text(quants[1],max(d$y),round(quants[1],2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[1])
				text(quants[2],max(d$y),round(quants[2],2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[1])
			}
			if (mask[2]) {
				d = density(genome.data$Mapp,na.rm=TRUE)
				plot(Inf, xlim=c(0,1), ylim=c(0,max(d$y)),
					xlab="Mappability",ylab="Density",main="Mappability filter",cex.lab=1.5,cex.axis=1.5)
				rect(mapp.filter, -max(d$y)*2, 2, max(d$y)*2, col=gsub("FF$","44",all.cols[2]), lty=0)
				abline(h=0, col="grey")
				lines(d, lwd=2)
				abline(v=mapp.filter, col=all.cols[2], lwd=2)
				text(mapp.filter,max(d$y),mapp.filter,adj=c(-0.25,0.75), cex=1.5, col=all.cols[2])
			}
			if (mask[3]) {
				d = density(rco@mapq,na.rm=TRUE)
				plot(Inf, xlim=c(0,max(d$x)), ylim=c(0,max(d$y)),
					xlab="Mean mapping quality",ylab="Density",main="Mapping quality filter",cex.lab=1.5,cex.axis=1.5)
				rect(mapq.filter, -max(d$y)*2, max(d$x)*2, max(d$y)*2, col=gsub("FF$","44",all.cols[3]), lty=0)
				abline(h=0, col="grey")
				lines(d, lwd=2)
				abline(v=mapq.filter, col=all.cols[3], lwd=2)
				text(mapq.filter,max(d$y),mapq.filter,adj=c(-0.25,0.75), cex=1.5, col=all.cols[3])
			}
			if (mask[4]) {
				d = density(improper[improper<thr*1.5],na.rm=TRUE)
				plot(Inf, xlim=c(0,max(d$x)), ylim=c(0,max(d$y)),
					xlab="Improper/proper read ratio",ylab="Density",main="PEM filter",cex.lab=1.5,cex.axis=1.5)
				rect(-1, -max(d$y)*2, thr, max(d$y)*2, col=gsub("FF$","44",all.cols[4]), lty=0)
				abline(h=0, col="grey")
				lines(d, lwd=2)
				abline(v=thr, col=all.cols[4], lwd=2)
				text(thr,max(d$y),round(thr,2),adj=c(-0.25,0.75), cex=1.5, col=all.cols[4])
			}
			if (.folderExists(folder)) dev.off()
		} else {
			message("There are no filtered windows to plot.")
		}
	}
	rco@y = y
	rco@x = x
	rco@skip = as.numeric(which.filter)
	rco
}

runSeqnorm = function(rco, norm.win=NULL, method="quadratic", lambdabreak=8, minSeg=7, maxSeg=35, nproc=2, plots=TRUE, folder=NULL) {
	if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
	if (length(rco@y) == 0) stop("Run applyFilters first!")
	if (length(rco@x) == 0) stop("Run applyFilters first!")
	if (!is.null(norm.win)) {
		if ((norm.win/rco@win) %% 1 != 0)
			message("Note: normalization window will be approximated to the closest multiple of the summarization window.")
	} else {
		if (length(rco@y) > 200000) {
			f = round(length(rco@y) / 50000)
			norm.win = rco@win * f
		} else {
			norm.win = rco@win
		}
	}
	rcount.prof = .seqnorm(rco@x, rco@y, chr=rco@seq, skip=rco@skip, use.normal=length(rco@normal)!=0,
			resolution=rco@win/norm.win, nproc=nproc,method=method, out.dir=folder, 
			out.jpg=file.path(folder,"seqCNA_normalization.jpg"), plots=plots,
			lambdabreak=lambdabreak, minRegions=minSeg, maxRegions=maxSeg)
	profiles = cbind(rco@seq, (rco@pos-1)*rco@win, round(rcount.prof,4))
	colnames(profiles) = c("chrom", "win.start", "normalized")
	rco@output = data.frame(profiles)
	rco@output$normalized = as.numeric(as.vector(rco@output$normalized))
	rco
}

runGLAD = function(rco, lambdabreak=8, nproc=2) {
	if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
	if (length(rco@y) == 0) stop("Run applyFilters first!")
	if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
	message("Running segmentation...")
	flush.console()
	which.ok = which(!is.na(rco@output$normalized))
	chrs = rle(as.vector(rco@output$chrom))$values
	ch = 0
	all.prof = rco@output[, c(3,1)]
	cluster = makeCluster(rep("localhost",nproc), type="SOCK") 
	registerDoSNOW(cluster)
	res = foreach(ch=chrs, .packages="GLAD") %dopar% {
		prof = all.prof[which(all.prof$chrom==ch), ]
		input = cbind(c(1:nrow(prof)),prof)
		colnames(input) = c("PosOrder","LogRatio","Chromosome")
		profileCGH = as.profileCGH(data.frame(input))
		res = glad(profileCGH, verbose=FALSE, bandwidth=1, lambdabreak=lambdabreak)$profileValues$Smoothing
	}
	stopCluster(cluster)
	res = unlist(res)
	rcount.prof.segm = rep(NA, nrow(rco@output))
	rcount.prof.segm[which.ok] = res
	rco@output$segmented = rcount.prof.segm
	rco
}

applyThresholds = function(rco, thresholds, min.CN) {
	if (length(rco@tumour) == 0) stop("Run readSeqsumm first!")
	if (length(rco@y) == 0) stop("Run applyFilters first!")
	if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
	if (ncol(rco@output) < 4) stop("Run runGLAD first!")
	cu = cut(rco@output$segmented, breaks=c(-Inf,thresholds,Inf), labels=min.CN+c(0:length(thresholds)))
	rco@output$CN = cu
	rco@thr = thresholds
	rco@minCN = min.CN
	rco
}

writeCNProfile = function(rco, folder) {
	if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
	write.table(rco@output, file=paste(folder,"/seqCNA_out.txt",sep=""), quote=FALSE,row.names=FALSE,sep="\t")
}

plotCNProfile = function(rco, folder=NULL) {
	if (ncol(rco@output) == 0) stop("Run runSeqnorm first!")
	profiles = rco@output
	if (.folderExists(folder)) {
		jpeg(file.path(folder,"seqCNA_out.jpg"),1920,1080)
		cex = 2.5
	} else {
		cex = 1.4
	}
	par(mfrow=c(1,1))
	par(mar=c(5,4,4,0.5))
	par(oma=c(0,0,0,0))
	binsX = 500
	bandX = ceiling(length(profiles$normalized) / binsX)
	binsY = 200
	bandY = diff(quantile(profiles$normalized[is.finite(profiles$normalized)],c(0.01,0.99))) / binsY
	suppressMessages(suppressWarnings(
	smoothScatter(postPlotHook=NULL, colramp=colorRampPalette(c("white","black")),
		bandwidth=c(bandX,bandY), nbin=c(binsX,binsY),
		profiles$normalized, cex=0.5, pch=19, col="#00000055",
		ylim=quantile(profiles$normalized[is.finite(profiles$normalized)],probs=c(0.005,0.995)),
		xlab="Genomic window index", ylab="Normalized ratio", frame.plot=FALSE,cex.axis=1.5,cex.lab=1.5)
	))
	abline(v=which(diff(as.numeric(as.factor(rco@seq)))!=0), lwd=2)
	minh = floor(min(profiles$normalized,na.rm=T))
	maxh = ceiling(max(profiles$normalized,na.rm=T))
	rang = maxh-minh
	if (rang>5) byh = 1
	else if (rang>1) byh = 0.2
	else if (rang>0.2) byh = 0.05
	else byh = 0.01
	abline(h=seq(minh,maxh,by=byh),lty=2,col="grey",lwd=2)
	if ("segmented"%in%colnames(profiles)) {		
		wh.ok = which(!is.na(profiles$segmented))
		wh.ch = wh.ok[diff(profiles$segmented[wh.ok])!=0]
		en = c(wh.ch, tail(wh.ok,1))
		st = c(1, en[1:(length(en)-1)]+1)
		vals = profiles$segmented[en]
		dummy = sapply(1:length(en), function(o) lines(c(st[o],en[o]),rep(vals[o],2),lwd=4,col="green"))
	}
	if ("CN"%in%colnames(profiles)) {
		for (cn in 0:length(rco@thr)) {
			ypos = median(profiles$segmented[profiles$CN==rco@minCN+cn], na.rm=T)
			text(0, ypos, paste("CN",rco@minCN+cn,sep=""), cex=cex, col="#ff0000", adj=1)
		}
		for (i in 1:length(rco@thr))
			abline(h=rco@thr[i], col="#ff0000", lwd=3)
	}
	if (.folderExists(folder))
		dev.off()
}

##################################################################
### SEQNORM FUNCTION
### this is the normalization function, which admits GC-based and
### paired normal modes, with different regression strategies
##################################################################
.seqnorm = function(x,y, nproc=1, resolution=1, use.normal=FALSE, skip=NULL, chr=rep(1,length(x)), 
		method=c("loess","cubic","quadratic")[3], out.dir=getwd(), out.jpg, plots=TRUE,
		lambdabreak=8, minRegions=7, maxRegions=35) {
	if (length(x) != length(y)) {
		if (! use.normal)
			stop("Tumour sample and GC content lengths differ.")
		else
			stop("Tumour and normal sample lengths differ.")
	}
	if (any(x[is.finite(x)]<0)) {
		if (! use.normal)
			stop("It seems that some GC content values are negative. Try recoding from -1 to NA in the GC file.")
		else
			stop("Hmmm... Something is not right, some read counts are negative!")
	}
	if (any(y[is.finite(y)]<0))
		stop("Hmmm... Something is not right, some read counts are negative!")
	if (nproc < 1)
		stop("Try setting nproc to a positive number. At least one processor core is needed!")
	if (resolution <= 0 || resolution >1)
		stop("resolution should be greater than 0 and as big as 1.")
	if (nproc > 64)
		warning("Warning: unless you really have a computer with more than 64 processor cores, try setting nproc to a lower number.", immediate.=TRUE)

	# work only with positive read counts and restore the rest after processing
	ret = rep(NA, length(x))
	xy.a = intersect(which(!is.na(x)), which(!is.na(y)))
	xy.a = intersect(xy.a, intersect(which(x>0),which(y>0)))
	if (! is.null(skip))
		xy.a = setdiff(xy.a, skip) # filter certain windows expected to misbehave
	x.a = x
	y.a = y
	chr.a = chr
	x = x[xy.a]
	y = y[xy.a]
	chr = chr[xy.a]
	chrtab = rle(as.vector(chr))$values
	chrlist = lapply(chrtab, function(chri) which(chr==chri))
	chrlist = chrlist[order(sapply(chrlist,head,1))]
	
	# work with bigger summarized windows if resolution < 1
	summ = 1
	if (resolution < 1) {
		chrlist.full = chrlist
		chrlist = list()
		max.ind = 0
		x.full = x
		y.full = y
		summ = floor(1/resolution)
		x = y = c()
		for (l in chrlist.full) {
			xchr = x.full[l]
			ychr = y.full[l]
			len = ceiling(length(xchr)/summ)
			x = c(x, sapply(split(xchr, rep(c(1:len),rep(summ,len))[1:length(xchr)]), mean, na.rm=T))
			y = c(y, sapply(split(ychr, rep(c(1:len),rep(summ,len))[1:length(ychr)]), mean, na.rm=T))
			chrlist[[length(chrlist)+1]] = c(1:len) + max.ind
			max.ind = tail(chrlist[[length(chrlist)]], 1)
		}
	}

	message("Creating segments...")
	flush.console()
	mins = sapply(chrlist,head,1)
	maxs = sapply(chrlist,tail,1)
	reg.method = ifelse(method=="loess", "loess", "lm")
	classicLomo = eval(call(reg.method, .makeFormula("x","y",method,use.normal,environment())))
	classic.rcount.prof = classicLomo$residuals/classicLomo$fitted
	cluster = makeCluster(rep("localhost",nproc), type="SOCK") 
	registerDoSNOW(cluster)
	i = 0
	bps = foreach(i=1:length(chrlist), .packages="GLAD") %dopar% {
		prof = classic.rcount.prof[chrlist[[i]]]
		input = cbind(c(1:length(prof)), prof, rep(1,length(prof)))
		colnames(input) = c("PosOrder","LogRatio","Chromosome")
		profileCGH = as.profileCGH(data.frame(input))
		res = glad(profileCGH, verbose=FALSE, bandwidth=1)$profileValues$Smoothing
		which(diff(res)!=0) + mins[i]-1
	}
	breaks = round(sort(c(1, unlist(bps), maxs[-length(maxs)], length(y)+1)))
	segmLength = diff(breaks)
	message("  Segments total: ", appendLF=FALSE)
	message(length(segmLength))
	flush.console()
	segmDomain = sapply(c(1:(length(breaks)-1)), function(i) { 
			Reduce("-",quantile(x[breaks[i]:(breaks[i+1]-1)],c(0.95,0.05)))
		} )
	segmRange = sapply(c(1:(length(breaks)-1)), function(i) { 
			Reduce("-",quantile(classic.rcount.prof[breaks[i]:(breaks[i+1]-1)],c(0.95,0.05)))
		} )
	# filter regions based on three different metrics
	minDomain = 0.95 # minimum span over X of a segment's middle 90% quantile (in quantiles)
	maxRange = 0.85 # maximum span over Y of a segment's middle 90% quantile (in quantiles)
	minLength = 0.85 # minimum segment length
	stepp = 0.025
	.getSegmLm = function() {
		segmLm = intersect(which(segmLength>quantile(segmLength,minLength)), which(segmRange<quantile(segmRange,maxRange)))
		intersect(segmLm, which(segmDomain>quantile(segmDomain,minDomain)))
	}
	segmLm = .getSegmLm()
	while (length(segmLm) < min(minRegions, length(breaks)-1) && maxRange < 1-stepp) {
		minDomain = minDomain - stepp
		maxRange = maxRange + stepp
		minLength = minLength - stepp
		segmLm = .getSegmLm()
	}
	while (length(segmLm) > maxRegions && minDomain < 1-stepp) {
		minDomain = minDomain + stepp
		maxRange = maxRange - stepp
		minLength = minLength + stepp
		segmLm = .getSegmLm()
	}
	message("  Segments passing QC: ", appendLF=FALSE)
	message(length(segmLm))
	flush.console()
	classic.rcount.prof = scale(classicLomo$residuals/classicLomo$fitted)
	if (length(segmLm) < min(minRegions, length(breaks)-1)) {
		message("  Not enough segments, falling back to typical regression.")
		flush.console()
		if (resolution < 1)
			ret[xy.a] = y / predict(classicLomo, data.frame(x=x))
		else
			ret[xy.a] = classic.rcount.prof
		return(ret)
	}
	
	message("Fitting regression models...")
	flush.console()
	if (resolution < 1) sel = seq(1,length(x.full),l=100000)
	xminwh = which.min(x)
	xmin = x[xminwh]
	ymin = y[xminwh]
	xmaxwh = which.max(x)
	xmax = x[xmaxwh]
	ymax = y[xmaxwh]
	fit = foreach(i=c(1:(length(breaks)-1))[segmLm], .export=c(".makeFormula")) %dopar% {
		rang = breaks[i]:(breaks[i+1]-1)
		xLm = x[rang]
		yLm = y[rang]
		if (reg.method=="loess") {
			xLm = c(xLm, xmin,xmax)
			yLm = c(yLm, ymin,ymax)
		}
		lomo = eval(call(reg.method, .makeFormula("xLm","yLm",method,use.normal,environment())))
		if (resolution < 1) p = x.full[sel]
		else p = x
		predict(lomo, data.frame("xLm"=p))
	}
	fit = do.call(rbind, fit)
	# subtract shift to ref regression
	baseLomo = median(classicLomo$fitted)
	fit = t(apply(fit, 1, function(i) i / (median(i)/baseLomo))) 
	# for each point, merge sub-regression estimations
	chunk.len = ceiling(ncol(fit)/nproc)
	n = 0
	medianFit = foreach(n=1:nproc) %dopar% {
		fits = c()
		for (i in c(1:chunk.len)+(n-1)*chunk.len)
			if (i <= ncol(fit))
				fits = c(fits, median(fit[,i]))
		fits
	}
	stopCluster(cluster)
	medianFit = unlist(medianFit)
	if (resolution < 1) {
		xSel = x.full[sel]
		lomo = eval(call(reg.method, .makeFormula("xSel","medianFit",method,use.normal,environment())))
		lomoFitted = predict(lomo, data.frame(xSel=x.full))
		rcount.prof = y.full / lomoFitted
	} else {
		lomo = eval(call(reg.method, .makeFormula("x","medianFit",method,use.normal,environment())))
		lomoFitted = lomo$fitted
		rcount.prof = y / lomoFitted
	}
	rcount.prof = scale(rcount.prof)

	if (plots) {
		message("Plotting normalization...")
		flush.console()
		if (.folderExists(out.dir)) {
			jpeg(file.path(out.dir,"seqCNA_segment_QC.jpg"),1000,700)
			segmLengthCut = cut(segmLength, breaks=c(-Inf,quantile(segmLength,0.5),Inf), labels=c(1,2))
			layout(matrix(c(1:4),ncol=2), widths=c(1,4), heights=c(4,1))
			par(oma=c(0,0,0,0))
			par(mar=c(4,0,1,1))
			d = density(segmRange[which(segmLengthCut==2)], from=min(segmRange), to=max(segmRange), n=10000)
			plot(-d$y, d$x, frame.plot=FALSE, xaxt="n",yaxt="n", type="p", col="black", xlab="",ylab="", pch=19, cex=0.25)
			d = density(segmRange[which(segmLengthCut==1)], from=min(segmRange), to=max(segmRange), n=10000)
			points(-d$y, d$x, col="grey", pch=19, cex=0.25)
			par(mar=c(1,0,1,1))
			plot.new()
			legend("topleft",c("Short segments","","Long segments"), col=c("Grey","White","Black"),lwd=4, box.col="white")
			par(mar=c(4,4,1,1))
			plot(segmDomain, segmRange, col=c("#88888888","#000000aa")[segmLengthCut],
				frame.plot=FALSE, xlab="Segment domain", ylab="Segment range", cex=1, pch=19)
			abline(h=quantile(segmRange,0.75))
			abline(v=quantile(segmDomain,0.75))
			par(mar=c(0,4,1,1))
			d = density(segmDomain[which(segmLengthCut==1)], from=min(segmDomain), to=max(segmDomain), n=10000)
			plot(d$x, d$y, frame.plot=FALSE, xaxt="n",yaxt="n", type="p", xlab="",ylab="", col="grey", pch=19, cex=0.25)
			d = density(segmDomain[which(segmLengthCut==2)], from=min(segmDomain), to=max(segmDomain), n=10000)
			points(d, col="black", pch=19, cex=0.25)
			dev.off()
		}
		if (.folderExists(out.dir)) {
			jpeg(out.jpg,2000,1600)
			cex = 2
		} else {
			cex = 1
		}
		par(mfrow=c(1,2))
		par(oma=c(0,0,0,0))
		par(mar=c(4,5,1,1))
		qu = quantile(rcount.prof, c(0.001,0.999))
		wh = which(rcount.prof>qu[1] & rcount.prof<qu[2])
		plot(density(rcount.prof[wh],n=5000,adjust=1/summ), col="#ff6600ff", xlim=quantile(rcount.prof, c(0.01,0.99)), main="",
			xlab="Normalized ratio", frame.plot=FALSE,cex.axis=cex,cex.lab=cex, lwd=2)
		if (resolution == 1) {
			lines(density(classic.rcount.prof[wh],n=5000), col="#0000FFFF", lwd=2)
		}
		xlab = ifelse(use.normal, "Window counts (normal)", "GC content")
		suppressMessages(suppressWarnings(
		smoothScatter(colramp=colorRampPalette(c("white","black")),
			x,y, col="#00000022", cex=1, pch=19, xlab=xlab, ylab="Window counts (tumour)",
			xlim=quantile(x, c(0.001,0.999)), ylim=quantile(y, c(0.01,0.99)), cex.axis=cex, cex.lab=cex)
		))
		if (resolution < 1) {
			ord = order(x.full)[sel]
			lines(x.full[ord], lomoFitted[ord], pch=19, col="#FF6600FF", lwd=3)
		} else {
			ord = order(x)
			lines(x[ord], lomoFitted[ord], pch=19, col="#FF6600FF", lwd=3)
			lines(x[ord], classicLomo$fitted[ord], pch=19, col="#0000FFFF", lwd=3)
			legend("topright",c("Standard regression","","seqnorm regression"),
				col=c("#0000FFFF","#FFFFFF00","#FF6600FF"),lwd=4, box.col="#00000000", cex=cex*1.2)
		}
		if (.folderExists(out.dir))  dev.off()
	}

	ret[xy.a] = rcount.prof
	ret
}

##################################################################
### AUXILIARY FUNCTIONS
##################################################################
.folderExists = function(x) {
	if (is.null(x)) return(0)
	if (is.na(x)) return(0)
	x = gsub("[\\/]*$","",x)
	if (!file.exists(x)) return(0)
	ret = 1
	wd = getwd()
	try.wd = try(setwd(wd), silent=TRUE)
	if (class(try.wd) == "try-error")
		ret = 0
	else
		setwd(wd)
	ret
}
.outputExists = function(x) length(list.files(path=x, pattern="^seqsumm_out.txt$")) > 0
.resample = function(info, new.win, nproc=2) {
	new.win = new.win*1000
	win = diff(info$win.start[1:2])
	if (new.win %% win != 0 || new.win < win)
		stop(paste("New window size should be an exact multiple of ",win/1000,".",sep=""))
	m.factor = new.win / win
	info$reads.gc[info$reads.gc==0] = NA
	info$reads.mapq[info$reads.mapq==0] = NA
	chrs = rle(as.character(info$chrom))$values
	cluster = makeCluster(rep("localhost",nproc), type="SOCK") 
	registerDoSNOW(cluster)
	chr = 0
	ss = foreach(chr=chrs) %dopar% {
		s = list()
		chr.info = info[info$chrom==chr, ]
		l = ceiling(nrow(chr.info)/m.factor)
		spls = lapply(c(3:ncol(info)), function(i) split(chr.info[,i], rep(c(1:l),rep(m.factor,l))[1:nrow(chr.info)]))
		for (i in 1:2) # weighted average for GC content and mapping quality
			s[[i]] = sapply(c(1:length(spls[[i]])), function(j) 
				sum(spls[[i]][[j]] * spls[[3]][[j]], na.rm=TRUE) /  sum(spls[[3]][[j]], na.rm=TRUE))
		for (i in 3:(ncol(info)-2)) # sum for reads
			s[[i]] = sapply(c(1:length(spls[[i]])), function(j) 
				sum(spls[[i]][[j]], na.rm=TRUE))
		do.call(cbind, s)
	}
	stopCluster(cluster)
	dat = do.call(rbind, ss)
	l = sapply(ss, nrow)
	ch = rep(chrs, l)
	win.st = unlist(lapply(l, function(i) c(0:(i-1))*new.win))
	newinfo = data.frame(chrom=ch, win.start=win.st, reads.gc=dat[,1], reads.mapq=dat[,2], stringsAsFactors=FALSE)
	for (i in 3:ncol(dat))
		newinfo = cbind(newinfo, dat[,i])
	colnames(newinfo) = colnames(info)
	newinfo
}
.makeFormula = function(xn, yn, method, use.normal, env) {
	formulae = paste(yn,"~",xn)
	if (method != "loess") formulae = paste(formulae, "+I(",xn,"^2)+I(",xn,"^3)")
	if (method == "quadratic") formulae = paste(formulae, "+I(",xn,"^4)")
	if (use.normal) formulae = paste(formulae, "-1")
	as.formula(formulae, env=env)
}
.buildGenomeInfo = function(info, info_len, chrs, win, nproc=2) {
	csum = cumsum(ceiling(info_len$length/1000))
	names(csum) = info_len$chr
	en = csum[chrs]
	st = c(1, en[1:(length(en)-1)]+1)
	names(st) = names(en)
	cluster = makeCluster(rep("localhost",nproc), type="SOCK") 
	registerDoSNOW(cluster)
	chr = 0
	ss = foreach(chr=chrs) %dopar% {
		s = list()
		chr.info = info[st[chr]:en[chr], ]
		l = ceiling(nrow(chr.info)/win)
		for (i in 1:3) {
			if (win == 1) {
				s[[i]] = chr.info[,i]
			} else {
				spl = split(chr.info[,i], rep(c(1:l),rep(win,l))[1:nrow(chr.info)])
				s[[i]] = sapply(spl, mean, na.rm=T)
			}
		}
		do.call(cbind, s)
	}
	stopCluster(cluster)
	dat = do.call(rbind, ss)
	l = sapply(ss, nrow)
	ch = rep(chrs, l)
	win.st = unlist(lapply(l, function(i) c(0:(i-1))*win))
	newinfo = cbind(ch, win.st, dat)
	colnames(newinfo) = c("Chr", "Start", colnames(info))
	newinfo = as.data.frame(newinfo)
	for (i in 3:5)
		newinfo[,i] = as.numeric(as.character(newinfo[,i]))
	newinfo
}
.autoTrim = function(rc, from=0.98, by=0.0005, nproc=2) {
	to = 1 - 10 / length(rc)
	quants = seq(from,to,by)
	cluster = makeCluster(rep("localhost",nproc), type="SOCK") 
	registerDoSNOW(cluster)
	chunk.len = ceiling(length(quants)/nproc)
	n = 0
	ws = foreach(n=1:nproc, .packages="adehabitatLT") %dopar% {
		w = c()
		for (i in c(1:chunk.len)+(n-1)*chunk.len)
			if (i <= length(quants))
				w = c(w, wawotest(rc > quantile(rc, quants[i]))[1])
		w
	}
	stopCluster(cluster)
	ws = unlist(ws)
	names(ws) = quants
	ws.smoothed = loess(ws~quants, span=0.5)$fitted
	whm = which.max(ws.smoothed)
	rang = whm:length(ws.smoothed)
	ws.smoothed = ws.smoothed[rang]
	y = ws.smoothed - min(ws.smoothed)
	y = y / max(y)
	x = quantile(rc,quants)[rang]
	x = x - min(x)
	x = x / max(x)
	last.valley = length(y) - tail(rle(diff(y)<0)$lengths,1)
	sel = last.valley : length(y)
	if (all(diff(y[sel])>=0))
		sep = length(quants)
	else
		sep = which.min(x[sel]+y[sel]) + whm-1 + last.valley-1
	trim = quants[sep]
	trim
}
Any scripts or data that you put into this service are public.
seqCNA documentation built on Nov. 8, 2020, 7:09 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
seqCNA
Copy number analysis of high-throughput sequencing cancer data

R/seqCNA_functions.r
In seqCNA: Copy number analysis of high-throughput sequencing cancer data

Defines functions .autoTrim .buildGenomeInfo .makeFormula .resample .outputExists .folderExists .seqnorm plotCNProfile writeCNProfile applyThresholds runGLAD runSeqnorm applyFilters readSeqsumm runSeqsumm

Documented in applyFilters applyThresholds plotCNProfile readSeqsumm runGLAD runSeqnorm runSeqsumm writeCNProfile

Try the seqCNA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

seqCNA Copy number analysis of high-throughput sequencing cancer data

R/seqCNA_functions.r In seqCNA: Copy number analysis of high-throughput sequencing cancer data

Defines functions .autoTrim .buildGenomeInfo .makeFormula .resample .outputExists .folderExists .seqnorm plotCNProfile writeCNProfile applyThresholds runGLAD runSeqnorm applyFilters readSeqsumm runSeqsumm

Documented in applyFilters applyThresholds plotCNProfile readSeqsumm runGLAD runSeqnorm runSeqsumm writeCNProfile

Try the seqCNA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

seqCNA
Copy number analysis of high-throughput sequencing cancer data

R/seqCNA_functions.r
In seqCNA: Copy number analysis of high-throughput sequencing cancer data