R/utils.R

Defines functions normalizeMatrix logger.cmd.args parseMcTable.epp parseMcTable.bissnp mergeMethGr plotRepeatAlignmentStats getAlnStats getReadStatsFromSample getFlagStats explainSamFlags bitwCompl getFgColorForBg colorize.value convertPhredCharToInt getRepeatMaskerTable countUnmappedReads.bam countMappedReads.bam countMappedReads.bam.perSeq countAllReads.bam getCharVecHeadString getFileTypeFromFileName

Documented in logger.cmd.args normalizeMatrix plotRepeatAlignmentStats

#' getFileTypeFromFileName
#'
#' get the file type from a vector of file names
#'
#' @param fns		vector of filenames
#' @param ignoreZip ignore extions gz,zip,tar,tar.gz und use the main extension
#' @return a vector of file types
#'
#' @author Fabian Mueller
#' @noRd
getFileTypeFromFileName <- function(fns, ignoreZip=FALSE) {
	require(tools)
	if (ignoreZip){
		fns <- gsub("\\.(gz|zip|tar|tar\\.gz)$", "", fns)
	}
	fTypes <- file_ext(fns)
	fTypes[fTypes=="fa"] <- "fasta"
	fTypes[fTypes=="fq"] <- "fastq"
	return(fTypes)
}

#' getCharVecHeadString
#'
#' returns the head of a string vector as comma-separated string
#'
#' @param ss vector of filenames
#' @return a character with the head of the string vector separated by commas
#'
#' @author Fabian Mueller
#' @noRd
getCharVecHeadString <- function(ss, n=5){
	res <- ""
	if (length(ss) < n){
		res <- paste(ss, collapse=", ")
	} else {
		res <- paste(c(ss[1:n], "..."), collapse=", ")
	}
	return(res)
}

#' countAllReads.bam
#'
#' fast count mapped reads to all reference sequence in indexed bamfile using samtools idxstats
#'
#' @param fName		indexed mapped reads file (BAM)
#' @return the number of mapped reads across all reference contigs
#'
#' @author Fabian Mueller
#' @noRd
countAllReads.bam <- function(fName){
	cmd <- paste(.config$samtools.exec,"idxstats",fName)
	res <- NULL
	tryCatch({
			ss <- data.frame(t(data.frame(strsplit(system(cmd, intern=TRUE),"\t"))),stringsAsFactors=FALSE)
			res <- sum(as.numeric(ss[,3])) + sum(as.numeric(ss[,4]))
		},
		error = function(ee) {
			logger.error(c("Not an indexed bam file:",fName,"(",ee$message,")"))
			res <- NULL
		}
	)
	return(res)
}
#' countMappedReads.bam.perSeq
#'
#' fast count the number of mapped reads to each reference contig (and * for unmapped) in indexed bamfile using samtools idxstats
#'
#' @param fName		indexed mapped reads file (BAM)
#' @return a named vector containing the number of reads mapping to each reference contig
#'
#' @author Fabian Mueller
#' @noRd
countMappedReads.bam.perSeq <- function(fName){
	cmd <- paste(.config$samtools.exec,"idxstats",fName)
	res <- NULL
	tryCatch({
			ss <- data.frame(t(data.frame(strsplit(system(cmd, intern=TRUE),"\t"))),stringsAsFactors=FALSE)
			res <- as.numeric(ss[,3])
			names(res) <- ss[,1]
		},
		error = function(ee) {
			logger.error(c("Not an indexed bam file:",fName,"(",ee$message,")"))
			res <- NULL
		}
	)
	return(res)
}
#' countMappedReads.bam
#'
#' fast count mapped reads to all reference sequence (and the * (unmapped) contig) in indexed bamfile using samtools idxstats
#'
#' @param fName		indexed mapped reads file (BAM)
#' @return the sum of all mapped reads across all reference contigs (and the * (unmapped) contig)
#'
#' @author Fabian Mueller
#' @noRd
countMappedReads.bam <- function(fName){
	sum(countMappedReads.bam.perSeq(fName))
}
#' countUnmappedReads.bam
#'
#' fast count unmapped reads in indexed bamfile using samtools idxstats
#'
#' @param fName		indexed mapped reads file (BAM)
#' @return the number of unmapped reads
#'
#' @author Fabian Mueller
#' @noRd
countUnmappedReads.bam <- function(fName){
	cmd <- paste(.config$samtools.exec,"idxstats",fName)
	res <- NULL
	tryCatch({
			ss <- data.frame(t(data.frame(strsplit(system(cmd, intern=TRUE),"\t"))),stringsAsFactors=FALSE)
			res <- sum(as.numeric(ss[,4]))
		},
		error = function(ee) {
			logger.error(c("Not an indexed bam file:",fName,"(",ee$message,")"))
			res <- NULL
		}
	)
	return(res)
}

#' getRepeatMaskerTable
#'
#' obtain the repeat masker track for a given genome from UCSC. NOT USED YET
#'
#' @param species		species. Currently supports only "human" and "mouse"
#' @return UCSC repeat masker table
#'
#' @author Fabian Mueller
#' @noRd
getRepeatMaskerTable <- function(species=.config$species){
	require(rtracklayer)
	session <- browserSession()
	if (species=="human"){
		genome(session) <- "hg19"
		tt <- getTable(ucscTableQuery(session, track="RepeatMasker", table="rmsk"))
	} else if (species=="mouse"){
		genome(session) <- "mm10"
		tt <- getTable(ucscTableQuery(session, track="RepeatMasker", table="rmsk"))
	} else {
		stop("Unknown species")
	}
	return(tt)
}

#' convertPhredCharToInt
#'
#' convert a single character string containing Phred symbols to an integer vector
#'
#' @param ss		single character vector containing Phred symbols (offset:33)
#' @return an integer vector of Phred scores
#'
#' @author Fabian Mueller
#' @noRd
convertPhredCharToInt <- function(ss){
	exploded <- strsplit(ss,"")
	res <- lapply(exploded,FUN=function(x){
		#convert ascii char to integer and subtract 33
		unname(vapply(x,FUN=function(cc){
			strtoi(charToRaw(cc),16L)
		}, integer(1))) - 33L
	})
	return(res)
}

#' colorize.value
#'
#' transform a vector of values to color values
#'
#' @param val		numeric values to be plotted
#' @param rng		output range of colors. Values outside the bounds will be truncated
#' @param colscheme	color panel to be mapped to
#' @return a vector of color values
#'
#' @author Fabian Mueller
#' @noRd
colorize.value <- function(val, rng=c(min(val,na.rm=TRUE),max(val,na.rm=TRUE)), colscheme=colorpanel(100,"white","red")){
    ifelse(val < rng[1], plotval <- rng[1], plotval <- val)
    return(colscheme[round((plotval - rng[1]) / (rng[2] -rng[1])  * (length(colscheme)-1),0)+1])
}

#' getFgColorForBg
#'
#' get the appropriate foreground color for given background colors. "white" for dark background, "black" for light background
#'
#' @param bg		 a vector of background color values
#' @param thresh	mean rgb color intensity threshold for switching colors
#' @return a vector of foreground color values
#'
#' @author Fabian Mueller
#' @noRd
getFgColorForBg <- function(bg,thresh=100){
    col.rgb.vec <- as.vector(col2rgb(bg))
    ifelse(mean(col.rgb.vec)<thresh,return("white"),return("black"))
}

#' bitwCompl
#'
#' get the bitwise complement of an integer representation of a number, i.e. negate
#' the bitwise representation
#'
#' @param a an integer representation of a binary
#' @return integer representation of the bitwise complement
#'
#' @author Fabian Mueller
#' @noRd
bitwCompl <- function(a){
	2^ceiling(log2(a))-a-1
}

#' explainSamFlags
#'
#' Convert SAM flag integer(s) into bit representation
#'
#' @param x  (vector of) integer representation(s) of SAM flag(s)
#' @return a bit matrix containing explanations
#'
#' @author Fabian Mueller
#' @noRd
explainSamFlags <- function(x){
	xbit <- do.call("rbind", lapply(x, FUN=function(xc) {as.logical(intToBits(xc))}))
	xbit <- xbit[,1:12,drop=FALSE]
	colnames(xbit) <- c(
		"read_paired",
		"read_properPair",
		"read_unmapped",
		"mate_unmapped",
		"read_reverse",
		"mate_reverse",
		"read_read1_inpair",
		"read_read2_inpair",
		"read_secondary_alignment",
		"read_fails_qc",
		"read_duplicate",
		"read_supplementary_alignment"
	)
	return(xbit)
}

#' getFlagStats
#'
#' gets the samtools flagstat results for a given bam file.
#'
#' @param bamFn bam file
#' @return ...
#'
#' @author Fabian Mueller
#' @noRd
getFlagStats <- function(bamFn){
	require(stringr)
	selFlags <- c(
		"total"      = "in total",
		"mapped"     = "mapped",
		"paired"     = "paired in sequencing",
		"properPair" = "properly paired",
		"read1"      = "read1",
		"read2"      = "read2",
		"duplicate"  = "duplicates",
		"secondary"  = "secondary",
		"singletons" = "singletons"
	)
	parseFlagStats <- function(ss){
		ss <- str_trim(gsub("\\(.+\\)", "", ss))
		sl <- strsplit(ss," ")
		tt <- do.call("rbind", lapply(sl, FUN=function(x){
			if (x[2] != "+") stop(paste0("error in pearsing flagstats for file ",bamFn, ": second element in line is non '+'"))
			return(c(x[1], x[3], paste(x[4:length(x)], collapse=" ")))
		}))
		tt <- data.frame(
			qc_pass=as.integer(tt[,1]),
			qc_fail=as.integer(tt[,2]),
			description=tt[,3],
			stringsAsFactors=FALSE
		)
		tt$total <- tt$qc_pass + tt$qc_fail
		return(tt)
	}
	cmd <- paste(.config$samtools.exec,"flagstat",bamFn)
	res <- NULL
	tryCatch({
			res <- parseFlagStats(system(cmd, intern=TRUE))
		},
		error = function(ee) {
			logger.error(c("Error parsing flagstats for file:", bamFn, "(",ee$message,")"))
			res <- NULL
		}
	)
	selflagInDesc <- selFlags %in% res$description
	if (!all(selflagInDesc)){
		stop(paste0("The following flags were not found in the flagstats for file", bamFn, paste(names(selFlags)[!selflagInDesc],collapse=",")))
	}
	res <- res[!duplicated(res$description),c("qc_pass", "qc_fail", "total", "description")]
	rownames(res) <- res$description
	res <- res[selFlags,]
	rownames(res) <- names(selFlags)
	return(res)
}

#' getReadStatsFromSample
#'
#' sample a bam file and get read statistics from that sample
#'
#' @param bamFn bam file
#' @param frac  fraction of reads to be sampled
#' @return a list containing read statistics
#'
#' @author Fabian Mueller
#' @noRd
getReadStatsFromSample <- function(bamFn, frac=0.001){
	# tmpBamFn <- tempfile(pattern="sampledBam_", fileext=".bam")
	ss <- system2(.config$samtools.exec, c("view", "-s", frac, bamFn), stdout=TRUE)
	ss <- strsplit(ss, "\t")
	readLens <- vapply(ss, FUN=function(x){nchar(x[10])}, integer(1))
	flags <- vapply(ss, FUN=function(x){as.integer(x[2])}, integer(1))
	flagTab <- explainSamFlags(flags)
	flagRates <- colSums(flagTab)/nrow(flagTab) 
	res <- list(
		readLength.summary = summary(readLens),
		flagRates = flagRates
	)
	return(res)
}

#' getAlnStats
#'
#' Retrieve a table of alignment statistics for pair of repeat alignment and
#' backgound BAM files
#'
#' @param bamFn.aln   bam file for the repeat alignment
#' @param bamFn.unaln bam file for the background (typically the BAM file containing all reads
#'                    that is the basis of alignment)
#' @return a data.frame containing read statistics:
#' \item{totalReads}{Total number of reads in the background BAM file}
#' \item{mappedReads}{Number of reads mapped to the repeat reference}
#' \item{mappingRate}{mappedReads/totalReads}
#' \item{readLengthAlnMin}{Minimum length of reads aligned to the repeat referce}
#' \item{readLengthAlnMax}{Maximum length of reads aligned to the repeat referce}
#' \item{readLengthAlnMedian}{Median length of reads aligned to the repeat referce}
#' \item{readLengthAllMin}{Minimum length of reads in the background BAM file}
#' \item{readLengthAllMax}{Maximum length of reads in the background BAM file}
#' \item{readLengthAllMedian}{Median length of reads in the background BAM file}
#' \item{pairedRateAln}{Percentage of paired reads aligned to the repeat referce}
#' \item{pairedRateAll}{Percentage of paired reads in the background BAM file}
#'
#' @author Fabian Mueller
#' @noRd
getAlnStats <- function(bamFn.aln, bamFn.unaln){
	ra <- RepeatAlignment(bamFn.aln)
	rcl.aln <- getReadCounts(ra, useIdxStats=TRUE, addGlobalCounts=TRUE)

	rc.mapped <- attr(rcl.aln, "global")[[".mapped"]]
	rc.total <- countAllReads.bam(bamFn.unaln)

	readStats.aln <- getReadStatsFromSample(bamFn.aln)
	readStats.unaln <- getReadStatsFromSample(bamFn.unaln)
	res <- data.frame(
		totalReads=rc.total,
		mappedReads=rc.mapped,
		mappingRate=rc.mapped/rc.total,
		readLengthAlnMin    = as.numeric(readStats.aln$readLength.summary["Min."]),
		readLengthAlnMax    = as.numeric(readStats.aln$readLength.summary["Max."]),
		readLengthAlnMedian = as.numeric(readStats.aln$readLength.summary["Median"]),
		readLengthAllMin    = as.numeric(readStats.unaln$readLength.summary["Min."]),
		readLengthAllMax    = as.numeric(readStats.unaln$readLength.summary["Max."]),
		readLengthAllMedian = as.numeric(readStats.unaln$readLength.summary["Median"]),
		pairedRateAln       = readStats.aln$flagRates["read_paired"],
		pairedRateAll       = readStats.unaln$flagRates["read_paired"]
	)
	return(res)
}


#' plotRepeatAlignmentStats
#'
#' plot the repeat alignment statistic from the table output by the pipeline step "repeatAlignmentStats"
#'
#' @param x  filename for the table or data frame containing alignment statistics
#' @return nothing of particular intest
#'
#' @author Fabian Mueller
#' @export
plotRepeatAlignmentStats <- function(x){
	if (is.character(x)){
		if (file.exists(x)){
			tt <- read.table(x, sep="\t", header=TRUE, stringsAsFactors=FALSE)
		} else {
			stop("invalid alignment stats file")
		}
	} else if (is.data.frame(x)){
		tt <- x
	} else {
		stop("invalid table argument")
	}
	df2p <- tt[,c("sampleName", "mark", "dataType", "totalReads", "mappedReads", "mappingRate")]
	pp <- ggplot(df2p) + aes(x=totalReads, y=mappingRate, color=mark, shape=dataType) + geom_point() #+ geom_text(aes(label=sampleName))	
	pp
}

#' mergeMethGr
#'
#' Merge methylation and total counts from both strands and identical coordinates
#'
#' @param gr GRanges object with methylation and total read counts annotated with T and M in the metadata
#' @return a GRanges object containing summed methylation calls from both strands for identical coordinates
#'
#' @author Fabian Mueller
#' @noRd
mergeMethGr <- function(gr){
	gr <- sort(gr, ignore.strand=TRUE)
	# djb <- disjointBins(gr, ignore.strand=TRUE)
	# create GRanges object with unique coordinates
	gr.dj <- gr
	strand(gr.dj) <- "*"
	elementMetadata(gr.dj) <- NULL
	gr.dj <- unique(gr.dj)
	oo <- findOverlaps(gr.dj, gr, type="equal", ignore.strand=TRUE)
	tt <- elementMetadata(gr)
	tt <- tt[subjectHits(oo),]
	sumsM <- as.integer(tapply(tt$M, queryHits(oo), FUN=sum))
	sumsT <- as.integer(tapply(tt$T, queryHits(oo), FUN=sum))
	elementMetadata(gr.dj) <- data.frame(M=sumsM, T=sumsT)
	return(gr.dj)
}
#' parseMcTable.bissnp
#'
#' Parse methylation calls from BisSNP output format
#'
#' @param mcFile methylation call file
#' @return a GRanges object containing methylation calls for cytosine positions
#'
#' @author Fabian Mueller
#' @noRd
parseMcTable.bissnp <- function(mcFile){
	tt <- read.table(mcFile, sep="\t", header=FALSE, skip=1)
	colnames(tt)[1:6] <- c("chrom", "start", "end", "perc", "T", "strand")
	doShift <- tt$strand != "-"
	tt$start[doShift] <- tt$start[doShift] + 1L
	tt$end[doShift] <- tt$end[doShift] + 1L

	gr <- GRanges(tt$chrom, IRanges(tt$start, width=2), strand=tt$strand)
	elementMetadata(gr) <- data.frame(M=as.integer(round(tt$perc/100 * tt$T)),T=tt$T)

	gr.merged <- mergeMethGr(gr)
	return(gr.merged)
}
#' parseMcTable.epp
#'
#' Parse methylation calls from EPP output format
#'
#' @param mcFile methylation call file
#' @return a GRanges object containing methylation calls for cytosine positions
#'
#' @author Fabian Mueller
#' @noRd
parseMcTable.epp <- function(mcFile){
	tt <- read.table(mcFile, sep="\t", header=FALSE, stringsAsFactors=FALSE)
	colnames(tt)[1:6] <- c("chrom", "start", "end", "methStr", "score", "strand")

	valM <- as.integer(gsub("^([0-9]+)/([0-9]+)$", "\\1", tt$methStr))
	valT <- as.integer(gsub("^([0-9]+)/([0-9]+)$", "\\2", tt$methStr))

	gr <- GRanges(tt$chrom, IRanges(tt$start, width=2), strand=tt$strand)
	elementMetadata(gr) <- data.frame(M=valM,T=valT)

	gr.merged <- mergeMethGr(gr)
	return(gr.merged)
}

#' logger.cmd.args
#'
#' Log command line options
#'
#' @param cmdArgs command line options as returned by \code{parse_args} of the \code{argparse} package
#' @return nothing of particular interest
#'
#' @author Fabian Mueller
#' @export
logger.cmd.args <- function(cmdArgs){
	require(stringr)
	argNames <- names(cmdArgs)
	argStrings <- as.character(cmdArgs)
	argStrings <- paste0(str_pad(argNames, max(nchar(argNames)), side="right"), " : ", argStrings)
	logger.start("Parameter settings")
		for (i in 1:length(argStrings)){
			logger.info(argStrings[i])
		}
	logger.completed()
	invisible(NULL)
}

#' normalizeMatrix
#'
#' Apply normalizatio to columns of a given matrix
#'
#' @param x      a matrix of values to be normalized
#' @param method method of normalization. Currently supported are:
#'               \code{none} (no normalization),
#'               \code{standard} (subtract the mean, devide by standard deviation),
#'               \code{scale} (scale to the interval [0,1]) and
#'               \code{quantile} (Quantile normalization)
#' @param ...    arguments passed down to the actual normalization functions
#' @return the normalized matrix
#'
#' @author Fabian Mueller
#' @export
normalizeMatrix <- function(x, method="standard", ...){
	require(matrixStats)
	normFuns <- list(
		none = function(x){
			return(x)
		},
		standard = function(x){
			rr <- t( (t(x) - colMeans(x, na.rm=TRUE))/colSds(x, na.rm=TRUE) )
			return(rr)
		},
		quantile = function(x){
			require(preprocessCore)
			# xr <- apply(x, 2, FUN=function(cc){rank(cc, ties.method="min")})
			# xs <- apply(x, 2, sort)
			# xo <- apply(x, 2, FUN=function(cc){order(cc)})
			rr <- normalize.quantiles(x)
			return(rr)
		},
		scale = function(x, a=0, b=1){
			rr <- a + t( ((t(x) - colMins(x, na.rm=TRUE))*(b-a))/(colMaxs(x, na.rm=TRUE)-colMins(x, na.rm=TRUE)) )
			return(rr)
		}
	)
	if (!is.matrix(x)) logger.error("expected matrix [normalizeMatrix]")
	if (!is.element(method, names(normFuns))) logger.error(c("unknown normalization method:", method))
	res <- normFuns[[method]](x, ...)
	return(res)
}
MPIIComputationalEpigenetics/epiRepeatR documentation built on March 22, 2021, 11:09 p.m.