#' Read an abra.sv file
#'
#' Read in a sample's abra.sv.txt file, or a merged (concatenated)
#' abra.sv.txt file with an extra (initial) column giving the sample name.
#'
#' @param file The abra.sv fusion data file to load. Can be a single abra sv
#' file, or a merged file with results for multuple samples.
#'
#' @param stranded Boolean indicating if the abra file has strand information.
#' Currently, setting this TRUE will prevent processing as stradedness is not
#' implemented. Default is FALSE.
#'
#' @param orderedEnds Boolean indicating if the output should order the two
#' fusions ends canonically (ref, then pos), so the earlier one is first.
#' Default is TRUE.
#'
#' @param tumorOnly Boolean indicating if the abra file has one column, e.g.
#' tumor only or two columns of data, e.g. tumor and normal. Default is FALSE.
#'
#' @return Returns a data frame describing the fusions, with columns:
#' \itemize{
#' \item{ \bold{sample} [optional] The sample this fusion came from.}
#' \item{ \bold{group} The group name (within sample) of equivalent fusions.}
#' \item{ \bold{chr1} End1's chromosome, parsed from <chr>:<pos>.}
#' \item{ \bold{pos1} End1's position, parsed from <chr>:<pos>.}
#' \item{ \bold{chr1} End2's chromosome, parsed from <chr>:<pos>.}
#' \item{ \bold{pos1} End2's position, parsed from <chr>:<pos>.}
#' \item{ \bold{orientation} The relative directionality of end1 and end2, as
#' blank " " if in the same direction (FF or RR from input), or as "I" if
#' relatively inverted (FR or RF fromn input).}
#' \item{ \bold{normal_count} The number of split reads containing the fusion
#' point in the normal sample.}
#' \item{ \bold{tumor_count} The number of split reads containing the fusion
#' point in the tumor sample.}
#' }
#'
#' @examples
#' svFile <- "tests/testthat/data/perSampleAbraSv.tsv"
#' svFile <- system.file( svFile, package= "fusionClust" )
#' fusions <- loadAbraSv( svFile )
#' @export
readAbraSv <- function (file, stranded= FALSE, orderedEnds= TRUE, tumorOnly= FALSE) {
# Can't handle stranded data
if (stranded) {
stop(
"Can't handle abra.sv data with absolute strand information.",
call.= FALSE
)
}
# Load data
# Have to specify column classes as all "F" in orientation column will show
# up as FALSE.
df <- read.csv( file, header= FALSE, sep= "\t", stringsAsFactors= FALSE, colClasses = "character" )
# Set row names equal to the line from the file
rownames(df) <- 1:nrow(df)
# Set input column names, may be single sample or merged cohort
countColumnNames = c("normal_count", "tumor_count")
if (tumorOnly) {
countColumnNames = c("tumor_count")
}
names <- c( "group", "end1", "end2", "strand1", "strand2", countColumnNames )
keepColumns <- c( "id", "group", "chr1", "pos1", "chr2", "pos2",
"orientation", countColumnNames )
# Use column count to guess if have sample names.
withSample <- FALSE
if ((tumorOnly & ncol(df) == 7) | ( ! tumorOnly & ncol(df) == 8)) {
withSample <- TRUE
}
if (withSample) {
names <- c("sample", names)
keepColumns <- c("sample", keepColumns)
}
else if (( tumorOnly & ncol(df) != 6 ) | ( ! tumorOnly & ncol(df) != 7 )) {
stop(
"Doesn't look like an abra.sv file, has the wrong column count.",
call.= FALSE
)
}
colnames(df) <- names
df$id <- 1:nrow(df)
# Convert chr:pos entries to separate columns, for each end
df$chr1 <- sub( ":.*", "", df$end1 )
df$chr2 <- sub( ":.*", "", df$end2 )
df$pos1 <- as.integer(sub( ".*:", "", df$end1 ))
df$pos2 <- as.integer(sub( ".*:", "", df$end2 ))
# Fix false stranded info to "relative orientation"
df$orientation <- paste0(df$strand1, df$strand2)
df$orientation <- sub( "FR|RF", "I", df$orientation)
df$orientation <- sub( "FF|RR", "", df$orientation)
# Convert data columns back to integers
df$tumor_count <- as.integer( df$tumor_count)
if (! tumorOnly ) {
df$normal_count <- as.integer( df$normal_count)
}
# Set output column names, may be single sample or merged cohort
df <- df[, keepColumns]
if (orderedEnds) {
df <- orderFusionEnds( df )
}
}
#' Write an abra.sv file
#'
#' Write a sample's abra.sv.txt file, or a merged (concatenated)
#' abra.sv.txt file with an extra (initial) column giving the sample name.
#'
#' @param file The abra.sv fusion data file to write.
#' @param fusions The file Can be a single abra sv
#' file, or a merged file with results for multuple samples.
#'
#' @return A data frame object describing the fusions, with columns:
#' \itemize{
#' \item{ \bold{sample} [optional] The sample this fusion came from.}
#' \item{ \bold{group} The group name (within sample) of equivalent fusions.}
#' \item{ \bold{chr1} End1's chromosome, parsed from <chr>:<pos>.}
#' \item{ \bold{pos1} End1's position, parsed from <chr>:<pos>.}
#' \item{ \bold{chr1} End2's chromosome, parsed from <chr>:<pos>.}
#' \item{ \bold{pos1} End2's position, parsed from <chr>:<pos>.}
#' \item{ \bold{orientation} The relative directionality of end1 and end2, as
#' blank " " if in the same direction (FF or RR from input), or as "I" if
#' relatively inverted (FR or RF fromn input).}
#' \item{ \bold{normal_count} The number of split reads containing the fusion
#' point in the normal sample.}
#' \item{ \bold{tumor_count} The number of split reads containing the fusion
#' point in the tumor sample.}
#' }
#'
#' @examples
#' svFile <- "tests/testthat/data/perSampleAbraSv.tsv"
#' svFile <- system.file( svFile, package= "fusionClust" )
#' fusions <- loadAbraSv( svFile )
#' @export
writeAbraSv <- function (fusions, file, stranded= FALSE) {
# Can't handle stranded data
if (stranded) {
stop(
"Can't handle abra.sv data with absolute strand information.",
call.= FALSE
)
}
if (is.null(fusions$sample)) {
}
outDF <- data.frame(
stringsAsFactors = FALSE
)
}
#' Order fusion ends in the cohort.
#'
#' Cannonically ordered fusions are ordered with lower end first. This switches
#' them if the high end was first. Low means alphabetic ordering by chr, and
#' then numeric ordering by position. No fusion can have the same left and right
#' side as positions are the last included base on each end.
#'
#' @param fusions A fusion cohort as a data frame with at least the columns:
#' chr1, pos1, chr2, and pos2.
#'
#' @return The input fusion cohort, but with all fusions listed low end first.
#'
#' @export
orderFusionEnds <- function( fusions ) {
# Order ignores canonical chromosome order; only using alphabetic
# Get a logical vector indicating which rows need fusion ends swapping.
swapChr <- fusions$chr1 > fusions$chr2
swapPos <- (fusions$chr1 == fusions$chr2) & (fusions$pos1 > fusions$pos2)
swap <- swapChr | swapPos
# Do swap based on logical swap vector
chrOld <- fusions$chr1
fusions$chr1[swap] <- fusions$chr2[swap]
fusions$chr2[swap] <- chrOld[swap]
rm(chrOld)
posOld <- fusions$pos1
fusions$pos1[swap] <- fusions$pos2[swap]
fusions$pos2[swap] <- posOld[swap]
rm(posOld)
fusions
}
#' Sort fusions
#'
#' @param fusions The fusions to sort
#'
#' @return Data frame of fusion ends, sorted from low to high, with fusion
#' id end columns added to allow mapping back to its source.
#'
#' @export
sortFusions <- function ( fusions ) {
if ( ! all( c("chr1", "chr2", "pos1", "pos2") %in% names(fusions) )) {
stop( "This doesn't appear to be a fusion frame", call.= FALSE )
}
# Build data frame of merged left and right fusion ends.
fusions <- data.frame( fusionId= rownames(fusions),
chr= c(fusions$chr1, fusions$chr2),
pos= c(fusions$pos1, fusions$pos2),
fusionEnd= c(rep(1,nrow(fusions)), rep(2, nrow(fusions))),
stringsAsFactors= FALSE )
# Order by chr and pos, low to high (alphabetical chr only)
end <- fusions[with(fusions, order(chr, pos, fusionEnd)),]
}
#' Fusion endpoint spacings
#'
#' Merges the two enpoints into a single list of positions and spacings, then
#' sorts, splits by chromosome, and calculates the \code{diff()} of successive
#' endpints.
#'
#' @param fusions Data frame of fusions to cluster
#'
#' @return A list by chromosome, each a vector of fusion endpoint spacings on
#' that chromosome.
#'
#' @export
fusionGaps <- function ( fusions ) {
end <- sortFusions(fusions)
## Get gaps between successive ends, per chr
# Get chr names
chrNames <- unique( end$chr )
# Pre-create list with placeholders for gap vectors
gaps <- as.list( rep( NA_integer_, length(chrNames)))
names(gaps) <- chrNames
# Get gaps
for (chr in chrNames) {
posSet <- end$pos[end$chr == chr]
if (length(posSet > 1))
gaps[[chr]] <- diff(posSet)
}
gaps
}
#' Cluster fusions
#'
#' @param fusions The fusions to cluster
#'
#' @param window The window size to use for clustering
#'
#' @return Data frame with fusions annotated by the cluster each end belongs to
#' and by paired fusion.
#'
#' @export
clusterFusions <- function ( fusions, window= 10 ) {
ends <- sortFusions(fusions)
clusteredEnds <- clusterFusionEnds( ends, window )
fusionCount <- nrow(fusions)
clusteredEnds1 <- clusteredEnds[clusteredEnds$fusionEnd == 1,]
fusions[clusteredEnds1$fusionId, "end1Cluster"] <- clusteredEnds1$clusterId
fusions[clusteredEnds1$fusionId, "end1ClusterLow"] <- clusteredEnds1$clusterLow
fusions[clusteredEnds1$fusionId, "end1ClusterHigh"] <- clusteredEnds1$clusterHigh
fusions$end2Cluster <- rep(NA_integer_, fusionCount)
clusteredEnds2 <- clusteredEnds[clusteredEnds$fusionEnd == 2,]
fusions[clusteredEnds2$fusionId, "end2Cluster"] <- clusteredEnds2$clusterId
fusions[clusteredEnds2$fusionId, "end2ClusterLow"] <- clusteredEnds2$clusterLow
fusions[clusteredEnds2$fusionId, "end2ClusterHigh"] <- clusteredEnds2$clusterHigh
clustClustNames <- unique(paste0(fusions$end1Cluster, "~", fusions$end2Cluster))
map <- 1:length(clustClustNames)
names(map) <- sort(clustClustNames)
fusions$fusionClusterId <- map[paste0(fusions$end1Cluster, "~", fusions$end2Cluster)]
fusions
}
#' Cluster and annotate a fusion end list
#'
#' @param ends A dataframe of fusion ends, sorted.
#' @param window The size of the clustering window, in base pairs. Two fusion
#' ends where the difference in the last sequenced base position is less than
#' this will be merged, e.g. if the positions are 10 and 19, a window of 10
#' will merge these, a window of 9 will not.
#'
#' @return The data frame of fusion ends, annotated with the cluster they
#' belong to (adding columns clusterId, clusterLow, and clusterHigh).
#'
#' @export
clusterFusionEnds <- function ( ends, window= 10 ) {
endsCount <- nrow(ends)
rownames(ends) <- NULL
# Pre-allocating extra space; creates cluster list the same size as the fusion
# end list, will only use the rows that are not NA when done. Wastes space to
# avoid growing a data frame.
ends$clusterId <- rep(NA_integer_, endsCount)
ends$clusterLow <- rep(NA_integer_, endsCount)
ends$clusterHigh <- rep(NA_integer_, endsCount)
ends$chained <- rep(NA, endsCount)
# Init first fusion
previousChr <- ends[1, "chr"]
previousPos <- ends[1, "pos"]
# Init first cluster
clusterId <- 1
lowPos <- previousPos
highPos <- previousPos
fusionIdStart <- 1
for (fusionIdCurrent in 2:endsCount) {
# Init new fusion
currentChr <- ends[fusionIdCurrent, "chr"]
currentPos <- ends[fusionIdCurrent, "pos"]
if ( previousChr == currentChr && currentPos - previousPos < window ) {
### Grow cluster
# fusionIdStart <- unchanged as same cluster
# clusterId <- unchanged as same cluster
# lowPos <- unchanged as expect ends to be ordered
highPos <- currentPos
### Set old fusion
# previousChr <- unchanged as only way to be in this block
previousPos <- currentPos
}
else {
### Report cluster.
# Now know previous fusion ended cluster (if any), so report that value.
# Actually ending at position of last fusion in cluster, which was the
# previous one, [fusionIdCurrent - 1]. Counting from 2 so never OOB
for (fusionId in fusionIdStart:(fusionIdCurrent - 1)) {
ends[fusionId, c("clusterId", "clusterLow", "clusterHigh")] <-
list( clusterId, lowPos, highPos )
}
### Init new cluster
clusterId <- clusterId + 1
lowPos <- currentPos
highPos <- currentPos
fusionIdStart <- fusionIdCurrent
### Set old fusion
previousChr <- currentChr
previousPos <- currentPos
}
}
### Only now know previous cluster was last, so end and report it
for (fusionId in fusionIdStart:(fusionIdCurrent)) {
ends[fusionId, c("clusterId", "clusterLow", "clusterHigh")] <-
list( clusterId, lowPos, highPos )
}
ends
}
#' Load a bed per-base coverage file
#'
#' @param file The file to load
#' @param merged TRUE if merged file from multiple samples with the first
#' column the "sample"
#'
#' @return A data frame with the coverage info, including a sample column
#' if a merged file.
#'
#' @export
loadBedCoverage <- function(file, merged=FALSE) {
df <- read.csv( file, header= FALSE, sep= "\t", stringsAsFactors= FALSE )
names <- c( "chr", "start", "end", "pos", "depth")
if (merged) {
names <- c("sample", names)
}
colnames(df) <- names
df
}
# plotCoverage( coverageDF, feature ) {
# feature <- coverage[coverage$chr=="gi|333031|lcl|HPV16REF.1|",]
# }
#' Merge fusion site counts by cluster
#'
#' @param clusteredFusions The clustered fusion sites
#'
#' @return A data frame with the same columns as clusteredFusions, but with
#' different entries for the same cluster in the same sample collapsed.
#'
#' @export
mergeCountsByCluster <- function ( clusteredFusions ) {
sumColumns <- c("normal_count", "tumor_count")
byColumns <- c( "sample", "fusionClusterId", "chr1", "chr2",
"end1Cluster", "end1ClusterLow", "end1ClusterHigh",
"end2Cluster", "end2ClusterLow", "end2ClusterHigh" )
summed <- aggregate(
cbind(normal_count, tumor_count) ~
sample + fusionClusterId + chr1 + chr2 +
end1Cluster + end1ClusterLow + end1ClusterHigh +
end2Cluster + end2ClusterLow + end2ClusterHigh,
clusteredFusions[c(byColumns, sumColumns)],
FUN= sum
)
summed[with(summed, order(sample, fusionClusterId)),]
}
#' Load virus count file
#'
#' @param file Single sample or cohort virus count file. Cohort has extra (first)
#' column giving sample name.
#'
#' @return A matrix of samples (as rownames) x virus (as columnNames), giving
#' the counts of viruses found in the samples.
#'
#' @examples
#'
#' @export
readVirusCounts <- function(file) {
# Load data
df <- read.csv( file, header= FALSE, sep= "\t", stringsAsFactors= FALSE )
# Set input column names, may be single sample or merged cohort
names <- c( "virus", "start", "end", "count" )
if ( ncol(df) == 5 ) {
names <- c("sample", names)
}
else if ( ncol(df) != 4 ) {
stop(
"Doesn't look like an virus counts file, has the wrong column count.",
call.= FALSE
)
}
colnames(df) <- names
# Extract to matrix, one row per sample, one column per virus
sampleNames <- unique(df[,"sample"])
virusNames <- unique(df[ df["sample"] == sampleNames[1], "virus" ])
virusCounts <- as.data.frame(
t(sapply( sampleNames, function (sampleName) {
df[ df["sample"] == sampleName, "count" ]
}))
)
colnames(virusCounts) <- virusNames
virusCounts
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.