require(openxlsx)
require(stringr)
require(ggplot2)
require(gridExtra)
#' This script is for processing BASESPACE output
#'
#' Get the names of amplicons from excel sheet names
.findSheetsWAmps <- function(file) {
shts <- getSheetNames(file)
shts <- shts[!sapply(shts, grepl, file)]
if(length(shts) > 0) return(shts)
# If this fails, just remove first sheet
shts <- getSheetNames(file)
shts <- shts[-1]
return(shts)
}
#' Guess standards beta based on file name
#' P.S. This function does well but is not as smart as you are
#' It will fail if the filenames are too obscure (or for other reasons)
#' PLEASE double check the results...
#' And when in doubt, just pick out the standards manually
#' @import readxl
#' @import openxlsx
#' @import stringr
#' @param filenames names of files
#' @return data.frame with following columns
#' ID: possible sample ID
#' beta: beta value
#' IsStd: is the sample a standard
#' file: file path
# guessStdBetas <- function(filenames) {
if(FALSE) {
myFilelist <- basename(filenames)
fnParse <- str_split_fixed(myFilelist, "[-_\\.]", 10)
# Check if part of the file name has 0 and 100
# samples <- unique(fnParse[,1])
# fnIsStd <- sapply(samples, function(y) apply(fnParse[fnParse[,1] %in% y,], 2, function(x) any(x == 0) & any(x == 100)))
fnIsStd <- apply(fnParse, 2, function(x) any(x == 0) & any(x == 100))
fnIsStd[is.na(fnIsStd)] <- FALSE
# SpIsStd <- apply(fnIsStd, 2, any)
if(!any(fnIsStd) | sum(fnIsStd) > 1) {
message("Unable to determine standards methylation value. ",
"Methylation values are determined by a value embedded in ",
"_ or . For example, Standard_0_m.xlsx, ",
"Standard_50_m.xlsx, Standard_100_m.xlsx")
return(NULL)
}
# Get standard betas
StdBeta <- fnParse[,fnIsStd]
SampleIsStd <- grepl("^[0-9]+$", StdBeta)
SampleIDs <- apply(fnParse[,!fnIsStd], 1, paste, collapse = "_")
SampleIDs <- gsub("_targeted.*", "", SampleIDs)
# Find which samples have standards
StdIDs <- sapply(unique(SampleIDs), function(x) all(SampleIsStd[SampleIDs %in% x]))
StdIDs <- names(StdIDs)[StdIDs]
res <- data.frame(ID = SampleIDs, beta = as.numeric(StdBeta),
IsStd = SampleIsStd, file = filenames,
stringsAsFactors=FALSE)
return(res)
}
#' stdcurveBSampseq
#' Generate standard curve for BS amplicon sequencing
#'
#' Outliers are defined as points which are +/- StdFitPercent
#' from the median beta value for each set of percent methylation
#' (i.e. median = 0.4, StdFitPercent = 0.4, acceptable range = 0-0.8)
#'
#' @import readxl
#' @import openxlsx
#'
#' @param filenames processed excel files (use processBSampseq)
#' @param betas beta values of files
#' @param name name of sample
#' @param StdFitPercent (0.4) beta deviation from median to be classified as outlier
#' @param outputDir (NULL) output folder for saving excels
#' @param saveExcel (TRUE) save excel file
#' @param return (FALSE) returns value
#'
#' @return data.frame or excel file with standard curves
#'
#'
bs_stdcurveBSampseq <- function(filenames, betas, name, StdFitPercent = 0.4,
outputDir = NULL, saveExcel = TRUE, return = FALSE) {
# Order files by increasing beta
myFileList <- filenames[order(betas)]
StdBeta <- betas[order(betas)]
# Find common amplicons in all standards file
mySheets <- lapply(myFileList, .findSheetsWAmps)
mySheets.common <- sapply(mySheets, function(x) mySheets[[1]] %in% x)
if(class(mySheets.common) == "logical") {
mySheets.common <- all(mySheets.common)
} else {
mySheets.common <- apply(mySheets.common, 1, all)
}
if(!all(mySheets.common)) message("Some amplicons were not found in all standards, only common amplicons were used.")
mySheets <- mySheets[[1]][mySheets.common]
# Read data for all standards
Stds <- lapply(myFileList, function(x) lapply(mySheets, function(y) read.xlsx(x, sheet = y, colNames = TRUE)))
# If no correct header, but has over 8 columns
if(!"Beta" %in% names(Stds[[1]][[1]])) {
Stds <- lapply(myFileList, function(x) lapply(mySheets, function(y) read.xlsx(x, sheet = y, colNames = FALSE)))
if(all(sapply(Stds[[1]], ncol) >= 7)) {
Stds <- lapply(Stds, function(x) lapply(x, function(y) setNames(y, c("Chrm","Position","Strand","Methyl Read","UnMethyl Reads","Context","Seq","Beta"))))
} else {
stop("Error reading standards, make sure the columns are properly labeled.")
}
}
names(Stds) <- StdBeta
Stds <- lapply(Stds, function(x) setNames(x, mySheets))
# Identify only common columns and trim off excess columns
myCols <- lapply(Stds, function(x) sapply(x, function(y) names(y)))
myCols <- sapply(unique(as.vector(myCols[[1]])), function(z)
all(sapply(myCols, function(x) all(apply(x, 2, function(y) z %in% y))))
)
myCols <- names(myCols[myCols])
myCols <- myCols[myCols %in% c("Chrm","Position","Strand","Meth_Reads","UnMeth_Reads","Context","Seq","Beta")]
Stds <- lapply(Stds, function(x) lapply(x, function(y) y[,myCols]))
if(!"Beta" %in% myCols) stop("Beta column not found in one or more files.")
# Add percent methylation value
for (i in 1:length(Stds)) {
Stds[[i]] <- lapply(Stds[[i]], function(x) data.frame(x, Exp_Beta=StdBeta[i]))
}
# Combine all standards for each amplicon
AmpStds <- list()
for (i in 1:length(mySheets)) {
AmpStds[[mySheets[i]]] <- do.call("rbind", sapply(Stds, function(x) x[i]))
}
# Identify outliers for each amplicon
for(iAmp in 1:length(AmpStds)) {
AmpDat <- AmpStds[[iAmp]]
# Create a list of CpGs, using position as name
PrimerGood <- rep(TRUE, length(unique(AmpDat$Position)))
names(PrimerGood) <- unique(AmpDat$Position)
# Go through each CpG and check if the median is greater than StdFitPercent
for (i in unique(AmpDat$Exp_Beta)) {
CpGNums <- match(AmpDat$Position[AmpDat$Exp_Beta %in% i], names(PrimerGood))
OneBeta <- AmpDat$Beta[AmpDat$Exp_Beta %in% i]
PrimerGood[CpGNums] <- !abs(OneBeta - median(OneBeta)) > StdFitPercent & PrimerGood[CpGNums]
}
AmpDat$Outlier <- !PrimerGood[match(AmpDat$Position, names(PrimerGood))]
AmpStds[[iAmp]] <- AmpDat
}
# Check if there are no values in amplicon
AmpLMStds <- AmpStds
AmpYesReads <- sapply(AmpStds, function(x) any(!x$Outlier))
AmpYesReads[is.na(AmpYesReads)] <- FALSE
if(any(!AmpYesReads)) {
AmpLMStds <- AmpStds[AmpYesReads]
print(paste("No reads found in the following amplicons, no standard curve were made:",
paste(names(which(!AmpYesReads)), collapse=", ")))
}
# Calculate linear model and r-square (exclude outliers)
PrimerRsq <- sapply(AmpLMStds, function(x) summary(lm(Beta ~ Exp_Beta, x[!x$Outlier,], na.action=na.omit))$r.squared)
PrimerLM <- sapply(AmpLMStds, function(x) lm(Beta ~ Exp_Beta, x[!x$Outlier,], na.action=na.omit)$coefficients)
# Add these numbers to matrices for export
for (i in names(AmpLMStds)) {
AmpStds[[i]]$variable <- c("R-squared", "Gradient", "Intercept", rep(NA, nrow(AmpStds[[i]])-3))
AmpStds[[i]]$value <- c(PrimerRsq[i], PrimerLM[2,i], PrimerLM[1,i], rep(NA, nrow(AmpStds[[i]])-3))
}
# Write excel file
outputfile <- paste(name, "_StdCurve.xlsx", sep="")
# outputfile <- gsub(pattern = ".xlsx$", replacement = "_StdCurve.xlsx", myFileList[length(myFileList)])
# outputfile <- gsub(pattern = "[\\_\\.]100", replacement = "", outputfile)
if(!is.null(outputDir)) outputfile <- file.path(outputDir, outputfile)
if(saveExcel) write.xlsx(AmpStds, file = outputfile, col.names = TRUE)
if(return) return(AmpStds)
}
#' plotStdCurve
#' Plot standard curve for one amplicon
#'
#' @import ggplot2
#'
#' @param df data.frame with data
#' @param line.yint stdcurve y-intercept
#' @param line.slope stdcurve slope
#' @param line.rsq stdcurve r-squared
#' @param beta name of observed beta column
#' @param expected name of expected beta column
#' @param outlier name of outlier column
#' @param legend logical: include legend?
#' @param title title of plot
#' @param plotgraph logical: plot the graph in R?
#' @param savefile location of file to save or FALSE
#'
bs_plotStdCurve <- function(df, line.yint, line.slope, line.rsq,
beta="Beta", expected="Exp_Beta", outlier="Outlier",
legend = TRUE, title,
plotgraph = FALSE, savefile = FALSE) {
# Look for abline and rsq values if not defined
if(all(missing(line.yint), missing(line.slope), missing(line.rsq))) {
if(all(c("variable", "value") %in% names(df))) {
myVars <- df$value
names(myVars) <- df$variable
line.yint <- myVars["Intercept"]
line.slope <- myVars["Gradient"]
line.rsq <- myVars["R-squared"]
}
}
# Create base plot
myPlot <- ggplot(df) +
geom_point(aes_string(x=expected, y=beta, color=outlier),
size = 2, na.rm = TRUE) +
scale_color_manual(values=c("TRUE"="red", "FALSE"="black")) +
ylim(0,1)
# Add title
if(!missing(title)) {
myPlot <- myPlot +
labs(title = title) +
theme(plot.title = element_text(hjust = 0.5))
}
# Add best fit line
if(!any(missing(line.yint), missing(line.slope))) {
myPlot <- myPlot + geom_abline(intercept = line.yint, slope=line.slope)
}
# Add R-squared value
if(!missing(line.rsq)) {
myPlot <- myPlot + annotate("text", x=0, y=1, hjust=0, vjust=1, size=3, parse=TRUE,
label = paste("R^2 ==", round(line.rsq,3)))
}
# Exclude legend
if(!legend) myPlot <- myPlot + theme(legend.position="none")
# Show, save or return plot
if(plotgraph) plot(myPlot)
if(savefile != FALSE) ggsave(savefile, plot=myPlot, device="png",
width = 150, height = 100, units = "mm", dpi = 300)
return(myPlot)
}
#' plotAllStdCurve
#' Wrapper for plotStdCurve
#' Generates plot for each amplicon, and an additional plot with all amplicons
#'
#' @import ggplot2
#' @import gridExtra
#'
#' @param dflist output from stdcurveBSampseq
#' could be either list, or excel file
#' @param sampleID name of sample displayed on plots
#' @param outputDir output directory
#'
bs_plotAllStdCurve <- function(dflist, sampleID="", outputDir=NULL) {
# If dflist is not a list but a character, try to load it as excel
if(is.character(dflist) & all(grepl("xlsx$", dflist))) {
dflist <- .read.xlsx.allsheets(dflist)
}
# Create a list of plots
AmpPlots <- list()
for (iAmp in 1:length(dflist)) {
AmpDat <- dflist[[iAmp]]
outputfile <- paste(sampleID, names(dflist[iAmp]), "StdCurve.png", sep="_")
if(!is.null(outputDir)) outputfile <- file.path(outputDir, outputfile)
plotStdCurve(AmpDat,
title = paste(sampleID, names(dflist)[iAmp], "Standard Curve"),
plotgraph = FALSE, savefile = outputfile)
AmpPlots[[iAmp]] <- plotStdCurve(AmpDat, title=names(dflist[iAmp]),
plotgraph=FALSE, legend=FALSE)
}
# Create plot with all amplicons in one image
names(AmpPlots) <- names(dflist)
# Calculate height + width of plots
iAmp <- round(sqrt(length(dflist)))
iAmp[2] <- ceiling(length(dflist) / iAmp)
# Create image and save file
AmpGrobs <- arrangeGrob(grobs = AmpPlots, top=sampleID)
outputfile <- paste(sampleID, "ALL_StdCurve.png", sep="_")
if(!is.null(outputDir)) outputfile <- file.path(outputDir, outputfile)
ggsave(outputfile, AmpGrobs,
device="png", width = 100*iAmp[1], height = 75*iAmp[2],
units = "mm", dpi=300)
}
#' Read all excel sheets
.read.xlsx.allsheets <- function(fn) {
shts <- getSheetNames(fn)
xl <- lapply(shts, read.xlsx , xlsxFile=fn)
names(xl) <- shts
return(xl)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.