#' Process single-cell data
#'
#' This function subsets the data and prepares it for visualizing by
#' generating representation methylation-state matrices from
#' single-cell methylation data (for example, sc-MNT data).
#' We assume the data has already been preprocess using the subsetSC
#' function in methylscaper. See the vignette for a more thorough
#' explanation of each parameter. The output should be passed directly to the
#' plotting function.
#'
#' @param dataIn A list object containing two elements labelled gch and hcg (already pre-processed.)
#' @param startPos The index of the first position to include
#' in the visualization. If using this within the R console it is
#' recomended to specify the start and end directly.
#' In the Shiny app, a slider will let the user refine these positions.
#' @param endPos The index of the final position to include in the visualization.
#' @param updateProgress A function for generating progress bars in the Shiny app.
#' Should be left NULL otherwise.
#' @return The output is a list containing the elements 'gch' and 'hcg.
#' Each is a dataframe with reads/cells on the rows and each column
#' is a base-pair. The matrix is coded as follows:
#' -2: unmethylated GCH or HCG site
#' -1: base pairs between two unmethylated GCH or HCG sites
#' 0: base pairs between mismatching methylation states of
#' two GCH or HCG sites
#' 1: base pairs between two methylated GCH or HCG sites
#' 2: methylated GCH or HCG site
#' @importFrom utils tail
#' @import data.table
#' @importFrom methods is
#' @export
#' @examples
#'
#' data(singlecell_subset)
#' prepsc.out <- prepSC(singlecell_subset,
#' startPos = 105636488, endPos = 105636993)
prepSC <- function(dataIn, startPos=NULL, endPos=NULL,
updateProgress = NULL)
{
if (is(dataIn, "SummarizedExperiment") | is(dataIn, "SingleCellExperiment")) {
dataIn <- reformatSCE(dataIn)
}
if(is.null(updateProgress) & (endPos - startPos) >= 50000) {
message("Selected range is longer than 50k bp,
plot may take a few seconds to render.")}
if(is.null(updateProgress) & (endPos - startPos) >= 100000) {
message("Selected range is longer than 100k bp,
this may not be optimal. Plot will render but may
take a few minutes.")}
gc_seq_data <- dataIn$gch
cg_seq_data <- dataIn$hcg
if (is.function(updateProgress))
updateProgress(message = "Filtering HCG
(endogenous methylation) site data", value = 0.1)
cg_seq_sub <- lapply(cg_seq_data, function(x) {
tempSub <- x[order(x$pos),]
tempSub = subset(tempSub, tempSub$pos >= startPos & tempSub$pos <= endPos)
return(tempSub)
})
if (is.function(updateProgress))
updateProgress(message = "Filtering GCH (accessibility/occupancy) site data", value = 0.5)
gc_seq_sub <- lapply(gc_seq_data, function(x) {
tempSub <- x[order(x$pos),]
tempSub = subset(tempSub, tempSub$pos >= startPos & tempSub$pos <= endPos)
return(tempSub)
})
all_cg_sites <- unique(do.call(c, cg_seq_sub))
all_gc_sites <- unique(do.call(c, gc_seq_sub))
useseq <- intersect(which(vapply(cg_seq_sub, function(x) nrow(x), numeric(1)) > 1 ),
which(vapply(gc_seq_sub, function(x) nrow(x), numeric(1)) > 1 ))
if (length(useseq) == 0) return(NULL)
cg_seq_sub <- cg_seq_sub[useseq]
gc_seq_sub <- gc_seq_sub[useseq]
if (is.function(updateProgress))
updateProgress(message = "Mapping HCG (endogenous methylation) site data", value = 0.75)
cg_outseq <- lapply(cg_seq_sub, function(x) {
if (nrow(x) == 0) NULL
else mapSC(x, startPos, endPos)
})
cg_outseq <- cg_outseq[!vapply(cg_outseq, is.null, logical(1))]
hcg <- data.matrix(do.call(rbind, cg_outseq))
rownames(hcg) <- as.character(seq(1,nrow(hcg)))
if (is.function(updateProgress))
updateProgress(message = "Mapping GCH (accessibility/occupancy) site data", value = 0.9)
gc_outseq <- lapply(gc_seq_sub, function(x) {
if (nrow(x) == 0) NULL
else mapSC(x, startPos, endPos)
})
gc_outseq <- gc_outseq[!vapply(gc_outseq, is.null, logical(1))]
gch <- data.matrix(do.call(rbind, gc_outseq))
rownames(gch) <- as.character(seq(1,nrow(gch)))
return(list(gch = gch, hcg = hcg))
}
mapSC <- function(inSeq, startPos, endPos) {
inSeq$basePos <- as.numeric(inSeq$pos) - startPos + 1
fill_1 <- seq(startPos, endPos) - startPos + 1
someMethyl <- which(inSeq$rate > 0)
noMethyl <- which(inSeq$rate <= 0)
fill_1[fill_1 %in% inSeq[someMethyl,]$basePos] <- 2
fill_1[fill_1 %in% inSeq[noMethyl,]$basePos] <- -2
fill_1[abs(fill_1) != 2] <- "."
sites = inSeq$basePos
sites <- sites[sites > 0]
editseq = fill_1
sites_temp <- c(0, sites, max(sites)+1)
for (j in seq(1,(length(sites_temp)-1))) {
if (sites_temp[j+1] == 1) { # skip
} else {
tofill <- seq(sites_temp[j]+1,(sites_temp[j+1]-1))
s1 <- editseq[pmax(1, sites_temp[j])]
s2 <- editseq[pmin(length(editseq), sites_temp[j+1])]
if (s1 == "2" & s2 == "2") {
fillvec <- 1 } else if (s1 == "2" & s2 == "-2") {
fillvec <- 0} else if (s1 == "-2" & s2 == "2") {
fillvec <- 0} else if (s1 == "-2" & s2 == "-2") {
fillvec <- -1} else {fillvec <- 0}
fillvec <- rep(fillvec, length(tofill))
editseq[tofill] <- fillvec
}
}
return(editseq)
}
#' Load in methylation data
#'
#' This function loads the single-cell files. It takes a path to the data files
#' and a chromosome number as arguments and returns the desired subset of the
#' data. Processing by chromosome is important for speed and memory efficiency.
#' The input files should be tab separated with three columns.
#' The first column is the chromosome, the second is the position (basepair), and the third
#' is the methylation indicator/rate. The folder should contain two subfolders titled
#' met and acc, with the endogenous methylation and accessibility methylation files,
#' respectively.
#'
#' @param path Path to the folder containing the single-cell files.
#' @param chromosome The chromosome to subset the files to.
#' @param startPos The index of the first position to include
#' in the subsetting. This is optional as further narrowing of the
#' position can be done in the visualization step/tab.
#' In the Shiny app, a slider will let the user refine the positions.
#' @param endPos The index of the final position to include in subset.
#' @param updateProgress A function for generating progress bars in the Shiny app.
#' Should be left NULL otherwise.
#' @return The output is RDS files that can be loaded into the visualization
#' tab on the Shiny app
#' @import data.table
#' @export
#'
#' @examples
#' # example not run since needs directory input from user
#' #subsc.out <- subsetSC("filepath", chromosome=19)
subsetSC <- function(path, chromosome, startPos = NULL,
endPos = NULL, updateProgress = NULL) {
if (!is.list(path)){
cgfiles <- paste0(path,"/","met/", sort(grep("met", list.files(paste0(path,"/met")), value = TRUE)))
gcfiles <- paste0(path,"/","acc/", sort(grep("acc", list.files(paste0(path,"/acc")), value = TRUE)))
} else {
cgfiles <- path[[1]]
gcfiles <- path[[2]]
getord <- order(path[[3]])
cgfiles <- cgfiles[getord]
getord <- order(path[[4]])
gcfiles <- gcfiles[getord]
}
if (length(cgfiles) != length(gcfiles)) {stop("Must have the same number of methylation and
accessibility files!")}
useChr <- chromosome
cg_seq <- list()
for(i in seq(1,length(cgfiles))) {
in_cg_seq <- fread(cgfiles[i],
header=FALSE, stringsAsFactors = FALSE)
# A couple possible formats from bismark:
if(ncol(in_cg_seq) %in% c(4,6)) {
in_cg_seq <- in_cg_seq[,c(1,2,4)]
}
if(ncol(in_cg_seq) != 3) {stop("Data is not formatted correctly. See vignette for details.")}
if (in_cg_seq[1,1] == "chr") {
in_cg_seq <- in_cg_seq[-1,]
}
colnames(in_cg_seq) <- c("chr", "pos", "rate")
in_cg_seq$pos <- as.numeric(in_cg_seq$pos)
in_cg_seq$rate <- as.numeric(in_cg_seq$rate)
in_cg_seq <- subset(in_cg_seq, in_cg_seq$chr==useChr)
if (!is.null(startPos) & !is.null(endPos)) {
in_cg_seq <- subset(in_cg_seq, in_cg_seq$pos >= startPos & in_cg_seq$pos <= endPos)
}
cg_seq[[i]] <- in_cg_seq
if (is.function(updateProgress))
updateProgress(message = "Reading HCG (endogenous methylation) site files", value = i / length(cgfiles))
}
gc_seq<- list()
for(i in seq(1,length(gcfiles))) {
in_gc_seq <- fread(gcfiles[i], header=FALSE, stringsAsFactors = FALSE)
if(ncol(in_gc_seq) %in% c(4,6)) {
in_gc_seq <- in_gc_seq[,c(1,2,4)]
}
if(ncol(in_gc_seq) != 3) {stop("Data is not formatted correctly. See vignette for details.")}
colnames(in_gc_seq) <- c("chr", "pos", "rate")
if (in_gc_seq[1,1] == "chr") {
in_gc_seq <- in_gc_seq[-1,]
}
colnames(in_gc_seq) <- c("chr", "pos", "rate")
in_gc_seq$pos <- as.numeric(in_gc_seq$pos)
in_gc_seq$rate <- as.numeric(in_gc_seq$rate)
in_gc_seq <- subset(in_gc_seq, in_gc_seq$chr==useChr)
if (!is.null(startPos) & !is.null(endPos)) {
in_gc_seq <- subset(in_gc_seq, in_gc_seq$pos >= startPos & in_gc_seq$pos <= endPos)
}
gc_seq[[i]] <- in_gc_seq
if (is.function(updateProgress))
updateProgress(message = "Reading GCH (accessibility/occupancy) site files", value = i / length(gcfiles))
}
list(hcg = cg_seq, gch = gc_seq)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.