#!/usr/bin/env R
# Author: Sean Maden
# Functions for `rmpipeline` and `recountmethylation` packages.
#----------
# Utilities
#----------
#' Get list of index blocks
#'
#' Get list of index blocks, allowing for remainders.
#'
#' @param slength Total length of index vector
#' @param bsize Size of index blocks along length
#' @return List of index blocks of min length `slength`/`bsize`
#' @export
getblocks <- function(slength, bsize){
iv <- list()
if(slength < bsize){
iv[[1]] <- seq(1, slength, 1)
} else{
sc <- 1; ec <- sc + bsize - 1
nblocks <- slength %/% bsize
for(b in 1:nblocks){
iv[[b]] <- seq(sc, ec, 1)
sc <- ec + 1; ec <- ec + bsize
}
# add final indices
if(nblocks < (slength/bsize)){
iv[[length(iv) + 1]] <- seq(sc, slength, 1)
}
}
return(iv)
}
#' Append metadata to HDF5 or SE object
#'
#' Passes version info and timestamp from Python to object metadata
#' @param title Object title
#' @param version Numeric version to be passed, should conform to ##.##.##
#' nomenclature
#' @param pname Name of pipeline package (default: "rmpipeline")
#' @param sname Name of Python script (default: "get_timestamp.y")
#' @return Metadata content for the object
#' @export
get_metadata <- function(title, version, pname = "rmpipeline",
sname = "get_timestamp.py"){
mdl <- list(title = title, version = version)
# get timestamp from package python script
path <- paste(system.file(package = pname), sname, sep="/")
#ts <- system(paste("python", path, sep = " "),
# intern = TRUE, wait = FALSE)
ts <- gsub("\\..*", "", as.character(as.numeric(Sys.time())))
mdl[["timestamp"]] <- ts; return(mdl)
}
#---------------------
# Get array file sizes
#---------------------
#' Get matrices of IDAT file sizes before and after performing 2 quality
#' filters.
#'
#' Stores matrices with IDAT file sizes before and after applying size
#' filters. IDAT hlink files are filtered based on absolute file
#' size ranges expected for the instance array platform ID. Second, a
#' filter is performed to remove file pairs of differing sizes. Results
#' matrices are saved at mdpath with "msize-all_idat_hlinks_" and
#' "msize-filt_idat_hlinks_" fn patterns.
#'
#' @param idats.path Path to the instance directory containing IDAT hlink files.
#' @param mdpath Path to instance directory containing the metadata.
#' @return Returns a size-filtered matrix of IDAT hlink filenames.
#' @export
get_idat_sizem <- function(idats.path = file.path("recount-methylation-files",
"idats"),
mdpath = file.path("recount-methylation-files",
"metadata")){
md <- rmp_handle_metadata(); arrayid <- md[["accessionID"]]
sfilt.max <- ifelse(arrayid == "GPL13534", 1e7,
ifelse(arrayid == "GPL21145", 1e8, "NA"))
sfilt.min <- ifelse(arrayid == "GPL13534", 4e6,
ifelse(arrayid == "GPL21145", 1e7, "NA"))
message("Using absolute size (bytes) filters ",
"min = ", sfilt.min, ", max = ", sfilt.max, "...")
message("Getting the file sizes matrix...")
idats.lfv <- list.files(idats.path)
idats.lfv.filt <- idats.lfv[grepl(".*hlink.*", idats.lfv)]
if(length(idats.lfv.filt) == 0){
stop("Didn't find IDAT hlinks at path ",idats.path, ".")}
idats.fpathv <- file.path(idats.path, idats.lfv.filt)
msize <- do.call(rbind, lapply(idats.fpathv, function(x){
c(x, file.info(x)$size)})); colnames(msize) <- c("hlink.fname","size")
msize <- as.data.frame(msize, stringsAsFactors = FALSE)
msize[,2] <- as.numeric(msize[,2])
message("Got size data for ", nrow(msize), " IDAT hlink files...")
msize.fname <- paste0("msize-all_idat_hlinks_", md[["timestamp"]], ".rda")
msize.fpath <- file.path(mdpath, msize.fname)
message("Saving full size matrix file to ", msize.fpath)
save(msize, file = msize.fpath); message("Performing size filters...")
msf <- msize[msize[,2] > sfilt.min & msize[,2] < sfilt.max,]
message("After filtering on absolute size, retained ", nrow(msf)," files...")
gsmv <- unique(basename(gsub("\\..*", "", msf[,1])))
hfiltv <- unlist(lapply(gsmv, function(x){
cond <- FALSE; gfv <- msf[grepl(x, msf[,1]),1]
if(length(gfv) == 2){
sffv <- msf[msf[,1] %in% gfv, 2]
if(sffv[1] == sffv[2]){cond <- TRUE}
}; if(cond){return(gfv)} else{return("")}
})); msf <- msf[msf[,1] %in% hfiltv,]
message("After filtering matched IDAT sizes, retained ", nrow(msf)," files.")
msf.fname <- paste0("msize-filt_idat_hlinks_", md[["timestamp"]], ".rda")
msf.fpath <- file.path(mdpath, msf.fname)
message("Saving filtered size matrix file to ", msf.fpath)
save(msf, file = msf.fpath); return(msf)
}
#----------------------------
# Make data tables from IDATs
#----------------------------
#' Get red and grn signal data tables
#'
#' Generates 2-channel signal data tables. Validates IDATs, handles new paths
#' and data.table options, and provides verbose input from main for loop to
#' write data.
#'
#' @param platform The DNAm array platform of files to read/write (either
#' "hm450k" or "epic").
#' @param version Version of the run for data table filenames.
#' @param ts NTP timestamp integer, for filenames (see get_metadata function).
#' @param verbose Whether to return verbose notifications.
#' @param gsmint Number of GSMs to process at a time, typically runs best near 50 samples.
#' @param overwrite Whether to overwrite existing data.table files with same destpath (default TRUE).
#' @param fnstem Filename stem for data tables.
#' @param sepval Separator symbol for data being written (default " ").
#' @param idatspath Path to IDAT files to read.
#' @param getnb Whether to get noob-normalized Beta-vlaues (default: FALSE).
#' @return NULL, writes new data tables from IDATs using `dt_write_rg()`.
#' @examples
#' #version = "0.0.1"
#' #timestamp = get_metadata("title", version)[["timestamp"]]
#' #dtables_rg(version = version, timestamp = timestamp)
#' @export
dtables_rg <- function(platform = c("hm450k", "epic"), version, ts,
verbose = TRUE, gsmint = 60, overwrite = TRUE, fnstem = "mdat.compilation",
sepval = " ", idatspath = file.path("recount-methylation-files", "idats"),
destpath = file.path("recount-methylation-files", "compilations")){
idatinfo <- dt_checkidat(idatspath = idatspath, verbose = verbose)
hlinkv <- idatinfo[["hlinkv"]]; gsmu <- idatinfo[["gsmu"]]
if(verbose){message("Found ", length(gsmu), " GSM IDs with valid IDATs.")}
gsmii <- getblocks(length(gsmu), gsmint) # get id vectors list
if(verbose){message("Making new data tables...")}
dtinfo <- dt_makefiles(platform = platform, hlinkv = hlinkv,
idatspath = idatspath, destpath = destpath, version = version, ts=ts,
overwrite = overwrite, sepval = sepval, verbose = verbose)
if(verbose){message("Wrote data with ", dtinfo[["num.assays"]], " assays.")}
dtcond <- dtinfo[["dtcond"]]; num.assays = dtinfo[["num.assays"]]
if(dtcond){red.path <- dtinfo[["reds.path"]];grn.path <- dtinfo[["grns.path"]]
if(verbose){message("Appending new data for ", length(gsmii)," chunks...")}
if(platform == "hm450k"){
require(minfiData); data(RGsetEx); rgi <- RGsetEx
} else if(platform %in% c("epic", "epic-hm850k")){
require(minfiDataEPIC); data(RGsetEPIC); rgi <- RGsetEPIC
} else(stop("Error, invalid platform provided."))
tt <- Sys.time(); probesv <- rownames(rgi)
for(i in 1:length(gsmii)){hlinkvi <- hlinkv[gsmii[[i]]]
dt_write_rg(probesv = probesv, hlinkv = hlinkvi, idatspath = idatspath,
reds.path = red.path, grns.path = grn.path, verbose=verbose)
if(verbose){message("Finished chunk ", i , " time: ", Sys.time() - tt)}
}
} else{stop("Problem encountered handling data tables.")}
if(verbose){message("Successfully completed data tables.")}; return(NULL)
}
#' Make and manage data.table files for red and grn signal
#'
#' Handles options for signal tables, returns paths used to write data chunks.
#' @param platform The DNAm array platform of files to read/write (either
#' "hm450k" or "epic").
#' @param hlinkv Vector of GSM IDAT hlinks, or the basenames used to read data.
#' @param idatspath Path to idat files to read
#' @param destpath Destination path to new signal data tables.
#' @param version File version information for file names.
#' @param ts NTP timestamp integer, for filenames (see get_metadata function).
#' @param overwrite Whether to overwrite existing data.table files with same
#' destpath (default TRUE).
#' @param fnstem Filename stem for data tables.
#' @param sepval Separator symbol for tables (" ").
#' @param verbose Whether to return verbose notifications.
#' @return list containing dtcond (try conditions results), and new dt paths
#' (reds.path and grns.path)
#' @export
dt_makefiles <- function(platform = c("hm450k", "epic"), hlinkv, idatspath,
destpath, version, ts, overwrite = TRUE, fnstem = "compilation",
sepval = " ", verbose = TRUE){
reds.fn <- paste("redsignal", platform, "mdat", ts,
gsub("\\.", "-", version), sep="_")
reds.fn <- paste(reds.fn, fnstem, sep = ".")
grns.fn <- paste("greensignal", platform, "mdat", ts,
gsub("\\.", "-", version), sep="_")
grns.fn <- paste(grns.fn, fnstem, sep = ".")
reds.path <- file.path(destpath, reds.fn)
grns.path <- file.path(destpath, grns.fn); cn <- c("gsmi")
rgi <- minfi::read.metharray(c(file.path(idatspath, hlinkv[1:2])),force=FALSE)
if(verbose){message("Getting platform data...")}
if(platform == "hm450k"){
require(minfiData); data(RGsetEx); rgi <- RGsetEx
} else if(platform %in% c("epic", "epic-hm850k")){
require(minfiDataEPIC); data(RGsetEPIC); rgi <- RGsetEPIC
} else{stop("Error, invalid platform provided.")}
rgcni <- colnames(t(minfi::getRed(rgi)));rgcn <- matrix(c(cn, rgcni),nrow=1)
if(overwrite){if(verbose){message("Making/verifying data tables...")}
dt1 <- try(data.table::fwrite(rgcn, reds.path, sep = sepval,
append = FALSE, col.names = F))
dt2 <- try(data.table::fwrite(rgcn, grns.path, sep = sepval,
append = FALSE, col.names = F))
dtcond <- is.null(dt1) & is.null(dt2)
} else{dt1 <- try(file.exists(reds.path)); dt2 <- try(file.exists(grns.path))
dtcond <- dt1 == TRUE & dt1 == TRUE}
lr <- list("dtcond" = dtcond, "reds.path" = reds.path,
"grns.path" = grns.path, "num.assays" = nrow(rgi))
return(lr)
}
#' Validate idats at path, returns gsmu and hlinkv
#'
#' Checks for valid idat file pairs, checks for latest timestamps,
#' files with matching timestamps, and valid red/grn IDAT file pairs,
#' gets valid gsms idats dir.
#'
#' @param idatspath File path to directory containing IDATs.
#' @param verbose Whether to return status messages (boolean, TRUE).
#' @return list containing gsmu and hlinkv
#' @export
dt_checkidat <- function(idatspath, verbose = TRUE){
if(verbose){message("Getting valid GSM IDs from IDAT filenames...")}
idats.lf <- list.files(idatspath)
which.valid1 <- grepl("\\.idat$", substr(idats.lf, nchar(idats.lf) - 4,
nchar(idats.lf))) # idat ext
which.valid2 <- grepl(".*hlink.*", idats.lf) # hlink
which.valid3 <- length(gsub("\\..*", "", gsub(".*\\.", "", idats.lf))) > 0
idats.valid <- idats.lf[which.valid1 & which.valid2 & which.valid3]
gsmu <- gsub("\\..*","",idats.valid);gsmu<-unique(gsmu[grepl("^GSM.*",gsmu)])
if(verbose){message("Finding paired red and grn channel files...")}
gstr<-gsub(".*hlink\\.", "",gsub("(_Red.idat$|_Grn.idat$)", "",idats.valid))
rsub <- idats.valid[grepl(".*_Red.idat$", idats.valid)]
gsub <- idats.valid[grepl(".*_Grn.idat$", idats.valid)]
if(verbose){message("Intersecting .hlink and _Red.idat or _Grn.idat")}
gstr.rg <- intersect(gsub(".*hlink\\.", "", gsub("_Red.idat$", "", rsub)),
gsub(".*hlink\\.", "", gsub("_Grn.idat$", "", gsub)))
idats.validf <- idats.valid[gstr %in% gstr.rg]
gpath <- unique(gsub("(_Red.idat|_Grn.idat)", "", idats.validf))
if(verbose){message("Filtering files on latest timestamps...")}
gpath.ts <- c(); gpath.gid <- c(); gidv <- gsub("\\..*", "", gpath)
tidv <- gsub(".*\\.", "", gsub("\\.hlink.*", "", gpath))
tsu <- unique(tidv); tsu <- tsu[rev(order(tsu))] # iterate on timestamps
for(t in tsu){
gpathf <- gpath[tidv == t & !gidv %in% gpath.gid]
if(verbose){message("Removing redundant GSM IDs in this iteration...")}
gpathf.gid <- gsub("\\..*", "", gpathf)
gpathf <- gpathf[!duplicated(gpathf.gid)]
if(verbose){message("Appending new fn")}; gpath.ts <- c(gpath.ts, gpathf)
if(verbose){message("remaking gid list")}
gpath.gid <- gsub("\\..*", "", gpath.ts)
}; hlinkv <- gpath.ts; message("Performing IDATs file sizes filters...")
msf.hv <- basename(get_idat_sizem(idatspath)[,1])
msf.hv <- unique(gsub("(_Red.idat|_Grn.idat)", "", msf.hv))
hlfinal <- intersect(hlinkv, msf.hv);gsmu <- gsub("\\..*", "", hlfinal)
lr <- list("gsmu" = gsmu, "hlinkv" = hlfinal); return(lr)
}
#' Write red and grn signal data to flat tables.
#'
#' This function takes a filtered list of valid IDAT hlink files and
#' attempts to read their data in chunks with `minfi::read.metharray()`.
#' To read as many samples as possible, the function attempts to read
#' and combine individual valid samples if an error is thrown for a
#' given chunk or sample subset.
#'
#' @param probesv Vector of probe labels.
#' @param hlinkv Vector of GSM IDAT hlink file names, or file basenames.
#' @param idatspath Path to IDAT files directory.
#' @param reds.path Path to new red signal data table.
#' @param grns.path Path to new grn signal data table.
#' @param min.cols Minimum data columns to write DNAm data (integer, )
#' @param sepval Separator symbol for data tables.
#' @param verbose Whether to include verbose messages.
#' @return NULL, writes data chunks as side effect
#' @export
dt_write_rg <- function(probesv, hlinkv, idatspath, reds.path, grns.path,
sepval = " ", verbose = TRUE){
if(verbose){message("Reading data...")};
pathl <- unique(file.path(idatspath, hlinkv))
rgi <- try(minfi::read.metharray(pathl, force = TRUE))
if(!class(rgi) == "RGChannelSet"){
message("Error reading chunk. Reading samples processively...")
rgl <- lapply(pathl, function(x){try(minfi::read.metharray(x))})
rgi <- do.call(cbind, lapply(rgl, function(x){
if(class(x) == "RGChannelSet"){return(x)}}))
rgl.filt <- which(unlist(lapply(rgl,
function(x){class(x) == "RGChannelSet"})))
rgi <- do.call(combine, rgl[rgl.filt])}
if(class(rgi) == "RGChannelSet"){
if(verbose){message("Read signals for ",ncol(rgi)," samples. ",
"Getting signal data..")}
red.dat <- minfi::getRed(rgi); grn.dat <- minfi::getGreen(rgi)
probesv.out <- probesv[!probesv %in% rownames(rgi)]
if(length(probesv.out) > 0){
outdat <- rep(rep("NA", length(probesv.out)), ncol(red.dat))
outm <- matrix(outdat, ncol = ncol(red.dat));rownames(outm) <- probesv.out
red.dat <- rbind(red.dat, outm); grn.dat <- rbind(grn.dat, outm)}
reorder.red <- order(match(rownames(red.dat), probesv))
reorder.grn <- order(match(rownames(grn.dat), probesv))
rdat.order <- red.dat[reorder.red,, drop = FALSE]
gdat.order <- grn.dat[reorder.grn,, drop = FALSE]
cond <- identical(rownames(rdat.order), probesv) &
identical(rownames(gdat.order), probesv)
if(cond){if(verbose){message("Appending new data...")}
redi <- matrix(c(colnames(rgi),t(rdat.order)), ncol = nrow(rdat.order)+1)
grni <- matrix(c(colnames(rgi),t(gdat.order)), ncol = nrow(gdat.order)+1)
data.table::fwrite(redi, reds.path, sep = sepval, append = TRUE)
data.table::fwrite(grni, grns.path, sep = sepval, append = TRUE)
} else{if(verbose){message("Error matching probe vectors. Skipping...")}}
} else{if(verbose){message("Error reading chunk. Skipping...")}};return(NULL)
}
#------------------------------------
# Make and populate the HDF5 database
#------------------------------------
#' Append sample metadata to HDF5 or SE object
#'
#' Add sample metadata table to the HDF5 database.
#' @param dbn Name of H5 dataset, passed from make_h5db().
#' @param mdpath Path to metadata file.
#' @param dsn Name of new entity in HDF5 database.
#' @param verbose Whether to return verbose status updates.
#' @return Adds metadata table and colnames object to HDF5 database
#' @export
h5_addmd = function(dbn, mdpath, dsn = "mdpost", verbose = TRUE){
cnn = paste(dsn, "colnames", sep = ".")
if(verbose){message("Reading in sample metadata from file: ", mdpath)}
mdt <- get(load(mdpath)); if(verbose){message("Formatting metadata...")}
mmf = as.matrix(mdt); class(mmf) = "character"; mmf.colnames = colnames(mmf)
if(verbose){message("Making new entities for HDF5 db...")}
rhdf5::h5createDataset(dbn, dsn, dims = c(nrow(mmf), ncol(mmf)),
maxdims = c(rhdf5::H5Sunlimited(), rhdf5::H5Sunlimited()),
storage.mode = "character",
level = 5, chunk = c(10, 16), size = 256)
rhdf5::h5createDataset(dbn, cnn, dims = length(mmf.colnames),
maxdims = c(rhdf5::H5Sunlimited()),storage.mode = "character",
level = 5, chunk = c(5), size = 256)
if(verbose){message("Populating new HDF5 entities...")}
rhdf5::h5write(mmf, file = dbn, name = dsn,
index = list(1:nrow(mmf), 1:ncol(mmf)))
rhdf5::h5write(mmf.colnames, file = dbn, name = cnn,
index = list(1:length(mmf.colnames)))
if(verbose){message("Finished adding metadata.")}; rhdf5::h5closeAll()
return(NULL)
}
#' Make and populate a new HDF5 database with red and green signal, and metadata
#'
#' @param dbfnstem Stem of filename for h5 database (to which ts and version
#' appended)
#' @param dbpath Database file path to write to.
#' @param version Version of new database file.
#' @param ts Timestamp of new database file.
#' @param platform DNAm array platform for instance.
#' @param fnl Vector of signal tables containing data to be added.
#' @param fnpath Path to dir containing files in fnl.
#' @param dsnl Vector of data set names in HDF5 database to be populated.
#' @param newtables Whether to also add new data tables (noob-norm. Beta-values,
#' meth. and unmeth. signal).
#' @param mdpath If addmd, the path to the metadata file to load.
#' @param ngsm.block Number of GSMs (samples) per process block (default 50,
#' passed to `h5_newtables()`).
#' @param verbose Whether to show verbose status updates.
#' @param dsnl Vector of new h5 data set objects in the h5db object,
#' corresponding (1:1) to the files declared in fnl.
#' @return Populates the HDF5 database
#' @export
make_h5db_rg <- function(dbfnstem, dbpath, version, ts, fnpath,
platform = c("hm450k", "epic"), fnl,
dsnl = c("redsignal", "greensignal"), rmoldh5 = TRUE,
newtables = FALSE, mdpath = NULL, ngsm.block = 50,
verbose = TRUE){
require(rhdf5);fn <- paste(dbfnstem, platform, "h5", "rg",
ts, gsub("\\.", "-", version), sep = "_")
dbn <- file.path(dbpath, paste(fn, "h5", sep = "."))
if(verbose){message("Making new h5 db file: ", dbn)}
suppressMessages(try(rhdf5::h5createFile(dbn), silent = TRUE))
if(rmoldh5){if(verbose){message("Removing old data...")}
for(d in dsnl){suppressMessages(try(rhdf5::h5delete(dbn, d), silent = TRUE))
suppressMessages(try(rhdf5::h5delete(dbn, paste0(d, ".colnames")),
silent = TRUE))
suppressMessages(try(rhdf5::h5delete(dbn, paste0(d, ".rownames")),
silent = TRUE))}}
if(verbose){message("Adding red and green signal data tables to HDF5 db...")}
for(di in seq(length(dsnl))){
h5_add_tables(dsn = dsnl[di], fnl = fnl, platform = platform,
fnpath = fnpath, dbn = dbn, ngsm.block = ngsm.block,
verbose = verbose)}
if(verbose){message("Finished adding red and green channel data.")}
if(!is.null(mdpath)){if(verbose){message("Adding sample metadata to db...")}
h5_addmd(dbn, mdpath, verbose = verbose)
} else{if(verbose){message("No mdpath specified!")}}
rhdf5::h5closeAll();if(verbose){message("Finished all processes. Returning.")}
return(NULL)
}
#' Add red or green signal data tables to an HDF5 database.
#'
#' @param dsn Name of new signal data table.
#' @param fnl Vector of signal tables containing data to be added.
#' @param platform DNAm array platform for instance.
#' @param fnpath Path to dir containing files in fnl.
#' @param dbn Database path to write data.
#' @param ngsm.block Number of GSMs (samples) per process block (default 50)
#' @param verbose Whether to show verbose status updates (TRUE).
#' @return Populates a new HDF5 database with red or green signal data.
#' @seealso make_h5db_rg
#' @export
h5_add_tables <- function(dsn, fnl, platform, fnpath, dbn, ngsm.block,verbose){
if(dsn == "redsignal"){fnread = fnl[grepl("red", fnl)]}else{
fnread <- fnl[grepl("green", fnl)]}
if(verbose){message("for dsn ", dsn, " using fnread ", fnread, "...")}
cnn = paste0(dsn,".colnames"); rnn = paste0(dsn, ".rownames"); rn = cn = c()
if(verbose){message("Getting dims...")}
cmax <- ifelse(platform == "hm450k", 622399,
ifelse(platform %in% c("epic", "epic-hm850k"), 1052641, "NA"))
if(cmax == "NA"){readline(prompt=paste0("Couldn't generate num. assays",
" for platform. Enter cmax ",
"(num. assays):"))}
if(verbose){message("Counting rows...")}
# con <- file(file.path(fnpath, fnread), "r")
# rmax <- length(readLines(con)) - 1; close(con)
lct <- system2("wc", args = c("-l", file.path(fnpath, fnread),
" | awk '{print $1}'"),
stdout = TRUE)[1]
lctv <- unlist(strsplit(lct, " "))
rmax <- as.numeric(lctv[!lctv==""][1]) - 1
if(verbose){message("Making dataset....")}
rhdf5::h5createDataset(dbn, dsn, dims = c(rmax, cmax),
storage.mode = "double", level = 5,
maxdims = c(rhdf5::H5Sunlimited(),
rhdf5::H5Sunlimited()),
chunk = c(100, 5000))
if(verbose){message("Adding colnames...")}
con <- file(file.path(fnpath, fnread), "r")
cn <- unlist(strsplit(readLines(con, n = 1), " ")); cn <- cn[2:length(cn)]
cn <- gsub("\n", "",gsub('\"', '', cn[1:cmax]))
rhdf5::h5createDataset(dbn,cnn,dims=length(cn),storage.mode="character",
maxdims = c(rhdf5::H5Sunlimited()), level = 5, chunk = c(20), size = 256)
rhdf5::h5write(cn, file = dbn, name = cnn, index = list(1:length(cn)))
if(verbose){message("Making rownames dataset....")}
rhdf5::h5createDataset(dbn, rnn, dims = rmax,
maxdims = c(rhdf5::H5Sunlimited()), storage.mode = "character",
level = 5, chunk = c(20), size = 256)
if(verbose){message("Working on sample indices...")}
nri <- getblocks(rmax + 1, ngsm.block) # add 1 for colnames
tt <- Sys.time()
con <- file(file.path(fnpath, fnread), "r") # remake con
for(bi in seq(length(nri))){
ni <- nri[[bi]];start.index <- min(ni);
end.index <- ifelse(bi == 1, max(ni)-1, max(ni))
if(verbose){message("Reading lines...")}
dati <- do.call(rbind,
strsplit(readLines(
con, n = length(ni)), " "))
dati.filt <- dati[, c(2:ncol(dati))]
rn <- dati[,1]
if(bi==1){
# filter colnames for first indices
dati.filt <- dati.filt[c(2:nrow(dati)),]
rn <- rn[c(2:nrow(dati))]
}
class(dati.filt) <- "numeric"
wt <- try(rhdf5::h5write(dati.filt, file = dbn, name = dsn,
index = list(start.index:end.index, 1:cmax)))
if(!class(wt) == "type-error"){
if(verbose){message("Successful table write. Writing new row entries...")}
rhdf5::h5write(rn,file=dbn,name=rnn,index=list(start.index:end.index))
} else{if(verbose){message("Error writing new table data. Skipping...")}}
message("For ds ", dsn,", finished reading index ", start.index,
" to ", end.index, ", time; ", Sys.time() - tt)
};if(verbose){message("Completed writing data for ds, ", dsn)}
rhdf5::h5closeAll();if(verbose){message("Finished adding signal dataset")}
return(NULL)
}
#-----------------------
# Make the SE-H5 objects
#-----------------------
#' Retrieve samples metadata from HDF5 db.
#'
#' Retrieves sample metadata from an HDF5 database.
#'
#' @param dbn Path to HDF5 database file.
#' @param dsn Name or group path to HDF5 dataset containing postprocessed
#' metadata.
#' @return Postprocessed metadata as a `data.frame`.
#' @export
data_mdpost = function(dbn, dsn){
mdp <- as.data.frame(rhdf5::h5read(file = dbn, name = dsn), stringsAsFactors = F)
colnames(mdp) <- rhdf5::h5read(file = dbn, name = paste(dsn, "colnames", sep = "."))
return(mdp)
}
#' Append phenotype data to a SummarizedExperiment object
#'
#' Append phenotype data to a SummarizedExperiment object.
#' Note, this data should be a data.frame with certain data specified by
#' colnames "basename" and "gsm".
#'
#' @param version The data file version
#' @param ts Data file NTP timestamp (integer)
#' @param dbn Name of h5 file to load data matrices from.
#' @param fnstem File name stem for new H5SE file
#' @param dsnv Vector of the data set file names to read from h5 file.
#' @param dsn.md Name of metadata file in h5 file.
#' @param verbose Whether to show verbose status messages (default TRUE).
#' @param addmd Whether to add sample metadata.
#' @param mdpath External path to sample metadata. If FALSE, looks for dsn.md
#' in the dbn h5 file.
#' @param dsn.rnv Vector for rownames datasets for dsnv sets.
#' @param dsn.cnv Vector of colnames datasets for dsnv sets.
#' @param semd H5SE object metadata information.
#' @returns New H5SE object.
#' @export
se_addpheno <- function(phenopath, se, pdat = NULL, verbose = TRUE){
if(is.null(pdat)){
if(verbose){message("Loading pheno data from path...")}
mdp <- get(load(phenopath))} else{mdp <- pdat}
if(verbose){message("Adding pheno data to SE objects..")}
mdp <- mdp[mdp$basename %in% colnames(se),];bnv <- colnames(se)
gsmv <- gsub("\\..*", "", bnv);gsmfilt <- !gsmv %in% mdp$gsm
gsm.ov <- gsmv[gsmfilt];bnv.ov <- bnv[gsmfilt]
nacol <- rep("NA", length(gsm.ov))
mdp.ov <- matrix(c(gsm.ov, rep(nacol, 6), bnv.ov, rep(nacol, 11)),
nrow = length(gsm.ov));colnames(mdp.ov) <- colnames(mdp)
mdp <- rbind(mdp, mdp.ov);mdp <- mdp[order(match(mdp$gsm, gsmv)),]
if(identical(mdp$gsm, gsmv)){
mdp$basename <- colnames(se);rownames(mdp) <- colnames(se)
identical(rownames(mdp), colnames(se))
if(verbose){message("Adding metadata to se...")}
minfi::pData(se) <- S4Vectors::DataFrame(mdp)
return(se)} else{stop("Problem matching GSM IDs in mdp, se...")}
return(NULL)
}
#' make_h5se_rg
#'
#' Make an RGChannelSet HDF5-SummarizedExperiment object from an HDF5 database.
#'
#' @param platform Array platform (either "hm450k" or "epic").
#' @param files.dname Files dir name for instance ("recount-methylation-files")
#' @param comp.dname Compilations dir name, located in files.dname
#' ("compilations").
#' @param version The data file version.
#' @param ts Data file NTP timestamp (character).
#' @param dbn Name/path of HDF5 database file.
#' @param newfnstem File name stem for new database file.
#' @param dsnv Table names vector.
#' @param add.metadata Whether to add samples metadata (boolean, FALSE).
#' @param mdpath Path to metadata pile (if NULL, search the HDF5 database).
#' @param dsn.md Metadata table name, used if add.metadata is TRUE.
#' @param dsn.md.cn Metadata columns object, used if add.metadata is TRUE.
#' @param verbose Whether to show status messages (boolean, TRUE).
#' @param replace.opt Whether to replace/overwrite old H5SE object (TRUE).
#' @param dsn.rnv Names vector of rownames objects in HDF5 database.
#' @param dsn.cnv Names vector of colnames objects in HDF5 database.
#' @param semd Metadata vector for new H5SE object to be written.
#' @returns New h5se object.
#' @export
make_h5se_rg <- function(platform = c("hm450k", "epic"),
files.dname = "recount-methylation-files",
comp.dname = "compilations",
version = "0.0.1", ts = "1590090412",
dbn = "remethdb_1590090412_0-0-1.h5",
newfnstem = "remethdb_h5se-rg",
dsnv = c("redsignal", "greensignal"),
add.metadata=FALSE, mdpath=NULL, dsn.md="mdpost",
dsn.md.cn=paste0(dsn.md,".colnames"),
verbose = TRUE, replace.opt = TRUE,
dsn.rnv = c(paste0(dsnv[1], ".rownames"),
paste0(dsnv[2], ".rownames")),
dsn.cnv = c(paste0(dsnv[1], ".colnames"),
paste0(dsnv[2], ".colnames")),
semd=list("title"=paste0("RGChannelSet HDF5-",
"SummarizedExperiment"),
"preprocessing"="raw")){
newfn <- paste(newfnstem, platform, "h5se", "rg",
ts, gsub("\\.", "-", version), sep = "_")
newfpath <- file.path(files.dname, comp.dname, newfn)
if(verbose){message("Setting annotation info...")}
if(platform == "hm450k"){
anno = c("IlluminaHumanMethylation450k", "ilmn12.hg19")
} else if(platform %in% c("epic", "epic-hm850k")){
anno = c("IlluminaHumanMethylationEPIC", "ilm10b2.hg19")
} else{stop("Error, didn't recognize provided platform.")}
names(anno) = c("array", "annotation")
if(verbose){message("Getting HDF5 datasets...")};ldat <- list()
for(i in 1:length(dsnv)){nb <- HDF5Array::HDF5Array(dbn, dsnv[i])
rn <- rhdf5::h5read(dbn, dsn.rnv[i]);cn <- rhdf5::h5read(dbn, dsn.cnv[i])
nb <- nb[c(1:length(rn)),c(1:length(cn))];rownames(nb)<-as.character(rn)
colnames(nb) <- as.character(cn);nb <- t(nb);ldat[[dsnv[i]]] <- nb
};if(verbose){message("Making the H5SE set...")}
gri <- minfi::RGChannelSet(Red = ldat[[1]], Green = ldat[[2]], anno = anno)
S4Vectors::metadata(gri) <- semd
if(add.metadata){message("Adding metadata...")
if(is.null(mdpath)){
if(dsn.md %in% h5ls(dbn)$name & dsn.md.cn %in% h5ls(dbn)$name){
if(verbose){message("Adding metadata from dbn...")}
mdp <- recountmethylation::data_mdpost(dbn, dsn.md)
gri <- se_addpheno(pdat = mdp, se = gri)
} else{message("Couldn't find md objects in h5 db, skipping..")}
} else{if(verbose){message("Adding metadata from phenopath...")}
gri <- try(se_addpheno(mdpath, se = gri))
}}else{message("Skipping metadata addition step...")}
if(verbose){message("Adding signals data to file ", newfn)};t1 <- Sys.time()
HDF5Array::saveHDF5SummarizedExperiment(gri,dir=newfpath,replace=replace.opt)
if(verbose){message("Data additions complete, time elapsed:",Sys.time()-t1)}
return(NULL)
}
#' Make an HDF5 db file with MethylSet data, from an RGChannelSet
#' h5se file
#'
#' Makes an HDF5 db file with raw/unnormalized methylated and unmethylated
#' signal, including rows and columns as datasets.
#'
#' @param dbn Path to the RGChannelSet H5SE data directory.
#' @param version Version for new h5 file.
#' @param ts Timestamp for new h5 file.
#' @param blocksize Size of blocks/number of samples to process at a time
#' (default 65).
#' @param files.dname Files dir name for instance ("recount-methylation-files")
#' @param comp.dname Compilations dir name, located in files.dname
#' @param platform Array platform (either "hm450k" or "epic").
#' @param newfnstem New filename stem for h5 file.
#' @param verbose Whether to return verbose messages (default TRUE).
#' @param dsv Vector of new datasets to add to h5 HDF5 database.
#' @param replace.opt Whether to replace h5 files with duplicate filename
#' (default TRUE).
#' @returns Path to new HDF5 object.
#' @export
make_h5_gm <- function(dbn, version, ts, blocksize = 65,
files.dname = "recount-methylation-files",
comp.dname = "compilations",
platform = c("hm450k", "epic"),
newfnstem = "remethdb", verbose = TRUE,
dsv = c("meth", "unmeth"), replace.opt = TRUE){
if(platform == "hm450k"){num.assays = 485512
} else if(platform %in% c("epic", "epic-hm850k")){num.assays = 866836
} else{stop("Error, didn't recognize platform.")}
rg <- HDF5Array::loadHDF5SummarizedExperiment(dbn); num.samp <- ncol(rg)
h5dbn <- paste(newfnstem, platform, "h5", "gm",
gsub("\\.", "-", version), ts, sep = "_")
h5dbn <- paste0(c(h5dbn, "h5"), collapse = ".");
h5dbn<-file.path(files.dname,comp.dname,h5dbn);message("Making h5 db ",h5dbn)
if(!file.exists(h5dbn)){rhdf5::h5createFile(h5dbn)}
for(d in dsv){rhdf5::h5createDataset(h5dbn, d,dims=list(num.assays,num.samp),
maxdims = c(rhdf5::H5Sunlimited(), rhdf5::H5Sunlimited()),
storage.mode = "double", level = 5, chunk = c(1000, 50))}
rhdf5::h5createDataset(h5dbn, "rownames", dims = list(num.assays),
maxdims = c(rhdf5::H5Sunlimited()), storage.mode = "character", size = 256,
level = 5, chunk = c(1000))
rhdf5::h5createDataset(h5dbn, "colnames", dims = list(num.samp),
maxdims = c(rhdf5::H5Sunlimited()), storage.mode = "character", size = 256,
level = 5, chunk = c(1000))
if(verbose){message("Getting blocks of sample indices.")}
blocks <- getblocks(ncol(rg), bsize = blocksize)
b<-1;cindices<-blocks[[b]];rgi<-rg[,cindices];gm<-minfi::preprocessRaw(rgi)
meth<-as.matrix(minfi::getMeth(gm));unmeth<-as.matrix(minfi::getUnmeth(gm))
class(meth) <- class(unmeth) <- "numeric"
lindex <- list(1:num.assays, cindices[1]:cindices[length(cindices)])
if(verbose){message("Writing first data blocks...")}
rhdf5::h5write(meth, file = h5dbn, name = "meth", index = lindex)
rhdf5::h5write(unmeth, file = h5dbn, name = "unmeth", index = lindex)
if(verbose){message("Writing rows and cols.")}
rhdf5::h5write(colnames(rg), file = h5dbn, name = "colnames", index = list(1:ncol(rg)))
rhdf5::h5write(rownames(gm), file = h5dbn, name = "rownames", index = list(1:num.assays))
message("Writing next data blocks...");t1 <- Sys.time()
if(length(blocks) > 1){
for(b in 2:length(blocks)){
cindices <- blocks[[b]]; gm <- minfi::preprocessRaw(rg[,cindices])
meth<-as.matrix(minfi::getMeth(gm));unmeth<-as.matrix(minfi::getUnmeth(gm))
rhdf5::h5write(meth, file = h5dbn, name = "meth",
index = list(1:num.assays, cindices))
rhdf5::h5write(unmeth,file = h5dbn, name = "unmeth",
index = list(1:num.assays, cindices))
if(verbose){message("Finished block ",b,", time = ",Sys.time()-t1)}
}
};message("Finished writing data blocks. Returning h5 path.");return(h5dbn)
}
#' Make an HDF5 h5 file with GenomicRatioSet data, from an RGChannelSet
#' h5se file
#'
#' Makes an HDF5 h5 file containing noob-normalized Beta-values, including
#' rows and columns as separate datasets.
#' @param dbn Path to the RGChannelSet H5SE data directory.
#' @param version Version for new h5 file.
#' @param ts Timestamp for new h5 file.
#' @param blocksize Size of blocks/number of samples to process at a time
#' (default 65).
#' @param files.dname Files dir name for instance ("recount-methylation-files")
#' @param comp.dname Compilations dir name, located in files.dname
#' @param platform Array platform (either "hm450k" or "epic").
#' @param newfnstem New filename stem for h5 file.
#' @param verbose Whether to return verbose messages (default TRUE).
#' @param replace.opt Whether to replace h5 files with duplicate filename
#' (default TRUE).
#' @returns Path to new h5 object.
#' @export
make_h5_gr <- function(dbn, version, ts, blocksize = 65,
files.dname = "recount-methylation-files",
comp.dname = "compilations",
verbose = TRUE, platform = c("hm450k", "epic"),
newfnstem = "remethdb", replace.opt = TRUE){
if(platform == "hm450k"){num.assays = 485512
} else if(platform %in% c("epic", "epic-hm850k")){num.assays = 866836
} else{stop("Error, didn't recognize platform.")}
rg <- HDF5Array::loadHDF5SummarizedExperiment(dbn)
num.samp <- ncol(rg); # gr <- preprocessNoob(rg)
h5dbn <- paste(newfnstem, platform, "h5", "gr", ts,
gsub("\\.", "-", version), sep = "_")
h5dbn <- file.path(files.dname, comp.dname, h5dbn)
message("Making h5 db: ", h5dbn);h5dbn <- paste0(c(h5dbn, "h5"),collapse=".")
if(!file.exists(h5dbn)){rhdf5::h5createFile(h5dbn)}
if(verbose){message("Making main noobbeta data table...")}
rhdf5::h5createDataset(h5dbn, "noobbeta", dims = list(num.assays, num.samp),
maxdims = c(rhdf5::H5Sunlimited(), rhdf5::H5Sunlimited()),
storage.mode = "double", level = 5, chunk = c(1000, 50))
if(verbose){message("Making colnames data table...")}
rhdf5::h5createDataset(h5dbn, "colnames", dims = list(num.samp),
maxdims = c(rhdf5::H5Sunlimited()), storage.mode = "character", size = 256,
level = 5, chunk = c(1000))
if(verbose){message("Making, writing rownames data table...")}
rhdf5::h5createDataset(h5dbn, "rownames", dims = list(num.assays),
maxdims = c(rhdf5::H5Sunlimited()), storage.mode = "character", size = 256,
level = 5, chunk = c(1000))
if(verbose){message("Getting blocks of sample indices.")}
blocks <- getblocks(ncol(rg), bsize = blocksize)
cindices <- blocks[[1]]; rgi <- rg[, cindices]
redi<-as.matrix(minfi::getRed(rgi));grni<-as.matrix(minfi::getGreen(rgi))
class(redi) <- class(grni) <- "numeric"
rgii <- minfi::RGChannelSet(Red = redi, Green = grni,
annotation = annotation(rgi))
gmi <- minfi::preprocessNoob(rgii)
nbm <- as.matrix(minfi::getBeta(gmi)); class(nbm) <- "numeric"
if(verbose){message("Writing first data blocks...")}
rhdf5::h5write(nbm, file = h5dbn, name = "noobbeta",
index = list(1:num.assays, cindices))
if(verbose){message("Writing rows and cols.")}
rhdf5::h5write(colnames(rg), file = h5dbn, name = "colnames",
index = list(1:ncol(rg)))
rhdf5::h5write(rownames(gmi), file = h5dbn, name = "rownames",
index = list(1:num.assays))
if(length(blocks) > 1){
message("Writing next data blocks...");t1 <- Sys.time()
for(b in 2:length(blocks)){
cindices <- blocks[[b]]; rgi <- rg[,cindices]
redi<-as.matrix(minfi::getRed(rgi));grni<-as.matrix(minfi::getGreen(rgi))
class(redi) <- class(grni) <- "numeric"
rgii <- minfi::RGChannelSet(Red = redi, Green = grni,
annotation = annotation(rgi))
gmi <- minfi::preprocessNoob(rgii)
nbm <- as.matrix(minfi::getBeta(gmi)); class(nbm) <- "numeric"
rhdf5::h5write(nbm, file = h5dbn, name = "noobbeta",
index = list(1:num.assays, cindices))
if(verbose){message("Finished block ",b,", time = ",Sys.time()-t1)}
}};message("Finished writing data blocks. Returning h5 path.")
return(h5dbn)
}
#' Make an H5SE MethySet object from a MethySet HDF5 database file
#'
#' @param dbn Name of the h5 file to read data from.
#' @param files.dname Files dir name for instance ("recount-methylation-files")
#' @param comp.dname Compilations dir name, located in files.dname
#' @param version Version for new filenames.
#' @param ts NTP timestamp.
#' @param platform Array platform (either "hm450k" or "epic").
#' @param replaceopt Whether to overwrite existing file of same name as new file
#' (default = TRUE).
#' @param verbose Whether to return verbose messages (default TRUE).
#' @param add.metadata Whether to add samples metadata (boolean, FALSE).
#' @param pdata Metadata object obtained from pData(se).
#' @param newdnstem Stem for new h5se object file.
#' @return NULL, generates a new h5se file.
#' @export
make_h5se_gm <- function(dbn, version, ts, platform = c("hm450k", "epic"),
files.dname = "recount-methylation-files",
comp.dname = "compilations", replaceopt = TRUE,
verbose = TRUE, add.metadata = FALSE, pdata = NULL,
newdnstem = "remethdb",
semd=list("title"= paste0("GenomicMethylSet HDF5-",
"SummarizedExperiment"),
"preprocessing"="raw")){
newfn <- paste(newdnstem, platform, "h5se", "gm",
ts, gsub("\\.", "-", version), sep="_")
newfpath <- file.path(files.dname, comp.dname, newfn)
if(verbose){message("Making new H5SE database: ", newfn)}
if(verbose){message("Getting granges...")}
if(platform == "hm450k"){require(minfiData)
ms <- minfi::mapToGenome(get(data("MsetEx")))
} else if(platform %in% c("epic", "epic-hm850k")){
require(minfiDataEPIC); ms <- minfi::mapToGenome(get(data("MsetEPIC")))
} else{stop("Error, didn't recognize platform.")}
anno <- minfi::annotation(ms);gr <- GenomicRanges::granges(ms)
message("Reading datasets.");meth <- HDF5Array::HDF5Array(dbn, "meth")
unmeth <- HDF5Array::HDF5Array(dbn, "unmeth")
cn <- rhdf5::h5read(dbn, "colnames");rn <- rhdf5::h5read(dbn, "rownames")
rownames(meth) <- rownames(unmeth) <- as.character(rn)
colnames(meth) <- colnames(unmeth) <- as.character(cn)
gr <- gr[order(match(names(gr), rownames(meth)))]
icond <- identical(rownames(meth), names(gr))
if(!icond){stop("Problem matching gr names to h5 meth rows")}
icond <- identical(rownames(unmeth), names(gr))
if(!icond){stop("Problem matching gr names to h5 unmeth rows")}
if(add.metadata & !is.null(pdata)){
icond <- identical(colnames(meth), rownames(pdata))
if(!icond){stop("Problem matching pdata rows to h5 meth cols")}
icond <- identical(colnames(unmeth), rownames(pdata))
if(!icond){stop("Problem matching pdata rows to h5 unmeth cols")}
};if(verbose){message("Defining new SummarizedExperiment object...")}
gm <- minfi::GenomicMethylSet(gr = gr, Meth = meth, Unmeth = unmeth,
annotation = anno);S4Vectors::metadata(gm) <- semd
if(add.metadata & !is.null(pdata)){message("Adding metadata...")
if(is.character(pdata)){pdata <- get(load(pdata))};pData(gm) <- pdata}
message("Writing data to new h5se object. This may take awhile.")
HDF5Array::saveHDF5SummarizedExperiment(gm,dir=newfpath,replace=replaceopt)
message("Finished writing h5se data.");return(NULL)
}
#' Make an h5se object from an h5se gr file
#'
#' @param version Version for new filenames.
#' @param ts NTP timestamp.
#' @param dbn Path to HDF5 database file to read data from.
#' @param files.dname Files dir name for instance ("recount-methylation-files")
#' @param comp.dname Compilations dir name, located in files.dname
#' @param pdata Metadata object obtained from pData(se).
#' @param replaceopt Whether to overwrite existing file of same name as new file
#' (default = TRUE).
#' @param verbose Whether to return verbose messages (default TRUE).
#' @param add.metadata Whether to add samples metadata (boolean, FALSE).
#' @param pdata Samples metadata as a DataFrame, path to file, or NULL.
#' @param newdnstem Character stme for new h5se object file
#' ("remethdb_h5se-gr").
#' @return NULL, generates a new h5se file.
#' @export
make_h5se_gr <- function(version, ts, dbn,
files.dname = "recount-methylation-files",
comp.dname = "compilations",
platform = c("hm450k", "epic"), replaceopt = TRUE,
verbose = TRUE, add.metadata = FALSE, pdata = NULL,
newdnstem = "remethdb",
semd=list("title"= paste0("GenomicMethylSet HDF5-",
"SummarizedExperiment"),
"preprocessing"= paste0("Normalization with",
" out-of-band ",
"signal (noob)"))){
newfn <- paste(newdnstem, platform, "h5se", "gr",
ts, gsub("\\.", "-", version), sep = "_")
newfpath <- file.path(files.dname, comp.dname, newfn)
if(verbose){message("Making new H5SE database: ", newfn)}
if(platform == "hm450k"){require(minfiData)
ms <- minfi::mapToGenome(get(data("MsetEx")))
} else if(platform %in% c("epic", "epic-hm850k")){
require(minfiDataEPIC); ms <- minfi::mapToGenome(get(data("MsetEPIC")))
} else{stop("Error, provided platform isn't supported.")}
anno <- minfi::annotation(ms);gr <- GenomicRanges::granges(ms)
message("Reading datasets.");nbeta <- HDF5Array::HDF5Array(dbn, "noobbeta")
cn<-rhdf5::h5read(dbn,"colnames");rn<-rhdf5::h5read(dbn,"rownames")
rownames(nbeta) <- as.character(rn);colnames(nbeta) <- as.character(cn)
gr <- gr[order(match(names(gr), rownames(nbeta)))]
icond <- identical(rownames(nbeta), names(gr))
if(!icond){stop("Problem matching gr names to h5 nbeta rows")}
gmi <- minfi::GenomicRatioSet(gr = gr, Beta = nbeta, annotation = anno)
S4Vectors::metadata(gmi) <- semd # H5SE object metadata
if(add.metadata & !is.null(pdata)){message("Checking and adding metadata...")
icond <- identical(colnames(nbeta), rownames(pdata))
if(!icond){stop("Problem matching pdata rows to h5 nbeta cols")}
if(is.character(pdata)){pdata <- get(load(pdata))};pData(gmi) <- pdata
};message("Writing new data to h5se object. This may take awhile.")
HDF5Array::saveHDF5SummarizedExperiment(gmi,dir=newfpath,replace=replaceopt)
message("Finished writing h5se data.");return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.