#' Mean data file generator for meta plot.
#'
#' @author Mark Boltengagen \email{m.boltengagen@@gmail.com}
#'
#' @param datdir Directory containing dat files generated by meta.dat.
#' function
#' @param table Data table containing list of names in one column and a column
#' of sorted values. Must be set if split by either quantiles or limits.
#' @param outdir Output directory.
#' @param file_name Name of output file (optional).
#' @param split Argument showing how to split: 'no' - take all data (default);
#' 'quantiles' - split by quantiles q1-q4; 'limits' - split by given vector of
#' values.
#' @param limits A vector of values to split, e.g. c(0, 25, 50, 75, 100)
#' splits into for groups 0-25, 25-50, 50-75, 75-100, where 0 is a minimal
#' value and 100 is maximal.
#'
#' @seealso \code{\link{meta.data}}
#'
#' @export
meta.mean <- function(datdir,
outdir,
file_name="Meandata",
split=c('no', 'limits', 'quantiles', 'groups', 'factor'),
table=NULL,
val.name=NULL,
limits=NULL,
ngr=4) {
split <- match.arg(split)
if (split == 'limits' & is.null(limits)) stop("Specify \"limits\" as a vector of values.")
if (any(split == c('limits', 'quantiles', 'groups', 'factor'))) {
if (is.null(table)) stop("Specify \"table\" file with features ~ values.")
dt <- MNuc::read.data.table(file=table, val.name=val.name) }
if (split == 'no') {
win <- 1
selected_names <- NULL
suffix <- '_All_'
} else if (split == 'limits') {
win <- length(limits)-1
suffix <- '_L_'
} else if (split == 'quantiles') {
limits <- as.numeric(quantile(as.numeric(unlist(dt[,2]))))
win <- 4
suffix <- '_Q_'
} else if (split == 'groups') {
win <- ngr
llim <- 1
rlim <- range <- ceiling(nrow(dt)/ngr)
suffix <- '_G_'
} else if (split == 'factor') {
f <- as.factor(dt[[val.name]])
win <- length(levels(f))
suffix <- '_F_'
}
for (i in 1:win) {
if (any(split == c('limits', 'quantiles'))) {
llim <- limits[i]
rlim <- limits[i+1]
selected_names <- dt[get(names(dt)[2]) >= llim & get(names(dt)[2]) < rlim][,get(names(dt)[1])]
} else if (split == 'groups') {
selected_names <- dt[,get(names(dt)[1])[llim:rlim]]
llim <- llim + range
rlim <- rlim + range
} else if (split == 'factor') {
selected_names <- dt[get(names(dt)[2]) == levels(f)[i]][,get(names(dt)[1])]
}
file=paste(file_name, suffix, i, sep="")
MNuc::write.mean.dat(datdir=datdir, selected_names=selected_names, outdir=outdir, file_name=file)
}
}
###############################################################################
#' Data table from text file
#'
#' @details Read text file containig feature name in the first column and named
#' columns of values. Function selects the column with names and the column with
#' val.name, and return data.table sorted by val.name column (values).
#'
#' @param file txt file features in the first column ~ named corresponding values in columns
#' @param val.name Value name we would like to extract
#'
#' @return Data table with two columns (feature_name ~ corresponding_value)
#'
#' @export
read.data.table <- function(file, val.name) {
# Check if file is an 'gz'
file = ifelse(tail(unlist(strsplit(file, "[.]")), n=1)=='gz', paste0("zcat <'", file,"'"), file)
# Read text table as data.table
dt <- fread(file, header=TRUE, sep="\t", na.strings="NA")
# Assume the first column in the table is a feature name
feature.name <- names(dt)[1]
# Select feature_name ~ feature_value columns
dt.selected <- dt[, c(feature.name, val.name), with=FALSE]
# Sort by values
sorted_dt <- dt.selected[order(dt.selected[[val.name]])]
return(sorted_dt)
}
###############################################################################
#' Write mean dat file.
#'
#' @author Mark Boltengagen \email{m.boltengagen@@gmail.com}
#'
#' @param datdir Directory containing dat files generated by meta.dat.
#' function
#' @param selected_names Vector of names wich will be used to calculate mean values.
#' @param outdir Output directory.
#' @param file_name Name of output file.
#'
#' @export
write.mean.dat <- function(datdir, selected_names=NULL, outdir, file_name) {
if (is.null(datdir)) stop("Specify \"datdir\" directory containing dat files")
files <- list.files(path=datdir, pattern="\\.dat.gz$", full.names=T, recursive=FALSE)
## GET/CHECK/SET COORDINATES
coord <- MNuc::check.coord(datdir=datdir)
up <- coord$upstream
dwn <- coord$downstream
DT <- data.table(Coordinates = up:dwn)
dir.create(outdir, recursive=TRUE, showWarnings = FALSE)
for (i in 1:length(files)) {
## Read gz file as data.table
dataset <- fread(paste0("zcat <'", files[i],"'"), header=TRUE, sep="\t", na.strings="NA")
if (is.null(selected_names)) {
# Vector containing mean values across all rows in dataset (except of the first column - Coordinates)
mean.vec <- dataset[, -1, with=FALSE][, .(Mean = rowMeans(.SD, na.rm=TRUE))]
} else {
## Use list of names to subset data table by colnames and keep vec with average score across filtered genes
mean.vec <- dataset[ ,colnames(dataset) %in% selected_names, with=FALSE][, .(Mean = rowMeans(.SD, na.rm=TRUE))]
}
## Name of new column
newcolname <- tools::file_path_sans_ext(basename(files[i]))
## Add column with average score vec to the glDT
DT[, eval(newcolname) := mean.vec]
}
## Save data.table as gz. file
filename=paste(outdir, file_name, ".mean.dat", sep="")
write.table(DT, file=filename, quote=FALSE, append=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
system2("gzip", args=filename)
}
###############################################################################
#' Check coordinates in dat files.
#'
#' @details Check if coordinates in all dat files in a directory are identical. Return values: "upstream", "downstream" and "length".
#' @author Mark Boltengagen \email{m.boltengagen@@gmail.com}
#'
#' @param dat Directory containing dat files.
#'
#' @return List of integers: \item{upstream}{description upstream}, \item{downstream}{description downstream}, \item{length}{description length}
#'
#' @examples
#' coord <- check.coord(datdir="/directory/containing/dat_files")
#' upstream_value <- coord$upstream
#' downstream_value <- coord$downstream
#' length_value <- coord$lenght
#'
#' @export
check.coord <- function(datdir) {
# GET/SET COORDINATES
# list of coordinates (first columns) from all dat files
files <- list.files(path=datdir, pattern="\\.dat.gz$", full.names=T, recursive=FALSE)
coordinates <- lapply(1:length(files), function(f) {
df <- fread(paste0("zcat <'", files[f],"'"), header=TRUE, sep="\t", na.strings="NA")
df <- df[, 1]
})
# check min, max and length of coordinates
mins <- unlist(lapply(coordinates,min))
maxs <- unlist(lapply(coordinates,max))
lens <- unlist(lapply(coordinates,nrow))
if (var(mins)!=0 | var(maxs)!=0 | var(lens)!=0) stop("Coordinates in dat files must be identical.")
# start and end position
output <- list(mins[1], maxs[1], lens[1])
names(output) <- c("upstream", "downstream", "length")
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.