# geomx_functions.R
#
# Functions to help manipulate and manage GeoMx related data processing
#' Import GeoMx SampleSheet.csv data
#'
#' Import GeoMx SampleSheet.csv data
#'
#' This function can import `"SampleSheet.csv"` and `"GNP_config.ini"`
#' files, which are characterized as follows:
#'
#' * Each subset of data is preceded by a header line: `[header_name]`
#' * Data following this line is comma-delimited, or delimited with
#' `" = "`, both of which are treated as equivalent.
#' * There is a blank line between the subset of data and the next header.
#'
#' Some rules with `return_type="dflist"`:
#'
#' * Delimiters are recognized as `" = "` or `","`, but because the
#' import process calls `data.table::fread()` it will probably also
#' accept tab-delimited data.
#' * When the first row following the header appears to have column names,
#' they are used as-is as column names.
#' * The following criteria cause the first row NOT to be used as
#' column header:
#'
#' * The first entry does not contain `"Sample_ID"`, and any of:
#' * Any value following a comma begins with a number, or
#' * Any value following a comma is `"true"` or `"false"`, or
#' * Any value following a comma is purely DNA sequence `"[ATGC]+"`, or
#' * There is no `","` delimiter, or
#' * There is only one value in the subset of data.
#'
#' Therefore, when the first entry begins with `"Sample_ID"` the
#' first entry is used as column header.
#' * When the first entry is not used as column headers, the heading name
#' itself is used as the first column name, followed by `V` concatenated
#' to the integer column number, for example: `"Sequencing", "V2", "V3", "V4"`
#'
#' ## Todo:
#'
#' * Add `return_type` option to create commands to rename demux output
#' files to the expected GeoMx Sample_ID format.
#'
#' @family jam GeoMx functions
#'
#' @returns `list` or `character` vector, consistent with `return_type`.
#'
#' @param x `character` path to SampleSheet.csv formatted GeoMx file.
#' @param return_type `character` string to define the return type:
#' * `"dflist"` - `list` of `data.frame` for each heading (default)
#' * `"list"` - `list` of lines as-is.
#' * `"indices"` - `character` vector of expected indices, in format
#' `"Index+Index2"`.
#' * `"filenames"` - `data.frame` describing the expected output filenames
#' after running `demuxbyname.sh` (BBTools), and the expected
#' GeoMx filename using the Sample_ID, suitable to rename one file
#' to the other.
#' @param do_revcomp `logical` indicating whether to use reverse complement
#' for Index2, default=TRUE.
#' @param demux_prefix `character` prefix assigned when running
#' `demuxbyname.sh` (BBTools). It is usually in the form
#' `"project_hist1"` where `"hist1"` refers to the Hamming distance
#' threshold `"hdist"` used with `demuxbyname.sh`.
#' This prefix therefore is used to match the filename produced by
#' `demuxbyname.sh`, which is then renamed to the NGS filename.
#' @param demux_sheetnumber `character` used when `return_type="filenames"`
#' in formulating the correct GeoMx input filename for the NGS pipeline.
#' This value therefore affects the filename after renaming the demux
#' file, to become the NGS input filename.
#' @param ... additional arguments are ignored.
#'
#' @examples
#' samplefile <- system.file("data", "SampleSheet.csv", package="platjam")
#' samplelist <- import_geomx_samplesheet(samplefile)
#' lengths(samplelist)
#'
#' @export
import_geomx_samplesheet <- function
(x="SampleSheet.csv",
return_type=c("dflist",
"list",
"indices",
"filenames"),
do_revcomp=TRUE,
demux_sheetnumber="S1",
demux_prefix="BK-Gq-1_hdist1_",
...)
{
#
return_type <- match.arg(return_type)
#
if (length(x) == 0) {
stop("x must be the path to a GeoMx file, which was empty.")
}
x <- head(x, 1);
if (!file.exists(x)) {
stop("x must be the path to GeoMx file, file not found.")
}
samplelines <- readLines(x)
hlines <- grep("^[[]", samplelines)
hnames <- gsub(",", "", gsub("[[]|]", "", samplelines[hlines]))
samplelines_hdf <- data.frame(check.names=FALSE,
row.names=hnames,
header=hnames,
a1=hlines + 1,
a2=c(tail(hlines - 1, -1), length(samplelines)))
samplelist <- lapply(jamba::nameVector(samplelines_hdf$header), function(i){
a1 <- samplelines_hdf[i, "a1"];
a2 <- samplelines_hdf[i, "a2"];
v <- samplelines[seq(from=a1, to=a2)];
v <- v[!v %in% c("", NA)];
if (all(grepl("=", v))) {
v <- gsub(" = ", ",", v)
}
if (!"list" %in% return_type) {
if (!grepl("^Sample_ID($|,)", v[1]) && (
length(v) == 1 ||
grepl(",([-.0-9]|(true|false)($|,))", v[1]) ||
grepl(",[ATGC]+($|,)", v[1]) ||
!grepl(",", v[1]))) {
vdf <- data.table::fread(
header=FALSE,
text=v,
data.table=FALSE)
colnames(vdf) <- paste0("V", seq_len(ncol(vdf)));
colnames(vdf)[1] <- i;
vdf;
} else {
vdf <- data.table::fread(
text=v,
data.table=FALSE)
vdf;
}
} else {
v;
}
})
if ("BCLConvert_Data" %in% names(samplelist) &&
!"list" %in% return_type) {
index_colname <- head(jamba::vigrep("index$",
colnames(samplelist$BCLConvert_Data)), 1)
index2_colname <- head(jamba::vigrep("index2$",
colnames(samplelist$BCLConvert_Data)), 1)
if (length(index_colname) == 0 || length(index2_colname) == 0) {
stop("BCLConvert_Data must contain colnames 'Index' and 'Index2'")
}
use_index2 <- index2_colname;
if (TRUE %in% do_revcomp) {
index2_revcomp_colname <- paste0(index2_colname, "_revcomp");
samplelist$BCLConvert_Data[[index2_revcomp_colname]] <- revcomp(
samplelist$BCLConvert_Data[[index2_colname]])
use_index2 <- index2_revcomp_colname;
}
if ("indices" %in% return_type) {
indices <- paste0(
samplelist$BCLConvert_Data[[index_colname]],
"+",
samplelist$BCLConvert_Data[[use_index2]])
return(indices)
} else if ("filenames" %in% return_type) {
#
bcldf <- samplelist$BCLConvert_Data;
demux_base <- paste0(demux_prefix,
bcldf[[index_colname]],
"+",
bcldf[[use_index2]])
demux_files <- paste0(rep(demux_base, each=2),
"_", c(1, 2),
".fq.gz")
demux_samplefile <- paste0(
rep(bcldf[["Sample_ID"]], each=2),
"_", demux_sheetnumber,
"_L001_R", c(1, 2),
"_001.fastq.gz")
bcldf2 <- data.frame(check.names=FALSE,
bcldf,
file_from=matrix(demux_files, ncol=2, byrow=TRUE),
file_to=matrix(demux_samplefile, ncol=2, byrow=TRUE))
return(bcldf2)
}
} else if (any(c("filenames", "indices") %in% return_type)) {
stop(paste0(
"BCLConvert_Data heading was not found, ",
"required for return_type 'indices' or 'filenames'"))
}
return(samplelist)
}
#' Simple DNA reverse-complement function
#'
#' @family jam GeoMx functions
#'
#' @returns `character` string with reverse-complemented sequences.
#'
#' @param x `character` string with DNA or RNA sequence, expected to
#' contain only A,T,G,C,N, and possibly U.
#' @param molecule `character` string with `"DNA"` (default) or `"RNA"`.
#' Input recognizes either T or U, but will only produce T for DNA,
#' and U for RNA.
#'
#' @examples
#' dna_in <- "GTTACTCT";
#' revcomp(dna_in)
#'
#' @export
revcomp <- function
(x,
molecule=c("DNA",
"RNA"),
...)
{
molecule <- match.arg(molecule);
# operate only on unique elements
xu <- unique(x);
xs <- strsplit(xu, "")
# convert to complementary base
from_to <- c(A="T", U="A", T="A", G="C", C="G", N="N")
if ("RNA" %in% molecule) {
from_to <- c(A="U", U="A", T="A", G="C", C="G", N="N")
}
xu_revcomp <- jamba::cPaste(sep="", lapply(xs, function(ix){
rev(unname(from_to[toupper(ix)]))
}))
xu_revcomp[match(x, xu)]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.