R/utilities.R

Defines functions do_h5se_merge rmp_handle_platform rmp_handle_metadata match1to2 appendvar get_filt get_pstr_neg get_pstr get_pstr_neg get_eqfpath getblocks get_metadata

Documented in appendvar do_h5se_merge getblocks get_eqfpath get_filt get_metadata get_pstr get_pstr_neg match1to2 rmp_handle_metadata rmp_handle_platform

#!/usr/bin/env R

# Author: Sean Maden
# Utilities for rmpipeline and snakemake workflow.

#------------------
# instance metadata
#------------------
#' Append metadata to HDF5 or SE object
#'
#' Passes version info and timestamp from Python to object metadata
#' @param title Object title
#' @param version Numeric version to be passed, should conform to ##.##.## 
#' nomenclature
#' @param ts Timestamp. If NULL, get new timestamp using Python function.
#' @param pname Name of pipeline package (default: "rmpipeline")
#' @param sname Name of Python script (default: "get_timestamp.y")
#' @return Metadata content for the object
#' @export
get_metadata <- function(title, version, ts = NULL, pname = "rmpipeline",
                         sname = "get_timestamp.py"){
  mdl <- list(title = title, version = version)
  path <- paste(system.file(package = pname), sname, sep="/")
  if(is.null(ts)){
    # ts <- system(paste("python", path, sep = " "),intern = TRUE, wait = FALSE)}
    ts <- gsub("\\..*", "", as.character(as.numeric(Sys.time())))
  }
  mdl[["timestamp"]] <- ts; return(mdl)
}

#---------------------
# sample block indices
#---------------------

#' Get list of index blocks
#'
#' Get list of index blocks, allowing for remainders.
#'
#' @param slength Total length of index vector
#' @param bsize Size of index blocks along length
#' @return List of index blocks of min length `slength`/`bsize`
#' @export
getblocks <- function(slength, bsize){
  iv <- list()
  if(slength < bsize){
    iv[[1]] <- seq(1, slength, 1)
  } else{
    sc <- 1; ec <- sc + bsize - 1
    nblocks <- slength %/% bsize
    for(b in 1:nblocks){
      iv[[b]] <- seq(sc, ec, 1)
      sc <- ec + 1; ec <- ec + bsize
    }
    # add final indices
    if(nblocks < (slength/bsize)){
      iv[[length(iv) + 1]] <- seq(sc, slength, 1)
    }
  }
  return(iv)
}

#-------------------------------
# get path to edirect query file
#-------------------------------

#' Get an edirect query file path
#'
#' Lookup and return an edirect query file matching pattern `fn.str`.
#'
#'@param fn.str File name string matching the queried file.
#'@param eqdpath Path to directory containing the equery files
#'@return Path to the queried edirect file. 
#'@export
get_eqfpath <- function(fn.str = "gsequery_filt.*",
  eqdpath = file.path("recount-methylation-files", "equery")){
  message("Getting files at eqdpath ", eqdpath, "...")
  eqfl <- list.files(eqdpath);eqfn <- eqfl[grepl(fn.str, eqfl)]
  if(length(eqfn) > 1){
    eqts <- gsub(".*\\.", "", eqfn); eqfn <- eqfn[which(eqts == max(eqts))]
  };eqfpath <- file.path(eqdpath, eqfn);return(eqfpath)
}

#--------------
# Regex helpers
#--------------
# Helper functions for regular expression term matching and mapping

#' Get negative regex string mapping
#'
#' Do negative regex term lookup using patterns mapped by pstr().
#' 
#' @param pstr The mapped regex patterns output from a pstr() lookup.
#' @returns Mapped negative lookup strings.
#' @seealso pstr, get_filt, appendvar
#' @export
get_pstr_neg <- function(pstr){
  pstrg <- gsub("\\.\\*", "_", pstr);pstrg <- gsub("\\|", "", pstrg)
  uv <- unlist(strsplit(pstrg, "_"));uv <- uv[!uv == ""]
  for (ui in 1:length(uv)) {
    s <- uv[ui]
    if(ui == 1){
      ns <- paste(paste0(".*non-", s, ".*"), 
                  paste0(".*Non-", s, ".*"), 
                  paste0(".*non ", s, ".*"), 
                  paste0(".*Non ", s, ".*"), 
                  paste0(".*not ", s, ".*"), 
                  paste0(".*Not ", s, ".*"),
                  paste0(".*Never ", s, ".*"), 
                  paste0(".*never ", s, ".*"), 
                  paste0(".*", s, "-free.*"), 
                  paste0(".*", s, "-Free.*"), 
                  paste0(".*", s, " free.*"), 
                  paste0(".*", s, " Free.*"), 
                  sep = "|")
    } else{
      ns <- paste(ns, 
                  paste0(".*non-", s, ".*"), 
                  paste0(".*Non-", s, ".*"), 
                  paste0(".*non ", s, ".*"), 
                  paste0(".*Non ", s, ".*"), 
                  paste0(".*not ", s, ".*"), 
                  paste0(".*Not ", s, ".*"),
                  paste0(".*Never ", s, ".*"), 
                  paste0(".*never ", s, ".*"),
                  paste0(".*", s, "-free.*"), 
                  paste0(".*", s, "-Free.*"), 
                  paste0(".*", s, " free.*"), 
                  paste0(".*", s, " Free.*"), 
                  sep = "|")
    }
  }
  return(ns)
}

#' Get regex patterns for a string, or the affirmative match for the 
#' given string
#'
#' Gets the regex pattern matching the affirmative of a given string `v`.
#' Does progressive capitalization on values separated by spaces
#' for each value in v, appends flanking '.*' (matches any char)
#' for each value in v, appends "|" OR conditional separator
#'
#' @param v Character string or vector of such character strings. 
#' @returns Regex patterns for detecting matching strings from metadata.
#' @seealso get_pstr_neg, get_filt, appendvar
#' @export
get_pstr <- function(v){
  rs <- ""
  for(ci in seq(length(v))){
    c <- v[ci]
    if(ci == 1){rs = paste0(".*", c, ".*")} else{
      rs = paste(c(rs, paste0(".*", c, ".*")), collapse = "|")
    }
    uv <- unlist(strsplit(c, " ")) # num space-sep units
    # for each unit use lower- and uppercase
    uvstr <- c(paste(uv, collapse = " ")); uvl <- list(uv)
    for(i in seq(length(uv))){
      uvi <- c()
      for(ui in seq(length(uv))){
        chari <- uv[ui]
        if(ui <= i){
          if(nchar(chari)>1){
            ssi <- paste0(toupper(substr(chari, 1, 1)),
                          substr(chari, 2, nchar(chari)))
          } else{
            ssi <- paste0(toupper(substr(chari, 1, 1)))
          }
        }
        else{ssi <- chari}
        uvi <- c(uvi, ssi)
      }; uvl[[length(uvl)+1]] <- uvi
    }
    # append to new str
    for(si in 1:length(uvl)){
      s <- uvl[[si]]
      if(length(uv) > 1){
        if(!si==1){
          # space sep
          rs <- paste(c(rs, paste0(".*", paste(s, collapse = " "), ".*")), collapse = "|")
        }
        # underline sep
        rs <- paste(c(rs, paste0(".*", paste(s, collapse = "_"), ".*")), collapse = "|")
        # dash sep
        rs <- paste(c(rs, paste0(".*", paste(s, collapse = "-"), ".*")), collapse = "|")
      } else{
        if(!si==1){
          rs <- paste(c(rs, paste0(".*", s, ".*")), collapse = "|")
        }
      }
    }
  }
  return(rs)
}

#' Get the negation regex patterns for a given regex pattern
#'
#' Does progressive capitalization on values separated by spaces
#' for each value in v, appends flanking '.*' (matches any char)
#' for each value in v, appends "|" OR conditional separator
#'
#' @param pstr Regex patterns for matching, or output from `get_pstr()`.
#' @returns Regex for negation of indicated patterns.
#' @seealso get_pstr, get_filt, appendvar
#' @export
get_pstr_neg <- function(pstr){
  pstrg = gsub("\\.\\*", "_", pstr); pstrg = gsub("\\|", "", pstrg)
  uv = unlist(strsplit(pstrg, "_")); uv = uv[!uv==""]
  for(ui in 1:length(uv)){
    s = uv[ui]
    if(ui == 1){
      ns = paste(paste0(".*non-", s, ".*"), paste0(".*Non-", s, ".*"), paste0(".*non ", s, ".*"),
                 paste0(".*Non ", s, ".*"), paste0(".*not ", s, ".*"), paste0(".*Not ", s, ".*"),
                 paste0(".*", s, "-free.*"), paste0(".*", s, "-Free.*"), paste0(".*", s, " free.*"),
                 paste0(".*", s, " Free.*"), sep = "|")
    } else{
      ns = paste(ns, paste0(".*non-", s, ".*"), paste0(".*Non-", s, ".*"), paste0(".*non ", s, ".*"),
                 paste0(".*Non ", s, ".*"), paste0(".*not ", s, ".*"), paste0(".*Not ", s, ".*"),
                 paste0(".*", s, "-free.*"), paste0(".*", s, "-Free.*"), paste0(".*", s, " free.*"),
                 paste0(".*", s, " Free.*"), sep = "|")
    }
  }
  return(ns)
}

#' Get the outcome of pattern matching a string with regex patterns
#'
#' Does pattern matching for regex patterns constructed from a character
#' string or vector of such character strings. 
#'
#' @param v Character string or vector of such strings.
#' @param m Preprocessed metadata used for pattern matching/lookups (data.frame, mdpre).
#' @param filtrel Logical symbol joining each regex pattern (default "|").
#' @param ntfilt Regex pattern corresponding to negative lookup filter (default NULL).
#' @param ptfilt Regex pattern corresponding to positive lookup filter (default NULL).
#' @returns The result of assessing a regex match on a metadata variable.
#' @seealso appendvar
#' @export
get_filt <- function(v, m = mdpre, filtrel = "|", ntfilt = NULL, ptfilt = NULL,
                     varl = c("gsm_title", "sample_type", "disease_state", 
                              "anatomic_location", "misc")){
  # positive lookup filter
  filtl <- grepl(v, m[,varl[1]])
  if(!is.null(ptfilt)){
    message("Using positive lookup filter with ptfilt: ", 
            paste(ptfilt, collapse = "; "))
    filtl <- filtl & grepl(get_pstr(ptfilt), m[,varl[1]])
  }
  # negative lookup filter
  if(!is.null(ntfilt)){
    message("Using negative lookup filter with ntfilt: ", 
            paste(ntfilt, collapse = "; "))
    nfiltv <- get_pstr_neg(ntfilt)
    filtl <- filtl & !grepl(nfiltv, m[,varl[1]])
  }
  # proceed if additional vars specified
  if(length(varl)>1){
    if(!filtrel %in% c("|", "&")){
      message("Please provide a valid filter relation symbol.")
      return(NULL)
    } else{
      for(vi in varl[2:length(varl)]){
        if(filtrel == "|"){
          filtl <- filtl | grepl(v, m[,vi])
          if(!is.null(ntfilt)){filtl <- filtl & !grepl(nfiltv, m[,vi])}
        } else if(filtrel == "&"){
          filtl <- filtl & grepl(v, m[,vi])
          if(!is.null(ntfilt)){filtl <- filtl & !grepl(nfiltv, m[,vi])}
        }
      }
    }
  }
  return(filtl)
}

#' Append mapped terms to a metadata variable
#'
#' Appends new variable data to a metadata variable, preserving current
#' variable terms.
#'
#' @param var Variable in m for which to append new terms.
#' @param val Character string to append to matched samples.
#' @param filtv Vector of boolean values identifying samples for which to
#'  append the new term.
#' @param m The postprocessed metadata for which to append the new terms
#'  (data.frame, mdpost).
#' @returns The result of appending new terms to specified var.
#' @seealso get_filt, get_pstr_neg, get_pstr
#' @export
appendvar <- function(var, val, filtv, m = mdpost){
  varr <- m[, var]
  # get composite filter
  filti <- !grepl(paste0("(^|;)", val, "(;|$)"), varr)
  compfilt <- filti & filtv
  # assess filter results
  if(length(compfilt[compfilt]) == 0){
    message("No unique values to append. Returning var unchanged.")
    return(varr)
  } else{
    varr[compfilt] <- ifelse(varr[compfilt] == "NA", val,
                             paste(varr[compfilt], val, sep = ";")
    )
    message("Appended n = ", length(varr[compfilt]), " values")
    return(varr)
  }
  return(NULL)
}

#' Match and combine 2 datasets
#'
#' Orders 2 datasets on corresponding variables, then appends an NA
#' matrix as needed before binding on columns. This is mainly used to 
#' combine different types of sample metadata on common/shared GSM IDs.
#' Note columns should be matchable after calling `as.character()` for
#' each.
#'
#' @param d1 First set to combine (matrix).
#' @param d2 Second set to combine (matrix).
#' @param ci1 Column index in first dataset to match on
#' @param ci2 Column index in second dataset to match on
#' @returns Union of the 2 datasets, including NA values where appropriate.
#' @export
match1to2 <- function(d1, d2, ci1 = 2, ci2 = 1){
  id.all <- unique(c(d1[, ci1], d2[, ci2]))
  id1 <- id.all[!id.all %in% d1[, ci1]]
  id2 <- id.all[!id.all %in% d2[, ci2]]
  # append na slices as necessary
  if(length(id1) > 0){
    nav <- rep(rep("NA", length(id1)), ncol(d1) - 1)
    mna <- matrix(c(id1, nav), nrow = length(id1), ncol = ncol(d1))
    d1 <- rbind(d1, mna)
  }
  if(length(id2) > 0){
    nav <- rep(rep("NA", length(id2)), ncol(d2) - 1)
    mna <- matrix(c(id2, nav), nrow = length(id2), ncol = ncol(d2))
    d2 <- rbind(d2, mna)
  }
  # reorder and assign title var
  match.id1 <- match(as.character(d1[, ci1]), as.character(d2[, ci2]))
  order.id1 <- order(match.id1); d1 <- d1[order.id1,]
  match.id2 <- match(as.character(d2[, ci2]), as.character(d1[, ci1]))
  order.id2 <- order(match.id2); d2 <- d2[order.id2,]
  cond <- identical(as.character(d2[, ci2]), as.character(d1[, ci1]))
  if(cond){return(cbind(d1, d2))} else{
    stop("there was an issue matching the final datasets...")
  }
  return(NULL)
}

#------------------------
# handle instance options
#------------------------

#' Get instance metadata info
#'
#' Looks up and loads previously generated instance metadata 
#' (e.g. using rule `new_instance_md`), which includes the version and 
#' timestamp for the `recountmethylation` instance.
#'
#' @param files.dname Instance files dir name ("recount-methylation-files").
#' @param md.dname Metadata dir name, located in files.dname ("metadata").
#' @return Metadata object containing the instance version and timestamp
#' @export
rmp_handle_metadata <- function(files.dname = "recount-methylation-files",
                                md.dname = "metadata"){
  md.dpath <- file.path(files.dname, md.dname)
  if(!dir.exists(md.dpath)){stop("Couldn't find dir ", md.dpath,"...")}
  lfmd <- list.files(md.dpath); lfmd <- lfmd[grepl("metadata", lfmd)]
  if(length(lfmd) > 0){
    ts <- as.numeric(gsub(".*_|\\.rda", "", lfmd)); ts.max <- max(ts)
    which.md <- which(ts == ts.max)[1]
    md.fpath <- file.path(md.dpath, lfmd[which.md])
    message("Using metadata at ", md.fpath);md <- get(load(md.fpath))
    return(md)
  } else{
    stop("Couldn't find metadata for this instance.", 
         " Try running the `new_inst_md` rule first.")}
  return(NULL)
}
#' Get instance platform info
#'
#' Retrieves the previously established instance platform info (e.g. using 
#' rule `set_acc`) from the settings script `settings.py` .
#'
#' @param settings.fname Name of the settings script containing platform info.
#' @param src.path Path the `src` dir in the `recountmethylation_server` repo.
#' @return Platform info as list contianing the accession ID and name.
#' @export
rmp_handle_platform <- function(settings.fname = "settings.py",
                                src.path=file.path("recountmethylation_server",
                                                     "src")){
  settings.fpath = file.path(src.path, settings.fname)
  if(!file.exists(settings.fpath)){stop("Couldn't find ",settings.fn)}
  accid <- gsub(" |.* '|'", "", readLines(settings.fpath, n = 25)[25])
  pname <- ifelse(accid == "GPL13534", "hm450k",
                  ifelse(accid == "GPL21145", "epic-hm850k",
                         ifelse(accid == "GPL8490", "hm27k", "NA")))
  return(list(accid = accid, platform_name = pname))
}

#-------------------
# merge h5se objects
#-------------------

#' Merge 2 h5se objects with all pdata columns
#'
#' Performs a merge on 2 h5se objects, retaining all unique pdata columns
#' between them.
#'
#' @param h5se1.fpath Path to the first h5se object.
#' @param h5se2.fpath Path to the second h5se object.
#' @param new.h5se.dpath Path to the directory to contain the merged h5se object.
#' @param h5se.merge.fname Name of the new merged h5se object.
#' @param replace Whether to replace existing merged h5se object at path.
#' @return Null, saves new merged h5se object.
#' @export
do_h5se_merge <- function(h5se1.fpath, h5se2.fpath, new.h5se.dpath, 
    h5se.merge.fname, replace = TRUE){
    message("Loading h5se data...")
    # check for exsitence of h5se objects
    if(!file.exists(h5se1.fpath)){stop("Path ", h5se1.fpath, " isn't valid.")}
    if(!file.exists(h5se2.fpath)){stop("Path ", h5se2.fpath, " isn't valid.")}
    # load the h5se objects
    h5se1 <- HDF5Array::loadHDF5SummarizedExperiment(h5se1.fpath)
    h5se2 <- HDF5Array::loadHDF5SummarizedExperiment(h5se2.fpath)
    message("Merging pdata...");pd1 <- as.data.frame(pData(h5se1)); 
    pd2 <- as.data.frame(pData(h5se2));cn1 <- colnames(pData(h5se1)); 
    cn2 <- colnames(pData(h5se2));cn.out1 <- cn2[!cn2 %in% cn1]
    cn.out2 <- cn1[!cn1 %in% cn2];cnv <- unique(c(cn1, cn2))
    message("Found ",length(cn.out1), " colnames in h5se1 absent in h5se2...")
    message("Found ",length(cn.out1), " colnames in h5se2 absent in h5se1...")
    if(length(cn.out1) > 0){for(cni in cn.out1){
        pd1$cni <- "NA"; colnames(pd1)[ncol(pd1)] <- cni}}
    if(length(cn.out2) > 0){for(cni in cn.out2){
        pd2$cni <- "NA"; colnames(pd2)[ncol(pd2)] <- cni}}
    pd1 <- pd1[,order(match(colnames(pd1), colnames(pd2)))]
    cond <- identical(colnames(pd1), colnames(pd2))
    if(cond){pd3 <- rbind(pd2, pd1)}else{
        stop("Couldn't merge pdata; unique column names not matched.")}
    pData(h5se2) <- DataFrame(pd2); pData(h5se1) <- DataFrame(pd1)
    message("Binding h5se data on merged pdata...");h5se3 <- cbind(h5se2,h5se1)
    if(!dir.exists(new.h5se.dpath)){
        message("Making new dir ", new.h5se.dpath);dir.create(new.h5se.dpath)}
    new.h5sepath <- file.path(new.h5se.dpath, h5se.merge.fname)
    message("Saving merged h5se data as ",new.h5sepath,"...")
    HDF5Array::saveHDF5SummarizedExperiment(h5se3, new.h5sepath, replace = replace)
    return(NULL)
}
metamaden/recountmethylation.pipeline documentation built on Dec. 15, 2022, 2:25 a.m.