R/capture_melt_multiple.R

capture_melt_multiple <- structure(function # Capture and melt into multiple columns
### Match a regex to column names of a wide data frame (many
### columns/few rows), then melt/reshape the matching columns into
### multiple result columns in a taller/longer data table (fewer
### columns/more rows). Input should be a data frame with four or more
### regularly named columns of possibly different types to reshape,
### and output is a data table with at least two columns of reshaped
### data. For melting into a single result column, see
### capture_melt_single.
(...,
### First argument must be a data frame to melt/reshape; column names
### of this data frame will be used as the subjects for regex
### matching. Other arguments (regex/conversion/engine) are passed to
### capture_first_vec along with nomatch.error=FALSE. The regex must
### define a group named "column" -- each unique value captured in
### this group becomes a column name for the reshaped data in the
### output. There must also be at least one other group, and the
### output will contain a column for each other group -- see
### examples.
  fill=FALSE,
### If TRUE, fill missing input reshape columns with runs of rows with
### missing values in the output reshape columns. Otherwise stop with
### an error (default).
  na.rm=FALSE,
### Remove missing values from melted data? (passed to
### data.table::melt.data.table)
  verbose=getOption("datatable.verbose")
### Print verbose output messages? (passed to
### data.table::melt.data.table)
){
  L <- melt_list(measure_multiple, list(...), fill=fill)
  if("measure" %in% ls(asNamespace("data.table"))){
    melt(
      L[["data"]],
      measure.vars=L[["measure.vars"]],
      na.rm=na.rm,
      value.factor=FALSE,
      verbose=verbose)
  }else{#TO DELETE AFTER NEW data.table on CRAN.
    variable_table <- attr(L[["measure.vars"]],"variable_table")
    is.match <- seq_along(L[["data"]]) %in% unlist(L[["measure.vars"]])
    id.vars <- names(L[["data"]])[!is.match]
    out.names <- c(id.vars, names(variable_table), names(L[["measure.vars"]]))
    variable.name <- paste(out.names, collapse="")
    names.dt <- data.table(variable_table)
    set(names.dt, j=variable.name, value=paste(1:nrow(names.dt)))
    measure.vars <- list()
    missing.vec.list <- list()
    for(measure.name in names(L[["measure.vars"]])){
      measure.cols <- L[["measure.vars"]][[measure.name]]
      some.int <- na.omit(measure.cols)[1]
      if(any(is.na(measure.cols))){
        miss.dt <- data.table()
        missing.vec.list[[measure.name]] <- which(is.na(measure.cols))
      }
      measure.cols[is.na(measure.cols)] <- some.int
      measure.vars[[measure.name]] <- measure.cols
    }
    melted <- melt(
      data.table(L[["data"]]),
      measure.vars=measure.vars,
      variable.name=variable.name,
      na.rm=FALSE,
      variable.factor=FALSE,#character for join.
      value.factor=FALSE,
      verbose=verbose)
    for(rep.name in names(missing.vec.list)){
      missing.vec <- missing.vec.list[[rep.name]]
      is.missing <- melted[[variable.name]] %in% missing.vec
      set(melted, i=which(is.missing), j=rep.name, value=NA)
    }
    if(na.rm)melted <- melted[
      !apply(is.na(melted[, names(L[["measure.vars"]]), with=FALSE]), 1, any)
    ]
    names.dt[melted, out.names, with=FALSE, on=variable.name]
  }
### Data table of reshaped/melted/tall/long data, with a new column
### for each unique value of the capture group named "column", and a
### new column for each other capture group.
}, ex=function(){

  data.table::setDTthreads(1)

  ## Example 1: melt iris columns to compare Sepal and Petal dims, as
  ## in cdata package, https://winvector.github.io/cdata/
  (iris.part.cols <- nc::capture_melt_multiple(
    iris,
    column=".*?",
    "[.]",
    dim=".*"))
  iris.part.cols[Sepal<Petal] #Sepals are never smaller than Petals.
  if(require("ggplot2")){
    ggplot()+
      theme_bw()+
      theme(panel.spacing=grid::unit(0, "lines"))+
      facet_grid(dim ~ Species)+
      coord_equal()+
      geom_abline(slope=1, intercept=0, color="grey")+
      geom_point(aes(
        Petal, Sepal),
        shape=1,
        data=iris.part.cols)
  }

  ## Example 2. melt iris to Length and Width columns.
  (iris.dim.cols <- nc::capture_melt_multiple(
    iris,
    part=".*?",
    "[.]",
    column=".*"))
  iris.dim.cols[Length<Width] #Length is never less than Width.

  ## Example 3. Lots of column types, from example(melt.data.table).
  set.seed(1)
  DT <- data.table::data.table(
    i_1 = c(1:5, NA),
    i_2 = c(NA,6:10),
    f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
    f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
    c_1 = sample(c(letters[1:3], NA), 6, TRUE),
    l_2 = list(NULL, NA, c(NA,NA), logical(), 1:2, TRUE),
    d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
    d_2 = as.Date(6:1, origin="2012-01-01"))
  ## nc syntax melts to three output columns of different types using
  ## a single regex (na.rm=FALSE by default in order to avoid losing
  ## information).
  nc::capture_melt_multiple(
    DT,
    column="[dfi]",
    "_",
    number="[12]", as.integer)

  ## fill=TRUE means to output NA in positions that correspond to
  ## missing input columns (in this case, there is no l_1 nor c_2).
  nc::capture_melt_multiple(
    DT,
    column=".*",
    "_",
    number="[12]", as.integer,
    fill=TRUE)

  ## Example 4, three children, one family per row, from data.table
  ## vignette.
  family.dt <- data.table::fread(text="
family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
1         30 1998-11-26 2000-01-29         NA             1             2            NA
2         27 1996-06-22         NA         NA             2            NA            NA
3         26 2002-07-11 2004-04-05 2007-09-02             2             2             1
4         32 2004-10-10 2009-08-27 2012-07-21             1             1             1
5         29 2000-12-05 2005-02-28         NA             2             1            NA")
  ## nc::field can be used to define group name and pattern at the
  ## same time, to avoid repetitive code.
  (children.nc <- nc::capture_melt_multiple(
    family.dt,
    column=".+",
    "_",
    nc::field("child", "", "[1-3]", as.integer), 
    na.rm=TRUE))

  ## Example 5: wide data CSV with 100 possible peaks per row, each
  ## peak has three attributes (Allele, Height, Size) from
  ## https://lftdi.camden.rutgers.edu/repository/PROVEDIt_1-5-Person%20CSVs%20Filtered.zip
  PROVEDIt.csv <- system.file(
    "extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
    package="nc", mustWork=TRUE)
  PROVEDIt.wide <- data.table::fread(PROVEDIt.csv)
  names(PROVEDIt.wide)
  PROVEDIt.tall <- nc::capture_melt_multiple(
    PROVEDIt.wide,
    column=".*",
    " ",
    peak="[0-9]+", as.integer,
    na.rm=TRUE)
  head(PROVEDIt.tall)

  ## plot number of peaks per row.
  peaks.per.sample.marker <- PROVEDIt.tall[, .(
    peaks=.N
  ), by=.(`Sample File`, Marker)][order(peaks)]
  if(require(ggplot2)){
    ggplot()+
      geom_histogram(aes(
        peaks),
        data=peaks.per.sample.marker,
        binwidth=1)
  }

  ## which row has the most peaks?
  (most <- PROVEDIt.tall[which.max(peak), .(`Sample File`, Marker, Dye)])
  PROVEDIt.tall[most, on=names(most)]
  PROVEDIt.wide[most, on=names(most)]

})

Try the nc package in your browser

Any scripts or data that you put into this service are public.

nc documentation built on Sept. 1, 2023, 1:07 a.m.