# R/intervalaverage_functions.R

#' @useDynLib intervalaverage, .registration = TRUE
#' @importFrom Rcpp sourceCpp
NULL


#given a character vector x of candidate names and a vector reserved_cols of names already in use,
#prepend "i." to each element of x until it no longer collides with any reserved name
create_unused_name <- function(x,reserved_cols){
  for(i in seq_along(x)){
    while(x[i] %in% reserved_cols){
      x[i] <- paste0("i.",x[i])
    }
  }
  x
}
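
#a minimal sketch of what create_unused_name does (kept as comments since this file is
#package source code; the inputs below are made up for illustration):
# create_unused_name("k", c("k","i.k","x"))       #returns "i.i.k"
# create_unused_name(c("in1","in2"), c("in1"))    #returns c("i.in1","in2")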



#' grid expand an arbitrary number of data.tables
#'
#' similar to data.table::CJ and base::expand.grid, except it operates on rows of data.tables.
#'
#' CJ.dt computes a successive cartesian join over the rows of each table,
#' ignoring whatever the tables are keyed on.
#'
#' @param ... data.tables
#' @param groups a character vector corresponding to
#' column names of grouping vars in all of the data.tables
#' @examples
#' CJ.dt(data.table(c(1,2,2),c(1,1,1)),data.table(c("a","b"),c("c","d")))
#' #If you want to expand x to unique values of a non-unique column in y
#' x <- data.table(c(1,2,3),c("a","b","b"))
#' y <- data.table(id=c(1,2,2,1,3),value=c(2,4,1,7,3))
#' z <- CJ.dt(x, y[,list(id=unique(id))])
#' #if you want to merge this back to y
#' y[z,on="id",allow.cartesian=TRUE] #or z[y,on="id",allow.cartesian=TRUE]
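#' #a sketch of within-group expansion via the groups argument
#' #(the tables a and b and their columns here are made up for illustration)
#' a <- data.table(g=c(1,1,2),v1=c("a","b","c"))
#' b <- data.table(g=c(1,2,2),v2=c(10,20,30))
#' CJ.dt(a,b,groups="g")
#' #rows are crossed only within matching values of g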
#' @import data.table
#' @export
CJ.dt <- function(...,groups=NULL) {
  l = list(...)
  stopifnot( all(sapply(l,is.data.table)) )
  l <- lapply(l,copy) #take a copy of each table
    #copying is probably not going to cause memory or timing issues here:
    #if you don't have the space to copy the individual tables, there's
    #no way you'll be able to grid expand them together.
    #with that said, the rest of this function is written to try to return each table to its
    #original state, so it could be modified to remove the copying
    #if that is slowing a particular task down.
    #edit out the copy at your own risk and know that if the function returns an error,
    #all your tables will likely be modified because they have an extra column
  EVAL <- function(...)eval(parse(text=paste0(...)))
  if(any(sapply(l,nrow)==0)){stop("one or more data.tables have no rows")}


  #while kvar is in names of any of the data.tables, keep prepending i. to it until it's not
  kvar <- create_unused_name("k",unlist(lapply(l,names)))

  invars <- create_unused_name(paste0("in",1:length(l)),
                             unlist(lapply(l,names)))

  for(i in 1:length(l)){
    l[[i]][,(kvar):=1]
    l[[i]][,(invars[i]):=TRUE]
    data.table::setkeyv(l[[i]],c(kvar,groups))
  }

  mymerge = function(x,y) x[y, allow.cartesian = TRUE]
  out <- Reduce(mymerge,l)
  out[,(kvar):=NULL]

  for(i in 1:length(l)){
    l[[i]][,(kvar):=NULL]
    l[[i]][,(invars[i]):=NULL]
  }

  out <- EVAL("out[",paste0(paste0("!is.na(",invars,")"),collapse="&"),"]")
  out[,(invars):=NULL]
  out[]
}




#' time-weighted average of values measured over intervals
#'
#' \code{intervalaverage} takes values recorded over
#' non-overlapping intervals and averages them to defined intervals, possibly within
#' groups (individuals/monitors/locations/etc). This function can be used to take averages over long
#' intervals of values measured over short intervals and/or to take short "averages" of values measured
#' over longer intervals (ie, downsample without smoothing). Measurement intervals and averaging intervals
#' need not align. When an averaging interval contains more than one measurement interval,
#' a weighted average is calculated (ie, each measurement is weighted by the duration of its interval's
#' overlap with the averaging interval).
#'
#' All intervals are treated as closed (ie inclusive of the start and end values in interval_vars)
#'
#'
#' x and y are not copied but rather passed by reference to function internals;
#' however, the order of these data.tables is restored on function completion or error.
#'
#' When required_percentage is less than 100, xminstart and xmaxend may be useful for
#' determining whether an average meets specified coverage requirements in terms of not
#' just the percent of missingness but also whether values are represented throughout the range of the y interval.
#'
#' maxgap variables may be useful in determining whether an average from incomplete data is
#' representative of the true, unknowable average (because of missingness) beyond what's already
#' accounted for based on counts/percentages of non-missing values. Large runs of missing
#' values over time (or whatever the units of the intervals are) mean missingness is not equally
#'  distributed through the period.  However, note that this gap information is simply returned and
#'  does not affect whether the average value is returned as NA--you'll need to apply your own
#'  criteria using these maxgap variables.
#'
#' @param x a data.table containing values measured over intervals. see
#' `interval_vars` parameter
#' for how to specify interval columns and `value_vars` for how to
#'  specify value columns.
#' intervals in `x` must be completely non-overlapping within
#' groups defined by `group_vars`. if `group_vars` is specified (non-`NULL`), `x` must
#' also contain columns specified in `group_vars`.
#' @param y a data.table object containing intervals over which averages of `x` values should be computed.
#' averaging intervals in `y`, unlike measurement intervals in `x`, may be overlapping within groups.
#' if `group_vars` is specified (non-`NULL`), `y` must contain those `group_vars` column names
#'  (and this would allow different averaging periods for each group)
#' @param interval_vars a length-2 character vector of column names in both `x` and `y`.
#' These column names specify columns in x and y that define
#' closed (inclusive) starting and ending intervals. The column name
#' specifying the lower-bound column must be specified first.
#' these columns in x and y must all be of the same class and either be integer or IDate.
#' The interval_vars character vector cannot be named. This is reserved for future use allowing
#' different interval_vars column names in x and y.
#' @param value_vars a character vector of column names in `x`. This specifies
#' the columns to be averaged.
#' @param group_vars A character vector of column names in both x and y.
#' The interaction of
#' these variables defines groups in which averages of `x` values will be taken
#' (e.g., subjects/monitors/locations within which to take averages).
#' By default this is `NULL`, in which case averages are taken over the entire `x`
#' dataset for each `y` period.
#' The group_vars character vector cannot be named. This is reserved for future use allowing
#' different group_vars column names in x and y.
#' @param required_percentage This percentage of the duration of each (possibly group-specific)
#' `y` interval must be observed
#' and nonmissing for a specific `value_var` in `x` in order for the return table to
#' contain a nonmissing average of the `value_var` for that `y` interval.  If the percentage
#' of nonmissing `value_var` observations is less than `required_percentage`, an NA will be returned
#' for that average.
#'  The default is 100, meaning that if \emph{any} portion of a `y` interval is either not recorded or
#'  missing in `x`, then the corresponding return row will contain an NA for the average of that
#'  `value_var`.
#' @param skip_overlap_check by default, FALSE. setting this to TRUE will skip
#'  internal checks to make sure x intervals are non-overlapping within
#'   groups defined by group_vars.
#'    intervals in x must be non-overlapping,
#'    but you may want to skip this check if you've already verified this, because
#'    the check is computationally intensive for large datasets.
#' @param verbose include printed timing information? by default, FALSE
#' @return returns a data.table object.
#' Rows of the return data.table correspond to intervals from y. i.e, the number
#' of rows of the return will be the number of rows of y.
#' Columns of the returned data.table are as follows: \cr
#' - grouping variables as specified in `group_vars` \cr
#' - interval columns corresponding to intervals in y. These columns are named the
#'   same as they were in x and y, as specified in `interval_vars`
#' - value variable columns from x, averaged to periods in y.
#'    named the same as they were in x \cr
#' - \code{yduration}: the length of the interval (ie as a count) specified in y \cr
#' - \code{xduration}: the total length of the intervals (ie as a count)
#'   from x that fall into this interval from y. this will be equal to
#'   yduration if x is comprehensive for (ie, fully covers)  this interval from y. \cr
#' - \code{nobs_<value_vars>}: for each \code{value_var} specified, this is the count of
#'  times (or whatever the units of the intervals are)
#'  where there are non-missing values in x that fall into the specified interval from y. this will be
#'   equal to xduration if the value_var contains no NA values over the y
#'   interval. If there are NAs in value variables, then \code{nobs_<value_vars>}
#'    will be different from \code{xduration} and won't necessarily be all the same
#'     for each value_var.
#' - \code{xminstart}: For each returned interval (ie the intervals from y), the minimum of the
#' start values of the intervals represented in x.  If the start of the earliest x interval is less than the start
#' of the y interval, the start of the y interval is returned instead. Note, this is the minimum start
#'  time in x matching the y interval, whether or not any value_vars were missing for that start time.
#'  If you need non-missing minimum start times, you could remove NA intervals from
#'  x prior to calling intervalaverage (this would need to be done separately for each value_var).
#' - \code{xmaxend}:  similar to xminstart but the maximum of the end intervals represented in x.
#'  Again, this does not pay attention to whether the interval in x had non-missing value_vars.
#' - \code{maxgap_<value_vars>}: For each \code{value_var} specified, this is the length of the longest
#'  consecutive run of times (or whatever the units of the intervals are) for which that value variable is missing
#'  (either structurally missing, ie no intervals at all in x, or missing with an NA, or any combination thereof).
#' @examples
#'x <- data.table(start=seq(1L,by=7L,length=6),
#'                end=seq(7L,by=7L,length=6),
#'                pm25=c(10,12,8,14,22,18))
#'
#'y <- data.table(start=seq(3L,by=7L,length=6),
#'                end=seq(9L,by=7L,length=6))
#'
#'z <- intervalaverage(x,y,interval_vars=c("start","end"),
#'                     value_vars=c("pm25"))
#'
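#'
#' #a sketch of relaxing the coverage requirement so that partially-covered
#' #y intervals still return an average (50 is an arbitrary choice for illustration):
#'z50 <- intervalaverage(x,y,interval_vars=c("start","end"),
#'                       value_vars=c("pm25"),required_percentage=50)
#'
#' #a sketch of averaging within groups (the id column here is made up for illustration):
#'xg <- data.table(id=rep(c("a","b"),each=nrow(x)),rbind(x,x))
#'yg <- data.table(id=rep(c("a","b"),each=nrow(y)),rbind(y,y))
#'zg <- intervalaverage(xg,yg,interval_vars=c("start","end"),
#'                      value_vars=c("pm25"),group_vars="id")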
#' #also see vignette for more extensive examples
#' @export
intervalaverage <- function(x,
                            y,
                            interval_vars,
                            value_vars,
                            group_vars=NULL,
                            required_percentage=100,
                            skip_overlap_check=FALSE,
                            verbose=FALSE
){

  #due to NSE: to avoid notes in R CMD BUILD
  intervalaverage__xstart_copy <- intervalaverage__xend_copy <-
    intervalaverage__ystart_copy <- intervalaverage__yend_copy <-
    yduration <- xminstart <- xmaxend <- V1 <- NULL



  stopifnot(data.table::is.data.table(x))
  stopifnot(data.table::is.data.table(y))


  EVAL <- function(...)eval(parse(text=paste0(...)))


  #save states so that added columns are dropped.
  #x also may be reordered due to the overlap check. this will restore the order
  statex <- savestate(x)
  on.exit(setstate(x,statex))
  statey <- savestate(y)
  on.exit(setstate(y,statey),add=TRUE)


  stopifnot(is.null(names(interval_vars)))
  stopifnot(is.null(names(group_vars)))

  if( any(c("yduration","xduration","xminstart","xmaxend")%in% c(interval_vars,value_vars,group_vars))){
    stop(paste0("column(s) named 'yduration', 'xduration', 'xminstart', or 'xmaxend' has been detected in interval_vars,",
                " value_vars, or group_vars.         These column names ('yduration', 'xduration', 'xminstart', or 'xmaxend')",
                " are reserved for the output. please rename this (or these) column(s) in x/y/interval_vars/value_vars/group_vars."))
  }

  if(!is.null(group_vars)){
    if(!all(group_vars %in% names(x))){
      stop("every value in group_vars must be a column name in x")
    }
    if(!all(group_vars %in% names(y))){
      stop("every value in group_vars must be a column name in y")
    }
  }


   if(!all(interval_vars %in% names(x))){
     stop("every value in interval_vars must be a column name in x")
   }
   if(!all(interval_vars %in% names(y))){
     stop("every value in interval_vars must be a column name in y")
   }

  if(!all(value_vars %in% names(x))){
    stop("every value in value_vars must be a column name in x")
  }


  if(x[,any(sapply(.SD,function(m){any(is.na(m))})) ,.SDcols=interval_vars]){
    stop("columns corresponding to interval_vars cannot be missing in x")
  }

  if(y[,any(sapply(.SD,function(m){any(is.na(m))})) ,.SDcols=interval_vars]){
    stop("columns corresponding to interval_vars cannot be missing in y")
  }

  if(x[,!all(sapply(.SD,is.integer)|sapply(.SD,function(x){any(class(x)%in%c("IDate"))})),.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of class integer or IDate")
  }
  if(x[,class(.SD[[1]])[1]!=class(.SD[[2]])[1],.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of the same class")
  }

  if(y[,!all(sapply(.SD,is.integer)|sapply(.SD,function(x){any(class(x)%in%c("IDate"))})),.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in y of class integer or IDate")
  }
  if(y[,class(.SD[[1]])[1]!=class(.SD[[2]])[1],.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in y of the same class")
  }

  #stop if there are variables specified in both groups and interval_vars
  if(!length(intersect(interval_vars,group_vars))==0){
    stop("interval_vars and group_vars cannot refer to the same column(s)")
  }

  if(!length(intersect(value_vars,group_vars))==0){
    stop("value_vars and group_vars cannot refer to the same column(s)")
  }

  if(!length(intersect(value_vars,interval_vars))==0){
    stop("value_vars and interval_vars cannot refer to the same column(s)")
  }


  #stop if any interval end is before its interval start
  if(x[, sum(.SD[[2]]-.SD[[1]] <0)!=0,.SDcols=interval_vars]){
    stop("there exist values in x[[interval_vars[2] ]] that are
         less than corresponding values in x[[interval_vars[1] ]].
         interval_vars must specify columns corresponding to increasing intervals")
  }

  #check for exact duplicate intervals in x
  if(sum(duplicated(x[,c(group_vars,interval_vars),with=FALSE]))!=0){
    stop("sum(duplicated(x[,c(group_vars,interval_vars),with=FALSE]))==0 is not TRUE.
         there are replicate/duplicate intervals within groups.
         If you wish to average these together, do so before calling intervalaverage.")
  }


  ydups <- duplicated(y[,c(group_vars,interval_vars),with=FALSE])
  if(sum(ydups)!=0){
    warning("sum(duplicated(y[,c(group_vars,interval_vars),with=FALSE]))!=0 is not TRUE.
         there are replicate/duplicate intervals within groups of y.
         removing these duplicated rows automatically")
  }else{
    ydups <- FALSE
  }

  if(!skip_overlap_check){

    #stop if there are overlapping periods within groups:
    data.table::setkeyv(x,c(group_vars,interval_vars))
    any_overlaps <- any(x[, list(Cisoverlapping(.SD[[1]],.SD[[2]])),by=group_vars,.SDcols=interval_vars][, V1])

    #stop if there are overlapping periods within groups:
    if(any_overlaps){
      stop("overlapping intervals detected")
    }

    if(verbose){print(paste(Sys.time(),"passed errorcheck: x is non-overlapping."))}
  }else{
    message("skipping errorcheck. if intervals in x are  overlapping, incorrect results may be returned without error.")
  }


  ### merge x and y ####

  xx <- proc.time()


  ##coerce dates to integer. doing it once here is faster than a million coercions in the by-loop
  interval_dates <- "IDate" %in% class(x[[interval_vars[1]]])
  if(interval_dates){
    stopifnot(!any(c("intervalaverage__start__original_class_copy",
                     "intervalaverage__end__original_class_copy")%in% names(x)))
    stopifnot(!any(c("intervalaverage__start__original_class_copy",
                     "intervalaverage__end__original_class_copy")%in% names(y)))

    setnames(x,interval_vars,
             c("intervalaverage__start__original_class_copy",
               "intervalaverage__end__original_class_copy")
    )
    setnames(y,interval_vars,
             c("intervalaverage__start__original_class_copy",
               "intervalaverage__end__original_class_copy")
    )

    x[, (interval_vars):=lapply(.SD,as.integer),
      .SDcols=c("intervalaverage__start__original_class_copy",
                "intervalaverage__end__original_class_copy")]
    y[, (interval_vars):=lapply(.SD,as.integer),
      .SDcols=c("intervalaverage__start__original_class_copy",
                "intervalaverage__end__original_class_copy")]

    ##set the columns back as they were
    on.exit({
      x[, (interval_vars):=NULL]
      y[, (interval_vars):=NULL]

      setnames(x, c("intervalaverage__start__original_class_copy",
                    "intervalaverage__end__original_class_copy"),
               interval_vars
      )
      setnames(y,c("intervalaverage__start__original_class_copy",
                   "intervalaverage__end__original_class_copy"),
               interval_vars
      )


    }, add=TRUE, after=FALSE)

  }


  #copy the interval columns so they get carried over to the result.
   #if you didn't copy them, the resulting joined columns would contain just the values from y,
    #but we need the interval values from both x and y to calculate durations

  stopifnot(!any(c("intervalaverage__xstart_copy",
              "intervalaverage__xend_copy")%in% c(names(y),names(x))))

  stopifnot(!any(c("intervalaverage__ystart_copy",
                   "intervalaverage__yend_copy")%in% c(names(x),names(y))))

  x[,`:=`(intervalaverage__xstart_copy=.SD[[1]],
          intervalaverage__xend_copy=.SD[[2]]
          ),.SDcols=interval_vars]

  y[,`:=`(intervalaverage__ystart_copy=.SD[[1]],
          intervalaverage__yend_copy=.SD[[2]]
          ),.SDcols=interval_vars]

  #these copies will be deleted on function exit thanks to the on.exit state restore calls

  value_names <- unlist(lapply(value_vars, function(x) paste0(c("","nobs_","maxgap_"),x)))
  nobs_vars_names <- paste0("nobs_",value_vars)
  q <- x[y[!ydups],
    {

      Cintervalaverage(.SD,
                            intervalaverage__xstart_copy,
                            intervalaverage__xend_copy,
                            intervalaverage__ystart_copy[1],
                            intervalaverage__yend_copy[1],
                            value_names
      )



    },
    by=.EACHI,
    on=c(group_vars,
         paste0(interval_vars[2],">=",interval_vars[1]),
         paste0(interval_vars[1],"<=",interval_vars[2])),
    .SDcols=value_vars]


  #the nonequi join's on= argument seems confusing, but remember that the left side of each
  #inequality corresponds to columns in x and the right side corresponds to columns in y.
    #so this just means take rows meeting both of these conditions:
      #-the x end is greater than or equal to the y start AND
      #-the x start is less than or equal to the y end
    #which translates to a partial-overlap join,
    #in addition to matching on group_vars

  ###confusingly, the returned non-equi join columns have the names from x but the values from y.
  #because the inequality statements compare interval_vars[2] with interval_vars[1] and vice versa,
  #the names get switched. switch them back:

  setnames(q, interval_vars, rev(interval_vars))

  #count length of each interval,
  #this will be used to count percentage of observed time x has in intervals of y
  q[,yduration:=as.numeric( .SD[[2]]-.SD[[1]] + 1),.SDcols=c(interval_vars)]
  #this should probably be an integer
  #but was historically a numeric so leaving this as is for now


  #fix column type of dates since .Internal(pmin) converts to numeric
  if(interval_dates){
    q[, xminstart:=as.IDate(xminstart,origin="1970-01-01")]
    q[, xmaxend:=as.IDate(xmaxend,origin="1970-01-01")]
    q[, (interval_vars):=lapply(.SD,as.IDate),.SDcols=interval_vars]
  }



  for(i in 1:length(value_vars)){
    #remove averages with too few observations in period
    #e.g. q[100*nobs_value/yduration < required_percentage, nobs_value:=NA]
    EVAL(paste0("q[100*",nobs_vars_names[i],"/yduration < required_percentage, ", value_vars[i],":=as.numeric(NA)]"))

    #turn NaNs into NAs
    #NaNs occur when rows of y were not matched at all to x
    set(q, i=which(is.nan(q[[value_vars[i]]])),j=value_vars[i],value=as.numeric(NA))
  }

  data.table::setcolorder(q, c(group_vars,interval_vars,value_vars,"yduration","xduration",nobs_vars_names,
                   "xminstart","xmaxend"))

  setkey(q,NULL) #temporary way to get around this bug: https://github.com/Rdatatable/data.table/issues/4603
  setkeyv(q, c(group_vars,interval_vars))


  if(verbose){
    print("everything else:")
    print(proc.time()-xx)
  }
  q[]

}



#slower algorithm. used for testing since this is the simpler approach and less likely to have errors.
#Not recommended for large datasets since this expands the data into a row for every increment.
#not exported
interval_weighted_avg_slow_f <- function(x,
                                         y,
                                         interval_vars,
                                         value_vars,
                                         group_vars=NULL,
                                         required_percentage=100,
                                         skip_overlap_check=FALSE,
                                         verbose=FALSE){

  xminstart <- xmaxend <- V1 <- maxgap_____ <- NULL

  if(x[,!all(sapply(.SD,is.integer)|sapply(.SD,function(x){any(class(x)%in%c("IDate"))})),.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of class integer or IDate")
  }
  if(x[,class(.SD[[1]])[1]!=class(.SD[[2]])[1],.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of the same class")
  }

  if(y[,!all(sapply(.SD,is.integer)|sapply(.SD,function(x){any(class(x)%in%c("IDate"))})),.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in y of class integer or IDate")
  }
  if(y[,class(.SD[[1]])[1]!=class(.SD[[2]])[1],.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in y of the same class")
  }

  EVAL <- function(...)eval(parse(text=paste0(...)))


  if(!is.null(group_vars)){
    if(!all(group_vars %in% names(x))){
      stop("every value in group_vars must be a columname in x")
    }
    if(!all(group_vars %in% names(y))){
      stop("every value in group_vars must be a columname in y")
    }
  }

  #check for exact duplicate intervals in x
  if(sum(duplicated(x[,c(group_vars,interval_vars),with=FALSE]))!=0){
    stop("sum(duplicated(x[,c(group_vars,interval_vars),with=FALSE]))==0 is not TRUE.
         there are replicate/duplicate intervals within groups.
         If you wish to average these together, do so first.")
  }


  ydups <- duplicated(y[,c(group_vars,interval_vars),with=FALSE])
  if(sum(ydups)!=0){
    warning("sum(duplicated(y[,c(group_vars,interval_vars),with=FALSE]))!=0 is not TRUE.
         there are replicate/duplicate intervals within groups of y.
         removing these duplicated rows automatically")
    y <- y[!ydups]
  }




  #set keys for testing if there are overlaps
  #groups do not need to be in x but they do need to be in y
  if(!skip_overlap_check){

    data.table::setkeyv(x,c(group_vars,interval_vars))

    any_overlaps <- any(x[, list(Cisoverlapping(.SD[[1]],.SD[[2]])),by=group_vars,.SDcols=interval_vars][, V1])

    #stop if there are overlapping periods within groups:
    if(any_overlaps){
      stop("overlapping intervals detected")
    }

    if(verbose){print(paste(Sys.time(),"passed errorcheck: x is non-overlapping."))}
  }else{
    message("skipping errorcheck. if intervals in x are  overlapping, incorrect results may be returned without error.")
  }



  #create a length-1 character vector that will be a column name in x,y, and z that is not in use already
  t <- create_unused_name("time",c(names(x),names(y)))

  #create a length-1 character vector that will be a column name in x,y, and z that is not in use already.
  #this column flags increments where a measurement occurred (even if the measured value is missing)
  measurement <- create_unused_name("meas",c(names(x),names(y)))



  if("yduration"%in%names(x)|"yduration"%in%names(y)){
    stop("names(x) or names(y) contains the column name 'yduration'. A column named 'yduration' cannot be present in x or y
         because it will be a special column in the return value. please rename this column.")
  }
  ydur <- "yduration"

  if("xduration"%in%names(x)|"xduration"%in%names(y)){
    stop("names(x) or names(y) contains the column name 'xduration'. A column named 'xduration' cannot be present in x or y
         because it will be a special column in the return value. please rename this column.")
  }
  xdur <- "xduration"


  if("xminstart"%in%names(x)|"xminstart"%in%names(y)){
    stop("names(x) or names(y) contains the column name 'xduration'. A column named 'xminstart' cannot be present in x or y
         because it will be a special column in the return value. please rename this column.")
  }

  if("xmaxend"%in%names(x)|"xmaxend"%in%names(y)){
    stop("names(x) or names(y) contains the column name 'xduration'. A column named 'xmaxend' cannot be present in x or y
         because it will be a special column in the return value. please rename this column.")
  }


  #nobs vars will be the count of non-missing obs for each time-period in y
  #(ie, summed from temp nobs)
  #prepend "i." to any nobs name that is already in use as a column in x or y
  nobs_vars <- create_unused_name(paste("nobs",value_vars,sep="_"),
                                  c(names(x),names(y)))


  #expand x
  #in x, value_vars represent an average over an interval
  #in expand_x, intervals are expanded such that there is now a separate row for each smallest observable
  #increment in each interval (represented as variable t).
  #values are repeated over rows that correspond to those increments,
  #and interval_vars columns specifying the original increments are retained

  x_expanded <- EVAL("x[,list(",t,"=",interval_vars[1],":",interval_vars[2],"),
                     by=c(group_vars,interval_vars,value_vars)]")

  x_expanded[,(interval_vars):=NULL]
  x_expanded[, (measurement):=1L]

  y_expanded <-  EVAL("y[,list(",t,"=",interval_vars[1],":",interval_vars[2],"),
                      by=c(interval_vars,group_vars)]")



  data.table::setkeyv(x_expanded,c(t, group_vars))
  data.table::setkeyv(y_expanded,c(t, group_vars))

  z <- x_expanded[y_expanded]

  avg_call <- paste0(value_vars,"=mean(",value_vars,",na.rm=TRUE)",collapse=",")
  nobs_call <- paste0(nobs_vars,"=sum(!is.na(",value_vars,"))",collapse=",")
  ydur_call <- paste0(ydur,"=length(unique(",t,"))")
  xdur_call <- paste0(xdur,"=sum(",measurement,",na.rm=TRUE)")

  #get the min start and max end dates for intervals that were actually provided as inputs in x
  #as of 1/12/2020 I think gforce does not work with :=,
    #so this is written to create a separate table that is then merged
  minmaxtable <-
    EVAL(
      paste0(
       "z[!is.na(",measurement,"),list(xminstart=min(",t,"),xmaxend=max(",t,")),by=c(interval_vars,group_vars)]"
      )
    )


  if(any(class(x[[interval_vars[1]]])%in%c("IDate"))){
    minmaxtable[, xminstart:=as.IDate(xminstart,origin="1970-01-01")]
    minmaxtable[, xmaxend:=as.IDate(xmaxend,origin="1970-01-01")]
  }



  out <- EVAL(
    paste0(
    "z[,","list(",avg_call,",",ydur_call,",",xdur_call,",",nobs_call,")" ,
              ",by=c(interval_vars,group_vars)]"
    )
    )

  stopifnot(EVAL("out[,all(",xdur,"<=",ydur,")]"))

  ##merge in minmaxtable
  data.table::setkeyv(minmaxtable,c(group_vars,interval_vars))
  data.table::setkeyv(out,c(group_vars,interval_vars))

  out <- minmaxtable[out]

  ##add columns for maxgaps
  for(i in 1:length(value_vars)){

    #first calculate run-length ids of is.na(<value_var>) and subset to the ids
    #corresponding to missing runs
    #temp is a vector of ids (ie integers)
    maxgapdt <- z[,list(maxgap_____={
      temp <- rleid(is.na(.SD[[1]]))[is.na(.SD[[1]])]
      max(tabulate(temp))
      }), .SDcols=value_vars[i],by=c(group_vars,interval_vars)]
    #tabulate counts the occurrences of each run id (ie the number of adjacent missings) and we take the max
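    #a worked illustration of this run-length trick (the values here are made up):
    # if is.na(values) is c(FALSE,TRUE,TRUE,FALSE,TRUE), rleid() gives c(1,2,2,3,4);
    # subsetting to the NA positions leaves c(2,2,4); tabulate() then counts each run id,
    # giving c(0,2,0,1), and the max, 2, is the longest consecutive run of missing values.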


    data.table::setkeyv(maxgapdt,c(group_vars,interval_vars))
    out[maxgapdt,maxgap_____:=maxgap_____]
    data.table::setnames(out, "maxgap_____",paste0("maxgap_",value_vars[i]))

  }


  for(i in 1:length(value_vars)){
    EVAL("out[100*",nobs_vars[i],"/",ydur,"< required_percentage,",value_vars[i],":=as.numeric(NA)]")
  }

  data.table::setcolorder(out, c(group_vars,interval_vars,value_vars,ydur,xdur,nobs_vars,
                     "xminstart","xmaxend"))

  data.table::setkeyv(out,c(group_vars,interval_vars))
  out[]
}



#' isolate sections of overlapping intervals
#'
#' Given a set of intervals in a table, isolate sections of intervals that are overlapping
#' with other intervals (optionally, within groups). Returns a data.table containing
#' intervals which are mutually non-overlapping or exactly overlapping with other intervals
#' (ie there are no partially overlapping intervals), optionally within groups.
#' Note that this doesn't just return the intersections; the original interval data is conserved
#' such that for each interval/row in x, the return table has one or more
#' non-overlapping intervals that together form the union of that original interval.
#'
#' All intervals are treated as closed (ie inclusive of the start and end values in the columns
#' specified by interval_vars)
#'
#' x is not copied but rather passed by reference to function internals,
#' but the order of this data.table is restored on function completion or error.
#'
#' @param x A data.table containing a set of intervals.
#' @param interval_vars A length-2 character vector denoting column names in x.
#' these columns must be of the same class and be integer or IDate. The column name
#' specifying the lower-bound column must be specified first.
#' @param group_vars NULL, or a character vector denoting column names in x.
#'  These columns serve as grouping variables such that testing for overlaps and subsequent isolation only occur
#'  within categories defined by the combination of the group variables.
#' @param interval_vars_out The desired column names of the interval columns in the return data.table.
#' By default these columns will be generated to be named \code{c("start","end")}.
#' If x contains columns with the same name as the desired output column
#' names specified in `interval_vars_out`, the function will return in error to avoid naming
#' confusion in the return table.
#' @return A data.table with columns \code{interval_vars_out} which denote the start and
#' stop period for each new interval. This return table also contains columns in x
#' (including the original interval columns).
#' @seealso \code{\link{is.overlapping}} To test if a table contains overlapping intervals within
#'  values of \code{group_vars}
#'
#' @examples
#'set.seed(23)
#'x2 <- data.table(addr_id=rep(1:4,each=3),
#'                 exposure_start=rep(c(1L,7L,14L),times=4),
#'exposure_end=rep(c(7L,14L,21L),times=4),
#'exposure_value=c(rnorm(12))
#')
#'x2z <- isolateoverlaps(x2,interval_vars=c("exposure_start","exposure_end"),group_vars=c("addr_id"))
#'x2z
#'#x2b represents x2 where exposure values in overlapping intervals have been averaged
#'x2b <- x2z[, list(exposure_value=mean(exposure_value)),by=c("addr_id","start","end")]
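#'
#'#a sketch of feeding the de-overlapped data back into intervalaverage
#'#(y2 here is a made-up averaging table covering the full period for each address):
#'y2 <- data.table(addr_id=1:4,start=1L,end=21L)
#'intervalaverage(x2b,y2,interval_vars=c("start","end"),
#'                value_vars="exposure_value",group_vars="addr_id")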
#'
#' @export
isolateoverlaps <- function(x,interval_vars,group_vars=NULL,interval_vars_out=c("start","end")){
  EVAL <- function(...)eval(parse(text=paste0(...)))

  stopifnot(is.data.table(x))

  if(x[,!all(sapply(.SD,is.integer)|sapply(.SD,function(x){any(class(x)%in%c("IDate"))})),.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of class integer or IDate")
  }
  if(x[,class(.SD[[1]])[1]!=class(.SD[[2]])[1],.SDcols=interval_vars]){
    stop("interval_vars must correspond to columns in x of the same class")
  }


  if(any(interval_vars_out %in% names(x))){ stop("Column names in x detected to have the same values as interval_vars_out.
                                            This would cause naming conflicts in the return. Choose different
                                            output names by specifying the argument interval_vars_out")}
  variable_var <- create_unused_name("variable",names(x))
  i.interval_vars <- paste0("i.", interval_vars)
    if(any(i.interval_vars%in% names(x))){
      stop("names(x) contains columns ",paste0(interval_vars,collapse=" "),
      " and at least one column named ",paste0(i.interval_vars,collapse=" "),". Columns named ",paste0(i.interval_vars,collapse=" "),
      " cannot be present in x because they are reserved for use by data.table::foverlaps")
      }

  value_var <- create_unused_name("value",names(x))
  open_var <- create_unused_name("open",names(x))
  close_var <- create_unused_name("close",names(x))
  is_end_var <- create_unused_name("is_end",names(x))

  end_next_var <- create_unused_name("end_next",names(x))
  value_next_var <- create_unused_name("value_next",names(x))

  xd <- data.table::melt(x[,c(interval_vars,group_vars),with=FALSE],id.vars=group_vars,
             measure.vars=interval_vars,
             value.name=value_var,
             variable.name=variable_var)



  ##create end variable:
  EVAL("xd[",variable_var,"==interval_vars[2],(is_end_var):=TRUE]")
  EVAL("xd[",variable_var,"==interval_vars[1],(is_end_var):=FALSE]")
  data.table::setorderv(xd,c(group_vars, value_var,is_end_var))

  EVAL("xd[,(end_next_var):=shift(",is_end_var,",type='lead'),by=group_vars]")
  EVAL("xd[,(value_next_var):=shift(",value_var,",type='lead'),by=group_vars]")

  #the last row is missing because you can't get the next row when there aren't any more rows!
   #we don't need that row where end_next is missing so exclude it.
  #as of data.table version 1.12.3, data.table::fifelse can be used to avoid coercing dates to numeric
  temp <- EVAL("xd[,.SD[!is.na(",end_next_var,"),list(
   ",interval_vars[1],"=data.table::fifelse(!",is_end_var,",",value_var,",",value_var,"+1L),
    ",interval_vars[2],"=data.table::fifelse(!",end_next_var,",",value_next_var,"-1L,",value_next_var,")
  )],by=group_vars]")
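  #a worked illustration of the breakpoint construction above (the intervals here are made up):
  # with closed intervals [1,7] and [7,14] in one group, the melted and sorted breakpoints are
  # (1,start),(7,start),(7,end),(14,end); adjacent pairs of breakpoints yield the candidate
  # pieces [1,6],[7,7],[8,14]. foverlaps below then matches each original interval to the
  # pieces it covers, so [1,7] maps to [1,6] and [7,7], and [7,14] maps to [7,7] and [8,14].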

  temp <- EVAL("temp[",interval_vars[2],">=",interval_vars[1],"]")


  data.table::setkeyv(temp,c(group_vars,interval_vars))


  out <- data.table::foverlaps(x,temp,by.x=c(group_vars,interval_vars))
  data.table::setorderv(out, c(group_vars,
                   paste0("i.",interval_vars),
                   interval_vars))
  data.table::setcolorder(out, c(group_vars,interval_vars,paste0("i.",interval_vars)))

  data.table::setnames(out, interval_vars, interval_vars_out)
  data.table::setnames(out, i.interval_vars, interval_vars)


  out[]
}