daohtools: Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

Documented in generate.smoothed.sampling.dist generate.timed.sampling.dist modify.daoh.distribution modify.timed.daoh.distribution quantile.dist quantile.dist.emp smooth.daoh

#' Generates a smoothed sampling distribution (attempting to eliminate lumps
#' from undersampling, especially of intermediate values) from which 
#'
#' @param input.dist The DAOH dataset on which to base the sampling
#'   distribution, as a vector.
#' @param values The value that can the sampling distribution can take (default
#'   min(input.dist):max(input.dist), which will fail if not integer input)
#' @return ggplot2 plot
#'
#' @export
generate.smoothed.sampling.dist = function(input.dist, 
                                           values = NULL, 
                                           smooth.func = smooth.daoh) {


  #Count outcome including zeroes
  freq.dt = data.table::data.table(table(input.dist))
  data.table::setnames(freq.dt, c('outcome','N'))
  data.table::set(x = freq.dt,
                  j = 'outcome',
                  value = as.numeric(freq.dt$outcome))

  #Append absent DAOH values if necessary
  if (length(values) == 0) {
    values = min(input.dist):max(input.dist)
  }

  #Add missing rows if there are some zeroes
  if ((sum(!values %in% freq.dt$outcome) > 0)) {
    freq.dt = rbind(freq.dt, data.table::data.table(outcome = values[!values %in% freq.dt$outcome], N=0))
  }
  setorderv(x = freq.dt,
            cols = 'outcome')
  freq.prob.dt = smooth.func(freq.dt)

  #Return input parameters for base::sample
  result = list()
  result$x = freq.prob.dt[,outcome]
  result$prob = freq.prob.dt[,probs/min(freq.prob.dt[probs>0,probs])]


  return(result)

}

#' A smoothing function particularly for DAOH.
#'
#' @param input.freq.dt Data.table with columns 'outcome' and 'N'
#' @return Data.table with same columns as input, but also 'probs' for feeding
#'   into sample()
#'
#' @export
smooth.daoh = function(input.freq.dt) {

  output.freq.prob.dt = data.table::copy(input.freq.dt)

  # #Pump up the numbers for a smoother smooth
  # output.freq.prob.dt[,probs := N*1000/min(output.freq.prob.dt[N>0,N])]
  data.table::set(x = output.freq.prob.dt,
                  j = 'probs',
                  value = output.freq.prob.dt[,N]*1000/min(output.freq.prob.dt[N>0,N]))

  #Smooth between Day 1 (i.e. skipping zero peak), and the day with maximum
  #frequency (it's usually pretty smooth after that)
  daySmoothLimits = c(1,output.freq.prob.dt[N == max(output.freq.prob.dt$N), outcome])
  indSmoothLimits = match(daySmoothLimits, output.freq.prob.dt[,outcome])

  #First smooth the whole range, then progressively smaller ones.
  nSmooths = 5
  smoothProportions = c(1,.75,.65,.5,.25)
  for (smoothInd in 1:nSmooths) {
    smoothProportion = smoothProportions[min(smoothInd, length(smoothProportions))]

    indsToSmooth = indSmoothLimits[1]:round(smoothProportion*(indSmoothLimits[2]-1))

    data.table::set(x = output.freq.prob.dt,
                    i = indsToSmooth,
                    j = 'probs',
                    value = bezierCurve(output.freq.prob.dt[indsToSmooth, outcome],
                                        output.freq.prob.dt[indsToSmooth, probs],
                                        n = length(indsToSmooth))$y)

    # output.freq.prob.dt[indsToSmooth,N := bezierCurve(output.freq.prob.dt[indsToSmooth, outcome],
    #                                         output.freq.prob.dt[indsToSmooth, N],
    #                                         n = length(indsToSmooth))$y]

  }

  # There might be some issue with low input n where daoh 1 is zero, copy that value from daoh 2 if so.
  if (output.freq.prob.dt[outcome==1, probs==0]) {
    data.table::set(x = output.freq.prob.dt,
                    i = which(output.freq.prob.dt[,outcome==1]),
                    j = 'probs',
                    value = output.freq.prob.dt[outcome==2, probs])

  }

  return(output.freq.prob.dt)
}

# x, y: the x and y coordinates of the hull points
# n: the number of points in the curve.
#'@export
bezierCurve <- function(x, y, n=10)
{
  outx <- NULL
  outy <- NULL

  i <- 1
  for (t in seq(0, 1, length.out=n))
  {
    b <- bez(x, y, t)
    outx[i] <- b$x
    outy[i] <- b$y

    i <- i+1
  }


  return (list(x=outx, y=outy))
}

bez <- function(x, y, t)
{
  outx <- 0
  outy <- 0
  n <- length(x)-1
  for (i in 0:n)
  {
    outx <- outx + choose(n, i)*((1-t)^(n-i))*t^i*x[i+1]
    outy <- outy + choose(n, i)*((1-t)^(n-i))*t^i*y[i+1]
  }

  return (list(x=outx, y=outy))
}



#Modifies a DAOH distribution (provided as a set of probabilities) by
#manipulating it until a certain quantile is equal to the provided.
#Multiplies the distribution with a straight line, from minMult to maxMult.

#' Modify a DAOH distribution, until at.quantile is target.quantile.val.
#' Uses data.table so that things can be done by reference.
#'
#' @param x Possible values for daoh
#' @param prob Probability of each daoh value
#' @param target.quantile.val The target value for at.quantile
#' @param relative.quantile.val Value that the target quantile should be relative to its current value.
#' @param at.quantile The quantile at which to insert a difference (default: 0.5).
#' @param multiple.per.step Probabilities will be adjusted as a function of this number.
#' @param output.to.console Print progress to console.
#' @param empirical.dist Calculate quantiles for calculating cost empirically (slower)
#' @param n.emp Number of samples that will be generated when empirically finding quantiles
#'
#' @return The modified distribution as a list containing values and weights.
#'
#' @export
modify.daoh.distribution = function(x = 0:90,
                                    prob = rep(1,90),
                                    target.quantile.val = NULL,
                                    relative.quantile.val = NULL,
                                    at.quantile = .5,
                                    multiple.per.step = .01,
                                    output.to.console = T,
                                    empirical.dist = F,
                                    n.emp = 100000) {

  #Set target quantile if provided in relative terms.
  if (is.null(target.quantile.val) & is.null(relative.quantile.val)) {
    stop('Wait a second... You didn\'t give modify.daoh.distribution a value to modify the quantile...')
  } else if (is.null(target.quantile.val) & !is.null(relative.quantile.val)) {
    if (empirical.dist) {
      target.quantile.val = quantile.dist.emp(x = x,
                                              prob = prob,
                                              quantile = at.quantile,
                                              n = n.emp) - relative.quantile.val
    } else {
      target.quantile.val = quantile.dist(x = x,
                                          prob = prob,
                                          quantile = at.quantile) + relative.quantile.val
    }
  }

  maxIterations = 1000
  iterations = 1
  tolerance = .01

  cost = 1000
  refineFurther = T

  #Do a few iterations after reaching the first distribution satisfying the criteria.
  lastCosts = rep(NA_real_,10)

  # targetQuantileIndex = which(x == target.quantile.val)
  best.cost = cost
  best.mean.last.costs = Inf
  best.x = NA_real_
  best.prob = NA_real_

  #Loop through modifying the distribution
  # while (( abs(cost) > tolerance) & (iterations < maxIterations)) {
  # print(( abs(cost) > tolerance) & (mean(abs(lastCosts))  tolerance) & (iterations < maxIterations))
  while (refineFurther & (iterations < maxIterations)) {

    if (empirical.dist) {
      current.quantile.val = quantile.dist.emp(x=x, prob=prob, quantile = at.quantile, n=n.emp)
    } else {
      current.quantile.val = quantile.dist(x=x, prob=prob, quantile = at.quantile)
    }
    # print(x)
    # print(prob)
    cost = current.quantile.val - target.quantile.val
    cost = unname(cost)

    lastCosts[iterations %% (length(lastCosts)+1)] = cost

    mean.last.costs = mean(lastCosts[!is.na(lastCosts)])
    #TODO: Could do something with zeroes to make things more organic?

    #If current quantile is greater than target, then increase probability of
    #lower values, in proportion to current value.
    if (cost > 0) {
      mult = (max(prob)/prob)^multiple.per.step
    } else if (cost < 0) {
      mult = 1/((max(prob)/prob)^multiple.per.step)
    } else if (cost == 0) {
      if (mean(lastCosts[!is.na(lastCosts)]) > 0) {
        mult = (max(prob)/prob)^(multiple.per.step/2)
      } else if (mean(lastCosts[!is.na(lastCosts)]) < 0) {
        mult = 1/((max(prob)/prob)^(multiple.per.step/2))
      }
    }
    prob = prob * mult

    #Remove pesky NaNs that crop up if the probability is zero
    prob[is.nan(prob)] = 0

    #Ensure that current cost is below tolerance, and that the last n were on average.
    refineFurther = !((abs(cost) <= tolerance) & (abs(mean(lastCosts[!is.na(lastCosts)])) <= tolerance))

    # #Check quantile value empirically if the function thinks that it's done.
    # if (!refineFurther) {
    #   if (abs(quantile.dist.emp(x=x, prob=prob, quantile = at.quantile) - target.quantile.val)) {
    #     refineFurther == T
    #   }
    # }

    #Check if it's a better one than the next best.
    if (abs(cost) <= abs(best.cost) | (abs(cost) == abs(best.cost) & (abs(mean.last.costs) < abs(best.mean.last.costs)))) {
      best.cost = cost
      best.mean.last.costs = mean.last.costs
      best.x = x
      best.prob = prob
    }

    iterations = iterations + 1


    if (output.to.console) {
      message(paste("Iteration:",iterations))
      message(paste("Target:",target.quantile.val))
      message(paste("Current:",current.quantile.val))
      message(paste("Last costs:", paste(lastCosts[!is.na(lastCosts)], collapse = ', ')))
      message(paste("Current cost:",cost))
      message(paste("Mean last costs:",mean.last.costs))
      message('')
    }

    if (iterations == maxIterations) {
      message(paste(maxIterations, 'iterations reached.'))
      if (output.to.console) {
        message(paste("Best cost:",best.cost))
        message(paste("Best mean last costs:",best.mean.last.costs))
      }
      return(list(x = best.x,
                  prob = best.prob))
    }

  }

  return(list(x = x,
              prob = prob))
}


#' Find a quantile from a sampling distribution.
#'
#' @param x The values.
#' @param prob The weights (same length as values).
#' @param quantile The quantile to find.
#' @return Value at requested quantile.
#' 
#' @export quantile.dist
quantile.dist = function(x, prob, quantile = 0.5) {
  cs = cumsum(prob)/sum(prob)
  csq = cs-quantile
  x[which(csq == min((csq)[csq > 0]))][1]
}

#' Empirically a quantile from a sampling distribution.
#'
#' @param x The values.
#' @param prob The weights (same length as values).
#' @param quantile The quantile to find.
#' @param n The number of random values to generate.
#' @return Value at requested quantile.
#'
#' @export quantile.dist.emp
quantile.dist.emp = function(x, prob, quantile = 0.5, n=1000000) {
  unname(quantile(sample(x = x, prob = prob, size = n, replace = T), probs = quantile))
}

#' Generate a timed.sampling.dist object from an input distribution (with
#' timings obvs)
#'
#' @param input.dt The (data.table) input distribution to emulate with outcome
#'   and time columns
#' @param outcome.col.name The name of the outcome column
#' @param n.sampling.period How many sampling periods?
#' @param sampling.period.in.start At what time should the sampling start?
#'   Should be integer.
#' @param sampling.period.in.length How long should each time period be in the
#'   output? Should be integer.
#' @param sampling.period.out.start At what time should the probability
#'   distribution in the output start? Should be integer.
#' @param sampling.period.out.start How long should each period be in the
#'   output? Should be integer.
#' @param smooth.func What function should be used for smoothing? Mainly made
#'   for DAOH, so that's default. Should be integer.
#' @param values The value that can the sampling distribution can take (default
#'   min(input.dist):max(input.dist), which will fail if not integer input)
#' @return A Timed.Sampling.Dist object
#'
#' @export
generate.timed.sampling.dist = function(input.dt,
                                        outcome.col.name,
                                        n.sampling.period = 1,
                                        sampling.period.in.start = NULL,
                                        sampling.period.in.length = NULL,
                                        sampling.period.out.start = NULL,
                                        sampling.period.out.length = NULL,
                                        smooth.func = smooth.daoh,
                                        values = NULL) {

  #Append absent DAOH values if necessary
  if (length(values) == 0) {
    values = min(input.dt[,get(outcome.col.name)]):max(input.dt[,get(outcome.col.name)])
  }


  #Check for required column names.
  required.col.names = c('time',outcome.col.name)
  if (!Reduce('&', required.col.names %in% colnames(input.dt))) {
    stop(paste0('Input distribution missing ',
                paste0(required.col.names[required.col.names %in% colnames(input.dt)], collapse = ', ')))
  }
  #Try and set missing parameters automatically.
  #Generate only one period if no input parameters.
  time.min = min(input.dt$time)
  time.max = max(input.dt$time)
  if (is.null(sampling.period.in.start)) {
    sampling.period.in.start = time.min
  }
  if (is.null(sampling.period.in.start)) {
    sampling.period.in.length = time.max - time.min
  }
  #Copy input parameters if no output parameters
  if (is.null(sampling.period.out.start)) {
    sampling.period.out.start = sampling.period.in.start
  }
  if (is.null(sampling.period.out.length)) {
    sampling.period.out.length = sampling.period.in.length
  }

  #When should each period start? (plus one for the end of the last period)
  sampling.period.in.start.times = seq(from = sampling.period.in.start,
                                       to = sampling.period.in.start + sampling.period.in.length*(n.sampling.period),
                                       by = sampling.period.in.length)


  #When should each period start? (plus one for the end of the last period)
  # sampling.period.out.start.times = seq()
  sampling.period.out.start.times = seq(from = sampling.period.out.start,
                                        to = sampling.period.out.start + sampling.period.out.length*(n.sampling.period),
                                        by = sampling.period.out.length)

  #Ensure the last period encompasses the last day.
  sampling.period.out.start.times[length(sampling.period.out.start.times)] = sampling.period.out.start.times[length(sampling.period.out.start.times)]+2
  # print(sampling.period.out.start.times)


  #Create a new timed sampling distribution
  tsd = new('Timed.Sampling.Dist')
  #Iterate through requested time periods
  for (period.ind in 1:n.sampling.period) {
    #Get beginning and end of both input and output periods.
    time.begin.in = sampling.period.in.start.times[period.ind]
    time.end.in = sampling.period.in.start.times[period.ind+1]-1
    time.begin.out = sampling.period.out.start.times[period.ind]
    time.end.out = sampling.period.out.start.times[period.ind+1]-1

    #Get the values from the time period.
    input.dist = input.dt[time >= time.begin.in & time < time.end.in,
                          get(outcome.col.name)]

    #Smooth the distribtion.
    dist = generate.smoothed.sampling.dist(input.dist, values = values)

    #Add to sampling distribution.
    tsd$add.sampling.dist(values = dist$x,
                          weights = dist$prob,
                          time.begin = time.begin.out,
                          time.end = time.end.out)
  }
#   print(time.begin.out)
#   print(time.end.out)
# asassas
  return(tsd)
}

#'Modify a timed DAOH distribution.
#'
#'If there is no split at time.begin and/or time.end, one will be inserted. Each
#'period between mod.time.begin and mod.time.end will then be individually
#'modified.
#'
#'
#'@param tsd The Timed.Sampling.Distribution object.
#'@param mod.time.begin The time to start the modification.
#'@param mod.time.end The time at which to end the modification.
#'@param target.quantile.val The target value for the quantile of interest (by
#'  default the median)
#'@param ... Arguments to be sent to modify.daoh.distribution
#'
#'@export
modify.timed.daoh.distribution = function(tsd, mod.time.begin, mod.time.end, target.quantile.val = NULL, relative.quantile.val = NULL, ...) {
  #Insert splits if needed.
  tsd$split.period(mod.time.begin, silent = T)
  tsd$split.period(mod.time.end, silent = T)
  #Get periods from requested times.
  mod.period.sampling.dt = tsd$sampling.dt[mod.time.begin <= time.begin & mod.time.end > time.end,]
  #Remove from the actual sampling period
  remaining.sampling.dt = tsd$sampling.dt[!(mod.time.begin <= time.begin & mod.time.end > time.end),]
  #Modify all distributions in selected time period.
  mod.period.sampling.dt[,
                         weights := modify.daoh.distribution(x = .SD[,values],
                                                             prob = .SD[,weights],
                                                             target.quantile.val = target.quantile.val,
                                                             relative.quantile.val = relative.quantile.val,
                                                             multiple.per.step = 0.01,
                                                             empirical.dist = F,
                                                             ...)$prob,
                         by=.(time.begin,time.end)]

  tsd$sampling.dt = rbind(remaining.sampling.dt,mod.period.sampling.dt)
}


#' Generate the timed sampling distributions from real data.
#'
#' Will also set the patients per unit time.
#'
#' @param this.site.dt The data as a data.table, with time and outcome columns.
#' @param n.sampling.period How many periods should the distribution be divided
#'   into?
#' @param outcome.col.name Name of outcome column.
#' @param sampling.period.out.start Time to start the timed sampling dist
#' @param sampling.period.out.length Length of each period in timed sampling
#'   dist
#' @param values Values that timed sampling dist can take.
#' @param smooth.func How to smooth the distribution.
#' @param site The SW.Site object to set by reference
#'
#' @return The timed sampling distribution (note however that it is set in the
#'   site by reference if site is not NULL)
#'
#' @export
generate.site.tsd = function(this.site.dt,
                             n.sampling.period,
                             outcome.col.name,
                             sampling.period.out.start,
                             sampling.period.out.length,
                             values = 0:89,
                             smooth.func = smooth.daoh,
                             site = NULL) {
  
  total.time.length = floor((this.site.dt[,max(time)] - this.site.dt[,min(time)]))
  
  
  tsd = generate.timed.sampling.dist(input.dt = this.site.dt,
                                     outcome.col.name = outcome.col.name,
                                     n.sampling.period = n.sampling.period,
                                     sampling.period.in.start = this.site.dt[,min(time)],
                                     sampling.period.in.length = total.time.length/n.sampling.period,
                                     sampling.period.out.start = sampling.period.out.start,
                                     sampling.period.out.length = sampling.period.out.length,
                                     smooth.func = smooth.func,
                                     values = values)
  
  if(!is.null(site)) {
    site$set.timed.sampling.dist(tsd)
    site$set.sim.ppt.per.unit.time(nrow(this.site.dt)/total.time.length)
  }

  return(tsd)
}

#' Generate timed sampling distributions for all sites in a
#'
#' @param input.dt Data for all sites as data.table
#' @param site.dt Data.table about sites and their clusters.
#' @param cluster.dt Data.table about clusters and when they start/stop
#' @param n.sampling.period How many periods should the distribution be divided
#'   into?
#' @param outcome.col.name Name of outcome column.
#' @param sampling.period.out.start Time to start the timed sampling dist
#' @param sampling.period.out.length Length of each period in timed sampling
#'   dist
#' @param values Values that timed sampling dist can take.
#' @param smooth.func How to smooth the distribution.
#' @param output.to.console Output progress to console.
#'
#' @return The timed sampling distribution (note however that it is set in the
#'   site by reference)
#'
#' @export
generate.all.site.tsds = function(input.dt, site.dt, cluster.dt, n.sampling.period, outcome.col.name, values = 0:89, smooth.func = smooth.daoh, output.to.console = F) {

  site.cluster.dt = merge(site.dt, cluster.dt, by = 'cluster.id') #Merge implementation dates onto sites
  # for (site.ind in 9:9) {
  for (site.ind in 1:nrow(site.dt)) {
    if (output.to.console) {
      message(site.ind)
      message(site.cluster.dt[site.ind, site.id])
      message(site.cluster.dt[site.ind, site.name])
    }
    this.site.dt = input.dt[site.id == site.cluster.dt[site.ind, site.id]]
    if (nrow(this.site.dt) == 0) {
      ms = paste0("No data for ", site.cluster.dt[site.ind, site.name], ' (', site.cluster.dt[site.ind, site.id], ')')
      message(ms)
      warning(ms)
      next
    }

    cl.id = site.cluster.dt[site.ind, cluster.id]

    site = site.cluster.dt[site.ind, site.obj][[1]]

    sampling.period.out.start = as.numeric(site.cluster.dt[site.ind, cluster.start.time])
    sampling.period.out.length = floor((site.cluster.dt[site.ind, cluster.end.time] - site.cluster.dt[site.ind, cluster.start.time])/n.sampling.period)



    generate.site.tsd(
      this.site.dt = this.site.dt,
      n.sampling.period = n.sampling.period,
      outcome.col.name = outcome.col.name,
      sampling.period.out.start = sampling.period.out.start,
      sampling.period.out.length = sampling.period.out.length,
      values = 0:88,
      smooth.func = smooth.daoh,
      site = site
    )


  }

}
mattmoo/daohtools documentation built on Feb. 5, 2023, 5:38 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mattmoo/daohtools
Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

R/simulate.R
In mattmoo/daohtools: Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

Defines functions modify.timed.daoh.distribution generate.timed.sampling.dist quantile.dist.emp quantile.dist modify.daoh.distribution bez bezierCurve smooth.daoh generate.smoothed.sampling.dist

Documented in generate.smoothed.sampling.dist generate.timed.sampling.dist modify.daoh.distribution modify.timed.daoh.distribution quantile.dist quantile.dist.emp smooth.daoh

R Package Documentation

Browse R Packages

We want your feedback!

mattmoo/daohtools Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

R/simulate.R In mattmoo/daohtools: Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

Defines functions modify.timed.daoh.distribution generate.timed.sampling.dist quantile.dist.emp quantile.dist modify.daoh.distribution bez bezierCurve smooth.daoh generate.smoothed.sampling.dist

Documented in generate.smoothed.sampling.dist generate.timed.sampling.dist modify.daoh.distribution modify.timed.daoh.distribution quantile.dist quantile.dist.emp smooth.daoh

R Package Documentation

Browse R Packages

We want your feedback!

mattmoo/daohtools
Tools for working with registry data and transforming to days alive and out of hospital (DAOH)

R/simulate.R
In mattmoo/daohtools: Tools for working with registry data and transforming to days alive and out of hospital (DAOH)