# R/adjustcarryforward.R
#
# Defines functions: acf_answers, calc_step_15_no_param, check_cf_string,
# calc_temp_exclusion_15, na.as.false
#
# Documented in acf_answers

### Define the helper function and the carryforward adjustment function ###
# Segments labeled with ADJUSTCF EDIT in comments are areas where substantive
# changes were made to the original cleangrowth logic. There are other changes
# as well that are not specifically called out.

# helper: coerce NA entries of a (typically logical) vector to FALSE,
# leaving all other elements and any names untouched
na.as.false <- function(v) {
  replace(v, is.na(v), FALSE)
}

# function to calculate temporary exclusion for step 15 (a - q)
#
# Computes the step-15 (a-q) temporary height-change exclusions for a single
# subject's ordered, valid height records. ADJUSTCF EDIT: the min/max
# height-velocity band constants of the original cleangrowth step 15 are
# passed in as parameters here instead of being hard-coded.
#
# Arguments:
#   df             data.table with one row per valid height measurement for a
#                  single subject, ordered by the `index` column; must contain
#                  agedays, v (the measurement), tbc.sd, sex, and index
#   ewma.fields    names of the EWMA result columns populated by ewma()
#   tanner.ht.vel  reference table keyed by (sex, tanner.months) supplying
#                  min.ht.vel and max.ht.vel
#   who.ht.vel     reference table keyed by (sex, whoagegrp.ht) supplying the
#                  whoinc.*.ht and max.whoinc.*.ht columns
#   exclude.levels ordered factor levels used for the temp.exclude column
#   ewma.exp       exponent used to weight the EWMA calculation
#   minfactor, maxfactor, banddiff, banddiff_plus,
#   min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over
#                  tuning parameters for the allowed height-change bands
#                  (under/over refer to gaps below/above 365.25 days)
#
# Returns: the data.table (re-keyed by index after the reference merges) with
#   temp.diff and temp.exclude populated for rows meeting the 15q criteria;
#   temp.exclude stays NA for rows not flagged.
calc_temp_exclusion_15 <- function(
  df,
  ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
  ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
  min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over){

  # avoid "no visible binding" warnings
  abs.tbc.sd <- abs.tbc.sd.next <- abs.tbc.sd.prev <- aft.g.befp1 <- NULL
  agedays <- agedays.next <- bef.g.aftm1 <- delta.agedays.next <- NULL
  delta.next.ht <- delta.prev.ht <- dewma.after <- dewma.after.prev <- NULL
  dewma.before <- dewma.before.next <- ewma.after <- ewma.all <- ewma.before <- NULL
  ht.exp <- index <- max.ht.vel <- max.whoinc.1.ht <- max.whoinc.2.ht <- max.whoinc.3.ht <- NULL
  max.whoinc.4.ht <- max.whoinc.6.ht <- maxdiff.next.ht <- maxdiff.prev.ht <- NULL
  mid.agedays <- min.ht.vel <- mindiff.next.ht <- mindiff.prev.ht <- pair <- NULL
  pair.next <- pair.prev <- sex <- tanner.months <- tbc.sd <- temp.diff <- temp.exclude <- NULL
  v <- v.next <- v.prev <- who.maxdiff.next.ht <- who.mindiff.next.ht <- whoagegrp.ht <- NULL
  whoinc.1.ht <- whoinc.2.ht <- whoinc.3.ht <- whoinc.4.ht <- whoinc.6.ht <- NULL
  whoinc.age.ht <- NULL


  # initialize fields
  df[, (ewma.fields) := as.double(NaN)]
  df[, `:=`(
    v.prev = as.double(NaN),
    v.next = as.double(NaN),
    dewma.after.prev = as.double(NaN),
    dewma.before.next = as.double(NaN),
    abs.tbc.sd.prev = as.double(NaN),
    abs.tbc.sd.next = as.double(NaN),
    agedays.next = as.integer(NaN),
    abs.2ndlast.sd = as.double(NaN),
    mindiff.prev.ht = as.double(NaN),
    mindiff.next.ht = as.double(NaN),
    maxdiff.prev.ht = as.double(NaN),
    maxdiff.next.ht = as.double(NaN),
    pair.prev = FALSE,
    pair.next = FALSE
  )]

  # ewma fields are needed later -- calculate now for efficiency
  df[, (ewma.fields) := ewma(agedays, tbc.sd, ewma.exp, TRUE)]

  # calculate some useful values (e.g. dewma values and tbc.sd) for use in later steps
  df[, `:=`(
    dewma.all = tbc.sd - ewma.all,
    dewma.before = tbc.sd - ewma.before,
    dewma.after = tbc.sd - ewma.after,
    abs.tbc.sd = abs(tbc.sd)
  )]

  # 15a.  As with steps 11 and 14, only one value will be excluded per round, and the step will be repeated until there are no more values to exclude
  # b.  For each height, calculate the d_age=agedays of next value-agedays of current value
  # NOTE: obtain next measurement, ewma.before and abs.tbc.sd as well since they are needed later

  # for efficiency, bring get.next inline here (working on valid rows within a single parameter for a single subject)
  # structure c(field.name[-1], NA) == get.next
  df[, `:=`(
    agedays.next = c(agedays[-1], NA),
    v.next = c(v[-1], NA),
    dewma.before.next = c(dewma.before[-1], NA),
    abs.tbc.sd.next = c(abs.tbc.sd[-1], NA)
  )]
  df[, delta.agedays.next := agedays.next - agedays]

  # 15c.	For each height, calculate mid_agedays=0.5*(agedays of next value + agedays of current value)
  df[, mid.agedays := 0.5 * (agedays.next + agedays)]

  # 15d.	Generate variable tanner_months= 6+12*(round(mid_agedays/365.25))
  # only calculate for rows that relate to height (may speed up subsequent processing)
  df[, tanner.months := 6 + 12 * (round(mid.agedays / 365.25))]

  # 15e.	Merge with dataset tanner_ht_vel using sex and tanner_months – this will give you min_ht_vel and max_ht_vel
  setkey(df, sex, tanner.months)
  df <- tanner.ht.vel[df]

  # 15f.	Calculate the following:
  #   i.	mindiff_ht=0.5*min_ht_vel*(d_agedays/365.25)^2-3 if d_agedays<365.25
  #   ii.	replace mindiff_ht=0.5*min_ht_vel-3 if d_agedays>365.25
  # (generalized here via minfactor/banddiff and the min_ht.exp_* exponents)
  df[, ht.exp := ifelse(delta.agedays.next < 365.25,
                        min_ht.exp_under,
                        min_ht.exp_over)]
  df[, `:=`(maxdiff.next.ht = as.double(NA),
            mindiff.next.ht = as.double(NaN))]
  df[, mindiff.next.ht := minfactor * min.ht.vel * (delta.agedays.next /
                                                      365.25) ^ ht.exp - banddiff]

  # 15f.iii.	maxdiff_ht=2*max_ht_vel*(d_agedays/365.25)^1.5+5.5 if d_agedays>365.25
  #   iv.	replace maxdiff_ht=2*max_ht_vel*(d_agedays/365.25)^0.33+5.5 if d_agedays<365.25
  # (generalized here via maxfactor/banddiff_plus and the max_ht.exp_* exponents)
  df[, ht.exp := ifelse(delta.agedays.next < 365.25,
                        max_ht.exp_under,
                        max_ht.exp_over)]
  df[, maxdiff.next.ht := maxfactor * max.ht.vel * (delta.agedays.next /
                                                      365.25) ^ ht.exp + banddiff_plus]

  # 15g.	Generate variable whoagegrp_ht=agedays/30.4375 rounded to the nearest integer
  df[, whoagegrp.ht := round(agedays / 30.4375)]

  # 15h.	Generate variable whoinc_age_ht based on values of d_agedays_ht according the the following table
  #   d_agedays_ht	whoinc_age_ht
  #   20-45	        1
  #   46-75	        2
  #   76-106	      3
  #   107-152	      4
  #   153-198	      6
  #   All others	  missing
  df[, whoinc.age.ht :=
       ifelse(delta.agedays.next < 20 ,
              NA,
              ifelse(
                delta.agedays.next <= 45,
                1,
                ifelse(
                  delta.agedays.next <= 75,
                  2,
                  ifelse(
                    delta.agedays.next <= 106,
                    3,
                    ifelse(
                      delta.agedays.next <= 152,
                      4,
                      ifelse(delta.agedays.next <= 198, 6, NA)
                    )
                  )
                )
              ))]

  # i.	Merge using sex and whoagegrp_ht using who_ht_vel_3sd and who_ht_maxvel_3sd; this will give you variables whoinc_i_ht and maxwhoinc_i_ht
  #     for various intervals where i is 1,2, 3,4, 6 and corresponds to whoinc_age_ht.
  setkey(df, sex, whoagegrp.ht)
  df <- who.ht.vel[df]

  # restore original sort order (ensures valid.rows variable applies to correct rows)
  setkey(df, index)

  # 15j.	Generate variable who_mindiff_ht=whoinc_i_ht according to the value if whoinc_age_ht; make who_mindiff_ht missing if whoinc_i_ht or whoinc_age_ht is missing.
  df[, who.mindiff.next.ht := ifelse(
    delta.agedays.next < 20 ,
    NA,
    ifelse(
      delta.agedays.next <= 45,
      whoinc.1.ht,
      ifelse(
        delta.agedays.next <= 75,
        whoinc.2.ht,
        ifelse(
          delta.agedays.next <= 106,
          whoinc.3.ht,
          ifelse(
            delta.agedays.next <= 152,
            whoinc.4.ht,
            ifelse(delta.agedays.next <= 198, whoinc.6.ht, NA)
          )
        )
      )
    )
  )]

  # 15k.	Generate variable who_maxdiff_ht=max_whoinc_i_ht according to the value if whoinc_age_ht; make who_maxdiff_ht missing if max_whoinc_i_ht or
  #     whoinc_age_ht is missing.
  df[, who.maxdiff.next.ht := ifelse(
    delta.agedays.next < 20 ,
    NA,
    ifelse(
      delta.agedays.next <= 45,
      max.whoinc.1.ht,
      ifelse(
        delta.agedays.next <= 75,
        max.whoinc.2.ht,
        ifelse(
          delta.agedays.next <= 106,
          max.whoinc.3.ht,
          ifelse(
            delta.agedays.next <= 152,
            max.whoinc.4.ht,
            ifelse(delta.agedays.next <= 198, max.whoinc.6.ht, NA)
          )
        )
      )
    )
  )]

  # 15l.	Scale allowed value based on d_agedays_ht:
  #   1.	replace who_mindiff_`p'=who_mindiff_`p'*d_agedays_`p'/(whoinc_age_`p'*30.4375) if d_agedays_`p'<(whoinc_age_`p'*30.4375)
  #   2.	replace who_maxdiff_`p'=who_maxdiff_`p'*d_agedays_`p'/(whoinc_age_`p'*30.4375) if d_agedays_`p'>(whoinc_age_`p'*30.4375)
  df[delta.agedays.next < whoinc.age.ht * 30.4375,
     `:=`(
       who.mindiff.next.ht = who.mindiff.next.ht * delta.agedays.next / (whoinc.age.ht *
                                                                           30.4375),
       who.maxdiff.next.ht = who.maxdiff.next.ht * delta.agedays.next /
         (whoinc.age.ht * 30.4375)
     )]

  # 15m.	Replace mindiff_ht/maxdiff_ht with adjusted WHO value if Tanner value is missing or if both are present and age difference is < 9 months:
  #   1.	replace mindiff_`p'=0.5*who_mindiff_`p'-3 if who_mindiff_`p' is not missing & d_agedays_`p'<(9*30.4375)
  #   2.	replace maxdiff_`p'=2*who_maxdiff_`p'+3 if who_maxdiff_`p' is not missing & d_agedays_`p'<(9*30.4375)
  #   3.	replace mindiff_`p'=0.5*who_mindiff_`p'-3 if mindiff_`p' is missing & who_mindiff_`p' is not missing
  #   4.	replace maxdiff_`p'=2*who_maxdiff_`p'+3 if maxdiff_`p is missing & who_maxdiff_`p' is not missing

  # refactored logic slightly for efficiency
  # NOTE(review): the maxdiff branch uses banddiff (not banddiff_plus) by
  # design, matching the original +3 offset here vs. +5.5 in 15f.iii
  df[!is.na(who.mindiff.next.ht) &
       (delta.agedays.next < 9 * 30.4375 |
          is.na(mindiff.next.ht)),
     `:=`(
       mindiff.next.ht = minfactor * who.mindiff.next.ht - banddiff,
       maxdiff.next.ht = maxfactor * who.maxdiff.next.ht + banddiff
     )]

  # 15m.5.  replace mindiff_`p'=-3 if mindiff_`p' is missing
  df[is.na(mindiff.next.ht), mindiff.next.ht := -3]

  # 15n.	Determine the min/maxdiffs for the previous age: mindiff_prev_ht, maxdiff_prev_ht
  # NOTE: obtain previous height, ewma.after value and abs.tbc.sd as well since they are needed in next steps

  # for efficiency, bring get.prev inline here (working on valid rows within a single parameter for a single subject)
  # structure c(NA, tbc.sd[-.N]) == get.prev
  df[, `:=`(
    v.prev = c(NA, v[-.N]),
    dewma.after.prev = c(NA, dewma.after[-.N]),
    abs.tbc.sd.prev = c(NA, abs.tbc.sd[-.N]),
    mindiff.prev.ht = c(NA, mindiff.next.ht[-.N]),
    maxdiff.prev.ht = c(NA, maxdiff.next.ht[-.N])
  )]

  # 15o.	Determine d_prev_ht=ht-htprev (set to missing for the first value for a subject) and d_next_ht=htnext-ht (set to missing for the last value for a subject)
  df[, `:=`(delta.prev.ht = v - v.prev,
            delta.next.ht = v.next - v)]

  # 15p.  Perform a EWMA calculation with the following modifications:
  #  i.	  Generate a variable pair=1 if (d_prev_ht<mindiff_prev_ht OR d_ht<mindiff_ht OR d_prev_ht>maxdiff_prev_ht  OR d_ht>maxdiff_ht) AND exc_ht==0
  df[, pair := na.as.false(
    delta.prev.ht < mindiff.prev.ht |
      delta.next.ht < mindiff.next.ht |
      delta.prev.ht > maxdiff.prev.ht |
      delta.next.ht > maxdiff.next.ht
  )]

  # for efficiency, bring get.prev and get.next inline here (working on valid rows within a single parameter for a single subject)
  # structure c(NA, field.name[-.N]) == get.prev
  # structure c(field.name[-1], NA) == get.next
  df[, `:=`(pair.prev = c(FALSE, pair[-.N]),
            pair.next = c(pair[-1], FALSE))]

  #  ii.	Generate bef_g_aftm1=1 if |Δewma_htbef| for the value of interest is greater than |Δewma_htaft| for the previous value
  #       AND the value of interest is not the first height value for that subject AND pair==1 AND pair for the previous value==1

  #  iii.	Generate aft_g_befp1=1 if |Δewma_htaft| for the value of interest is greater than |Δewma_htbef| for the next value
  #       AND the value of interest is not the last height value for that subject AND pair==1 AND pair for the next value==1
  # NOTE: pair.next will be NA last height, which will result in a FALSE value below
  df[, `:=`(
    bef.g.aftm1 = na.as.false(
      abs(dewma.before) > abs(dewma.after.prev)  & pair & pair.prev
    ),
    aft.g.befp1 = na.as.false(
      abs(dewma.after)  > abs(dewma.before.next) & pair & pair.next
    )
  )]

  #  iv.	Determine tbchtsd for each value as well as the one before prev_tbchtsd and after next_tbchtsd it
  # NOTE: done previously for efficiency

  # 15p.v.  Determine the total number of ht values for each subject (tot_ht)
  # NOTE: all rows are valid due to constraint in subj.df[...] statement
  # FIX(review): this was `num.valid = .N`, but `.N` is only defined inside a
  # data.table `[...]` frame; at function scope it resolves to data.table's
  # exported NULL placeholder, making `num.valid == 2` logical(0) and silently
  # disabling the 15q.v-viii criteria below. Use nrow(df) instead.
  num.valid <- nrow(df)

  # 15q.	Identify a value for possible exclusion if one of the following sets of criteria are met. For values identified by each set of criteria determine
  #       the value of temp_diff using the formula given
  #   i.	d_prev_ht<mindiff_prev_ht & bef_g_aftm1_ht==1 & exc_ht==0 & mindiff_prev_ht is not missing
  #     a.  (temp_diff=|dewma_ht_bef|)
  df[, temp.diff := as.double(NaN)]
  df[, temp.exclude := factor(NA, levels = exclude.levels, ordered =
                                TRUE)]
  df[delta.prev.ht < mindiff.prev.ht & bef.g.aftm1,
     `:=`(temp.diff = abs(dewma.before),
          temp.exclude = 'Exclude-Min-Height-Change')]

  #   ii.	d_ht<mindiff_ht & aft_g_befp1_ht==1 & exc_ht==0 & mindiff_ht is not missing
  #     a.	(temp_diff=|dewma_ht_aft|)
  df[delta.next.ht < mindiff.next.ht & aft.g.befp1,
     `:=`(temp.diff = abs(dewma.after),
          temp.exclude = 'Exclude-Min-Height-Change')]

  #   iii.	d_prev_ht>maxdiff_prev_ht & bef_g_aftm1_ht==1 & exc_ht==0 & mindiff_prev_ht is not missing
  #     a.  (temp_diff=|dewma_ht_bef|)
  df[delta.prev.ht > maxdiff.prev.ht & bef.g.aftm1,
     `:=`(temp.diff = abs(dewma.before),
          temp.exclude = 'Exclude-Max-Height-Change')]

  #   iv.	d_ht>maxdiff_ht & aft_g_befp1_ht==1 & exc_ht==0 & mindiff_ht is not missing
  #     a.  (temp_diff=|dewma_ht_aft|)
  df[delta.next.ht > maxdiff.next.ht & aft.g.befp1,
     `:=`(temp.diff = abs(dewma.after),
          temp.exclude = 'Exclude-Max-Height-Change')]

  #   v.	d_prev_ht<mindiff_prev_ht & tot_ht==2 & |tbchtsd|>|prev_tbchtsd|
  #     a. for v-viii temp_diff is kept as missing
  #   vi. d_ht<mindiff_ht & tot_ht==2 & |tbchtsd|>|next_tbchtsd|
  df[delta.prev.ht < mindiff.prev.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.prev
     |
       delta.next.ht < mindiff.next.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.next,
     temp.exclude := 'Exclude-Min-Height-Change']

  #   vii.	d_prev_ht>maxdiff_prev_ht & tot_ht==2 & |tbchtsd|>|prev_tbchtsd|
  #   viii. d_ht>maxdiff_ht & tot_ht==2 & |tbchtsd|>|next_tbchtsd|
  df[delta.prev.ht > maxdiff.prev.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.prev
     |
       delta.next.ht > maxdiff.next.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.next,
     temp.exclude := 'Exclude-Max-Height-Change']

  return(df)
}

# Walk a string of carried-forward height values one at a time, scoring each
# against the anchoring "Include" before it (and the first "Include" after it,
# when one exists) via calc_temp_exclusion_15. Stops at the first value that
# earns an exclusion, or when the string is exhausted.
check_cf_string <- function(
  eval_df, wh_exclude,
  ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
  ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
  min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over){

  total_rows <- nrow(eval_df)

  # first "Include" after the flagged row; sentinel (total_rows + 1) if absent
  incl_after <- which(
    eval_df$orig.exclude[(wh_exclude + 1):total_rows] == "Include"
  )[1] + wh_exclude
  if (is.na(incl_after)){
    incl_after <- total_rows + 1
  }

  # last "Include" at or before the flagged row anchors the string
  incl_before <- which(eval_df$orig.exclude[1:wh_exclude] == "Include")
  incl_before <- tail(incl_before, 1)

  # probe each carried-forward row in turn until one is excluded or the
  # string runs out
  verdict <- "Include"
  probe <- incl_before + 1
  while (verdict == "Include" & probe < incl_after){
    # assemble the anchor / candidate (/ trailing include) rows to score
    row_ids <- if (incl_after <= total_rows){
      c(incl_before, probe, incl_after)
    } else {
      c(incl_before, probe)
    }
    trio <- copy(eval_df[row_ids, ])

    scored <- calc_temp_exclusion_15(
      copy(trio),
      ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
      ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
      min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over)

    # the candidate sits in row 2; NA temp.exclude means it survives
    verdict <- if (is.na(scored$temp.exclude[2])) "Include" else "Exclude"

    probe <- probe + 1
  }

  return(list("verdict" = verdict,
              "cf_ind" = probe,
              "next_incl" = incl_after,
              "first_incl" = incl_before))
}

# function to calculate step 15 as in the original algorithm (no parameters)
# eval type: definitely "exclude" or "include"
#
# Computes the step-15 temporary height exclusions for a single subject's
# ordered, valid height records using the ORIGINAL cleangrowth constants
# (no band parameters). eval_type selects the band formulas:
#   "exclude" - original step-15 bands (0.5*min_ht_vel - 3 / 2*max_ht_vel + 5.5)
#   otherwise - 2sd-based bands (min.ht.vel.2sd / max.ht.vel.2sd from the
#               tanner table), used when evaluating values to keep
#
# Arguments mirror calc_temp_exclusion_15 minus the band parameters; df must
# contain agedays, v, tbc.sd, sex and index columns.
#
# Returns: the temp.exclude factor vector (NA for rows not flagged).
calc_step_15_no_param <- function(
  df,
  eval_type = "exclude",
  ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
  ewma.exp){

  # avoid "no visible binding for global variable" warnings
  agedays <- tbc.sd <- ewma.all <- ewma.before <- ewma.after <- v <- dewma.before <- NULL
  abs.tbc.sd <- sex <- tanner.months <- ht.exp <- delta.agedays.next <- mindiff.next.ht <- NULL
  min.ht.vel <- maxdiff.next.ht <- max.ht.vel <- minhtvel.exp <- min.ht.vel.2sd <- NULL
  max.ht.vel.2sd <- whoagegrp.ht <- whoinc.age.ht <- index <- who.mindiff.next.ht <- NULL
  whoinc.1.ht <- whoinc.2.ht <- whoinc.3.ht <- whoinc.4.ht <- whoinc.6.ht <- NULL
  who.maxdiff.next.ht <- max.whoinc.1.ht <- max.whoinc.2.ht <- max.whoinc.3.ht <- NULL
  max.whoinc.4.ht <- max.whoinc.6.ht <- dewma.after <- v.prev <- v.next <- pair <- NULL
  delta.prev.ht <- mindiff.prev.ht <- delta.next.ht <- maxdiff.prev.ht <- NULL
  dewma.after.prev <- pair.prev <- dewma.before.next <- pair.next <- temp.diff <- NULL
  bef.g.aftm1 <- aft.g.befp1 <- abs.tbc.sd.prev <- abs.tbc.sd.next <- temp.exclude <- NULL
  agedays.next <- mid.agedays <- NULL

  # initialize fields
  df[, (ewma.fields) := as.double(NaN)]
  df[, `:=`(
    v.prev = as.double(NaN),
    v.next = as.double(NaN),
    dewma.after.prev = as.double(NaN),
    dewma.before.next = as.double(NaN),
    abs.tbc.sd.prev = as.double(NaN),
    abs.tbc.sd.next = as.double(NaN),
    agedays.next = as.integer(NaN),
    abs.2ndlast.sd = as.double(NaN),
    mindiff.prev.ht = as.double(NaN),
    mindiff.next.ht = as.double(NaN),
    maxdiff.prev.ht = as.double(NaN),
    maxdiff.next.ht = as.double(NaN),
    pair.prev = FALSE,
    pair.next = FALSE
  )]

  # ewma fields are needed later -- calculate now for efficiency
  df[, (ewma.fields) := ewma(agedays, tbc.sd, ewma.exp, TRUE)]

  # calculate some useful values (e.g. dewma values and tbc.sd) for use in later steps
  df[, `:=`(
    dewma.all = tbc.sd - ewma.all,
    dewma.before = tbc.sd - ewma.before,
    dewma.after = tbc.sd - ewma.after,
    abs.tbc.sd = abs(tbc.sd)
  )]

  # 15a.  As with steps 11 and 14, only one value will be excluded per round, and the step will be repeated until there are no more values to exclude
  # b.  For each height, calculate the d_age=agedays of next value-agedays of current value
  # NOTE: obtain next measurement, ewma.before and abs.tbc.sd as well since they are needed later

  # for efficiency, bring get.next inline here (working on valid rows within a single parameter for a single subject)
  # structure c(field.name[-1], NA) == get.next
  df[, `:=`(
    agedays.next = c(agedays[-1], NA),
    v.next = c(v[-1], NA),
    dewma.before.next = c(dewma.before[-1], NA),
    abs.tbc.sd.next = c(abs.tbc.sd[-1], NA)
  )]
  # NOTE(review): the next three assignments used `df$x <- with(df, ...)`;
  # converted to `:=` for consistency with calc_temp_exclusion_15
  df[, delta.agedays.next := agedays.next - agedays]

  # 15c.	For each height, calculate mid_agedays=0.5*(agedays of next value + agedays of current value)
  df[, mid.agedays := 0.5 * (agedays.next + agedays)]

  # 15d.	Generate variable tanner_months= 6+12*(round(mid_agedays/365.25))
  # only calculate for rows that relate to height (may speed up subsequent processing)
  df[, tanner.months := 6 + 12 * (round(mid.agedays / 365.25))]

  # 15e.	Merge with dataset tanner_ht_vel using sex and tanner_months – this will give you min_ht_vel and max_ht_vel
  setkey(df, sex, tanner.months)
  df <- tanner.ht.vel[df]

  if (eval_type == "exclude"){
    # 15f.	Calculate the following:
    #   i.	mindiff_ht=0.5*min_ht_vel*(d_agedays/365.25)^2-3 if d_agedays<365.25
    #   ii.	replace mindiff_ht=0.5*min_ht_vel-3 if d_agedays>365.25
    df[, ht.exp := ifelse(delta.agedays.next < 365.25, 2, 0)]
    df[, `:=`(maxdiff.next.ht = as.double(NA),
              mindiff.next.ht = as.double(NaN))]
    df[, mindiff.next.ht := 0.5 * min.ht.vel * (delta.agedays.next /
                                                  365.25) ^ ht.exp - 3]

    # 15f.iii.	maxdiff_ht=2*max_ht_vel*(d_agedays/365.25)^1.5+5.5 if d_agedays>365.25
    #   iv.	replace maxdiff_ht=2*max_ht_vel*(d_agedays/365.25)^0.33+5.5 if d_agedays<365.25
    df[, ht.exp := ifelse(delta.agedays.next < 365.25, 0.33, 1.5)]
    df[, maxdiff.next.ht := 2 * max.ht.vel * (delta.agedays.next /
                                                365.25) ^ ht.exp + 5.5]
  } else { # we're evaluating ones we want to keep
    # 2sd-based bands: scale the 2sd velocity limits by the (fractional) year
    # gap; min.ht.vel enters only when the gap exceeds a year
    df[, ht.exp := ifelse(delta.agedays.next < 365.25, 1, 0)]
    df[, minhtvel.exp := ifelse(delta.agedays.next < 365.25, 0, 1)]
    df[, `:=`(maxdiff.next.ht = as.double(NA),
              mindiff.next.ht = as.double(NaN))]
    df[, mindiff.next.ht := min.ht.vel.2sd * (min.ht.vel) ^ minhtvel.exp *
         (delta.agedays.next /365.25) ^ ht.exp]

    df[, maxdiff.next.ht := max.ht.vel.2sd * (delta.agedays.next / 365.25)]
  }

  # 15g.	Generate variable whoagegrp_ht=agedays/30.4375 rounded to the nearest integer
  df[, whoagegrp.ht := round(agedays / 30.4375)]

  # 15h.	Generate variable whoinc_age_ht based on values of d_agedays_ht according the the following table
  #   d_agedays_ht	whoinc_age_ht
  #   20-45	        1
  #   46-75	        2
  #   76-106	      3
  #   107-152	      4
  #   153-198	      6
  #   All others	  missing
  df[, whoinc.age.ht := ifelse(delta.agedays.next < 20 ,
                               NA,
                               ifelse(
                                 delta.agedays.next <= 45,
                                 1,
                                 ifelse(
                                   delta.agedays.next <= 75,
                                   2,
                                   ifelse(
                                     delta.agedays.next <= 106,
                                     3,
                                     ifelse(
                                       delta.agedays.next <= 152,
                                       4,
                                       ifelse(delta.agedays.next <= 198, 6, NA)
                                     )
                                   )
                                 )
                               ))]

  # i.	Merge using sex and whoagegrp_ht using who_ht_vel_3sd and who_ht_maxvel_3sd; this will give you variables whoinc_i_ht and maxwhoinc_i_ht
  #     for various intervals where i is 1,2, 3,4, 6 and corresponds to whoinc_age_ht.
  setkey(df, sex, whoagegrp.ht)
  df <- who.ht.vel[df]

  # restore original sort order (ensures valid.rows variable applies to correct rows)
  setkey(df, index)

  # 15j.	Generate variable who_mindiff_ht=whoinc_i_ht according to the value if whoinc_age_ht; make who_mindiff_ht missing if whoinc_i_ht or whoinc_age_ht is missing.
  df[, who.mindiff.next.ht := ifelse(
    delta.agedays.next < 20 ,
    NA,
    ifelse(
      delta.agedays.next <= 45,
      whoinc.1.ht,
      ifelse(
        delta.agedays.next <= 75,
        whoinc.2.ht,
        ifelse(
          delta.agedays.next <= 106,
          whoinc.3.ht,
          ifelse(
            delta.agedays.next <= 152,
            whoinc.4.ht,
            ifelse(delta.agedays.next <= 198, whoinc.6.ht, NA)
          )
        )
      )
    )
  )]

  # 15k.	Generate variable who_maxdiff_ht=max_whoinc_i_ht according to the value if whoinc_age_ht; make who_maxdiff_ht missing if max_whoinc_i_ht or
  #     whoinc_age_ht is missing.
  df[, who.maxdiff.next.ht := ifelse(
    delta.agedays.next < 20 ,
    NA,
    ifelse(
      delta.agedays.next <= 45,
      max.whoinc.1.ht,
      ifelse(
        delta.agedays.next <= 75,
        max.whoinc.2.ht,
        ifelse(
          delta.agedays.next <= 106,
          max.whoinc.3.ht,
          ifelse(
            delta.agedays.next <= 152,
            max.whoinc.4.ht,
            ifelse(delta.agedays.next <= 198, max.whoinc.6.ht, NA)
          )
        )
      )
    )
  )]

  # 15l.	Scale allowed value based on d_agedays_ht:
  #   1.	replace who_mindiff_`p'=who_mindiff_`p'*d_agedays_`p'/(whoinc_age_`p'*30.4375) if d_agedays_`p'<(whoinc_age_`p'*30.4375)
  #   2.	replace who_maxdiff_`p'=who_maxdiff_`p'*d_agedays_`p'/(whoinc_age_`p'*30.4375) if d_agedays_`p'>(whoinc_age_`p'*30.4375)
  df[delta.agedays.next < whoinc.age.ht * 30.4375,
     `:=`(
       who.mindiff.next.ht = who.mindiff.next.ht * delta.agedays.next / (whoinc.age.ht *
                                                                           30.4375),
       who.maxdiff.next.ht = who.maxdiff.next.ht * delta.agedays.next /
         (whoinc.age.ht * 30.4375)
     )]

  # 15m.	Replace mindiff_ht/maxdiff_ht with adjusted WHO value if Tanner value is missing or if both are present and age difference is < 9 months:
  #   1.	replace mindiff_`p'=0.5*who_mindiff_`p'-3 if who_mindiff_`p' is not missing & d_agedays_`p'<(9*30.4375)
  #   2.	replace maxdiff_`p'=2*who_maxdiff_`p'+3 if who_maxdiff_`p' is not missing & d_agedays_`p'<(9*30.4375)
  #   3.	replace mindiff_`p'=0.5*who_mindiff_`p'-3 if mindiff_`p' is missing & who_mindiff_`p' is not missing
  #   4.	replace maxdiff_`p'=2*who_maxdiff_`p'+3 if maxdiff_`p is missing & who_maxdiff_`p' is not missing

  if (eval_type == "exclude"){
    # refactored logic slightly for efficiency
    df[!is.na(who.mindiff.next.ht) &
         (delta.agedays.next < 9 * 30.4375 |
            is.na(mindiff.next.ht)),
       `:=`(
         mindiff.next.ht = 0.5 * who.mindiff.next.ht - 3,
         maxdiff.next.ht = 2.0 * who.maxdiff.next.ht + 3
       )]
  } else { # we're evaluating if we want to keep or not
    # should only be used if age is less than 24 months
    # refactored logic slightly for efficiency
    df[!is.na(who.mindiff.next.ht) &
         agedays < 24 * 30.4375 &
         (delta.agedays.next < 9 * 30.4375 |
            is.na(mindiff.next.ht)),
       `:=`(
         mindiff.next.ht = who.mindiff.next.ht,
         maxdiff.next.ht = who.maxdiff.next.ht
       )]
  }

  # 15m.5.  replace mindiff_`p'=-3 if mindiff_`p' is missing
  df[is.na(mindiff.next.ht), mindiff.next.ht := -3]

  # 15n.	Determine the min/maxdiffs for the previous age: mindiff_prev_ht, maxdiff_prev_ht
  # NOTE: obtain previous height, ewma.after value and abs.tbc.sd as well since they are needed in next steps

  # for efficiency, bring get.prev inline here (working on valid rows within a single parameter for a single subject)
  # structure c(NA, tbc.sd[-.N]) == get.prev
  df[, `:=`(
    v.prev = c(NA, v[-.N]),
    dewma.after.prev = c(NA, dewma.after[-.N]),
    abs.tbc.sd.prev = c(NA, abs.tbc.sd[-.N]),
    mindiff.prev.ht = c(NA, mindiff.next.ht[-.N]),
    maxdiff.prev.ht = c(NA, maxdiff.next.ht[-.N])
  )]

  # 15o.	Determine d_prev_ht=ht-htprev (set to missing for the first value for a subject) and d_next_ht=htnext-ht (set to missing for the last value for a subject)
  df[, `:=`(delta.prev.ht = v - v.prev,
            delta.next.ht = v.next - v)]

  # 15p.  Perform a EWMA calculation with the following modifications:
  #  i.	  Generate a variable pair=1 if (d_prev_ht<mindiff_prev_ht OR d_ht<mindiff_ht OR d_prev_ht>maxdiff_prev_ht  OR d_ht>maxdiff_ht) AND exc_ht==0
  # FIX(review): was `na_as_false`, which is not defined in this file; use the
  # na.as.false helper defined at the top (as calc_temp_exclusion_15 does)
  df[, pair := na.as.false(
    delta.prev.ht < mindiff.prev.ht |
      delta.next.ht < mindiff.next.ht |
      delta.prev.ht > maxdiff.prev.ht |
      delta.next.ht > maxdiff.next.ht
  )]

  # for efficiency, bring get.prev and get.next inline here (working on valid rows within a single parameter for a single subject)
  # structure c(NA, field.name[-.N]) == get.prev
  # structure c(field.name[-1], NA) == get.next
  df[, `:=`(pair.prev = c(FALSE, pair[-.N]),
            pair.next = c(pair[-1], FALSE))]

  #  ii.	Generate bef_g_aftm1=1 if |Δewma_htbef| for the value of interest is greater than |Δewma_htaft| for the previous value
  #       AND the value of interest is not the first height value for that subject AND pair==1 AND pair for the previous value==1

  #  iii.	Generate aft_g_befp1=1 if |Δewma_htaft| for the value of interest is greater than |Δewma_htbef| for the next value
  #       AND the value of interest is not the last height value for that subject AND pair==1 AND pair for the next value==1
  # NOTE: pair.next will be NA last height, which will result in a FALSE value below
  df[, `:=`(
    bef.g.aftm1 = na.as.false(
      abs(dewma.before) > abs(dewma.after.prev)  & pair & pair.prev
    ),
    aft.g.befp1 = na.as.false(
      abs(dewma.after)  > abs(dewma.before.next) & pair & pair.next
    )
  )]

  #  iv.	Determine tbchtsd for each value as well as the one before prev_tbchtsd and after next_tbchtsd it
  # NOTE: done previously for efficiency

  # 15p.v.  Determine the total number of ht values for each subject (tot_ht)
  # NOTE: all rows are valid due to constraint in subj.df[...] statement
  # FIX(review): this was `num.valid <- .N`, but `.N` is only defined inside a
  # data.table `[...]` frame; at function scope it resolves to data.table's
  # exported NULL placeholder, making `num.valid == 2` logical(0) and silently
  # disabling the 15q.v-viii criteria below. Use nrow(df) instead.
  num.valid <- nrow(df)

  # 15q.	Identify a value for possible exclusion if one of the following sets of criteria are met. For values identified by each set of criteria determine
  #       the value of temp_diff using the formula given
  #   i.	d_prev_ht<mindiff_prev_ht & bef_g_aftm1_ht==1 & exc_ht==0 & mindiff_prev_ht is not missing
  #     a.  (temp_diff=|dewma_ht_bef|)
  df[, temp.diff := as.double(NaN)]
  df[, temp.exclude := factor(NA, levels = exclude.levels, ordered = TRUE)]
  df[delta.prev.ht < mindiff.prev.ht & bef.g.aftm1,
     `:=`(temp.diff = abs(dewma.before),
          temp.exclude = 'Exclude-Min-Height-Change')]

  #   ii.	d_ht<mindiff_ht & aft_g_befp1_ht==1 & exc_ht==0 & mindiff_ht is not missing
  #     a.	(temp_diff=|dewma_ht_aft|)
  df[delta.next.ht < mindiff.next.ht & aft.g.befp1,
     `:=`(temp.diff = abs(dewma.after),
          temp.exclude = 'Exclude-Min-Height-Change')]

  #   iii.	d_prev_ht>maxdiff_prev_ht & bef_g_aftm1_ht==1 & exc_ht==0 & mindiff_prev_ht is not missing
  #     a.  (temp_diff=|dewma_ht_bef|)
  df[delta.prev.ht > maxdiff.prev.ht & bef.g.aftm1,
     `:=`(temp.diff = abs(dewma.before),
          temp.exclude = 'Exclude-Max-Height-Change')]

  #   iv.	d_ht>maxdiff_ht & aft_g_befp1_ht==1 & exc_ht==0 & mindiff_ht is not missing
  #     a.  (temp_diff=|dewma_ht_aft|)
  df[delta.next.ht > maxdiff.next.ht & aft.g.befp1,
     `:=`(temp.diff = abs(dewma.after),
          temp.exclude = 'Exclude-Max-Height-Change')]

  #   v.	d_prev_ht<mindiff_prev_ht & tot_ht==2 & |tbchtsd|>|prev_tbchtsd|
  #     a. for v-viii temp_diff is kept as missing
  #   vi. d_ht<mindiff_ht & tot_ht==2 & |tbchtsd|>|next_tbchtsd|
  df[delta.prev.ht < mindiff.prev.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.prev
     |
       delta.next.ht < mindiff.next.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.next,
     temp.exclude := 'Exclude-Min-Height-Change']

  #   vii.	d_prev_ht>maxdiff_prev_ht & tot_ht==2 & |tbchtsd|>|prev_tbchtsd|
  #   viii. d_ht>maxdiff_ht & tot_ht==2 & |tbchtsd|>|next_tbchtsd|
  df[delta.prev.ht > maxdiff.prev.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.prev
     |
       delta.next.ht > maxdiff.next.ht &
       num.valid == 2 & abs.tbc.sd > abs.tbc.sd.next,
     temp.exclude := 'Exclude-Max-Height-Change']

  return(df$temp.exclude)
}

#' Answers for adjustcarryforward
#'
#' Determines what should absolutely be reincluded or definitely excluded
#' for a given dataset, already run through \code{cleangrowth}.
#'
#' @param subjid Vector of unique identifiers for each subject in the database.
#' @param param Vector identifying each measurement, may be 'WEIGHTKG', 'HEIGHTCM', or 'LENGTHCM'
#'   'HEIGHTCM' vs. 'LENGTHCM' only affects z-score calculations between ages 24 to 35 months (730 to 1095 days).
#'   All linear measurements below 731 days of life (age 0-23 months) are interpreted as supine length, and
#'   all linear measurements above 1095 days of life (age 36+ months) are interpreted as standing height.
#'   Note: at the moment, all LENGTHCM will be converted to HEIGHTCM. In the future, the algorithm will be updated to consider this difference.
#' @param agedays Numeric vector containing the age in days at each measurement.
#' @param sex Vector identifying the gender of the subject, may be 'M', 'm', or 0 for males, vs. 'F',
#'  'f' or 1 for females.
#' @param measurement Numeric vector containing the actual measurement data.  Weight must be in
#'   kilograms (kg), and linear measurements (height vs. length) in centimeters (cm).
#' @param orig.exclude Vector of exclusion assessment results from cleangrowth()
#' @param sd.recenter Data frame or table with median SD-scores per day of life
#' @param ewma.exp Exponent to use for weighting measurements in the exponentially weighted moving
#'  average calculations. Defaults to -1.5. This exponent should be negative in order to weight growth
#'  measurements closer to the measurement being evaluated more strongly. Exponents that are further from
#'  zero (e.g. -3) will increase the relative influence of measurements close in time to the measurement
#'  being evaluated compared to using the default exponent.
#' @param ref.data.path Path to reference data. If not supplied, the year 2000
#' Centers for Disease Control (CDC) reference data will be used.
#' @param quietly Determines if function messages are to be displayed and if log files (parallel only)
#' are to be generated. Defaults to TRUE.
#'
#' @return A data frame, containing an index "n" of rows, corresponding to the
#' original order of the input vectors, and "acf_answers", containing the answers
#' on whether a height value should be kept or excluded (returns "Definitely
#' Exclude", "Definitely Include", or "Unknown" for height values, NA for weight
#' values).
#'
#' @export
#' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
#' @rawNamespace import(dplyr, except = c(last, first, summarize, src, between))
#' @import data.table
#' @rawNamespace import(R.utils, except = c(extract))
# NOTE: no examples, since this is a temporary function
acf_answers <- function(subjid,
                        param,
                        agedays,
                        sex,
                        measurement,
                        orig.exclude,
                        sd.recenter = NA,
                        ewma.exp = -1.5,
                        ref.data.path = "",
                        quietly = TRUE){

  # avoid "no visible binding for global variable" warnings from R CMD check;
  # these are data.table column names referenced via non-standard evaluation
  tanner.months <- whoagegrp_ht <- whoagegrp.ht <- z.orig <- z.orig <- v <- sd.orig <- NULL
  index <- exclude <- tbc.sd <- sd.median <- acf_answer <- NULL

  # process ----
  # organize data into a data.table along with a line "index" so the original
  # data order can be recovered; a measurement of exactly 0 is treated as
  # missing (NaN), and sex is recoded to 0 = male, 1 = female (NA otherwise)
  data.all <- data.table(
    line = seq_along(measurement),
    subjid = as.factor(subjid),
    param,
    agedays = as.integer(agedays),
    v = ifelse(measurement == 0, NaN, measurement),
    sex = as.integer(ifelse(
      sex %in% c(0, 'm', 'M'), 0, ifelse(sex %in% c(1, 'f', 'F'), 1, NA)
    )),
    orig.exclude = as.factor(orig.exclude)
  )

  ### ADJUSTCF EDIT
  # "n" records the original row number (before any setkey() reordering) so
  # the final answers can be returned in the caller's input order
  data.all <- data.all[, n := 1:.N]
  ### ENDEDIT

  # NOTE(review): `<-` does not copy a data.table, so data.orig aliases
  # data.all and will reflect later := updates; confirm copy() is not needed
  data.orig <- data.all

  setkey(data.all, subjid)

  subjid.unique <- unique(data.all$subjid)

  # tanner/who 3 SD ----
  # these 3SD reference tables drive the "definitely exclude" pass below

  # load tanner height velocity data. sex variable is defined such that 0=male and 1=female
  # recode column names to match syntactic style ("." rather than "_" in variable names)
  tanner_ht_vel_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/tanner_ht_vel.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "tanner_ht_vel.csv.gz", sep =
            "")
  )
  tanner.ht.vel <- fread(tanner_ht_vel_path)

  setnames(tanner.ht.vel,
           colnames(tanner.ht.vel),
           gsub('_', '.', colnames(tanner.ht.vel)))
  setkey(tanner.ht.vel, sex, tanner.months)
  # keep track of column names in the tanner data
  tanner.fields <- colnames(tanner.ht.vel)
  tanner.fields <- tanner.fields[!tanner.fields %in% c('sex', 'tanner.months')]

  who_max_ht_vel_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_maxvel_3sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_maxvel_3sd.csv.gz", sep =
            "")
  )

  who_ht_vel_3sd_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_vel_3sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_vel_3sd.csv.gz", sep =
            "")
  )
  who.max.ht.vel <- fread(who_max_ht_vel_path)
  who.ht.vel <- fread(who_ht_vel_3sd_path)
  setkey(who.max.ht.vel, sex, whoagegrp_ht)
  setkey(who.ht.vel, sex, whoagegrp_ht)
  # merge the WHO velocity and max-velocity tables into one keyed table
  who.ht.vel <- as.data.table(dplyr::full_join(who.ht.vel, who.max.ht.vel, by =
                                                 c('sex', 'whoagegrp_ht')))

  setnames(who.ht.vel, colnames(who.ht.vel), gsub('_', '.', colnames(who.ht.vel)))
  setkey(who.ht.vel, sex, whoagegrp.ht)
  # keep track of column names in the who growth velocity data
  who.fields <- colnames(who.ht.vel)
  who.fields <- who.fields[!who.fields %in% c('sex', 'whoagegrp.ht')]

  # tanner/who 2 SD ----
  # these 2SD reference tables drive the "definitely include" pass below

  # load tanner height velocity data. sex variable is defined such that 0=male and 1=female
  # recode column names to match syntactic style ("." rather than "_" in variable names)
  tanner_ht_vel_2sd_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/tanner_ht_vel_with_2sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "tanner_ht_vel_with_2sd.csv.gz", sep = "")
  )
  tanner.ht.vel.2sd <- fread(tanner_ht_vel_2sd_path)

  setnames(tanner.ht.vel.2sd,
           colnames(tanner.ht.vel.2sd),
           gsub('_', '.', colnames(tanner.ht.vel.2sd)))
  setkey(tanner.ht.vel.2sd, sex, tanner.months)
  # keep track of column names in the tanner data
  tanner.fields.2sd <- colnames(tanner.ht.vel.2sd)
  tanner.fields.2sd <-
    tanner.fields.2sd[!tanner.fields.2sd %in% c('sex', 'tanner.months')]

  who_max_ht_vel_2sd_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_maxvel_2sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_maxvel_2sd.csv.gz", sep =
            "")
  )

  who_ht_vel_2sd_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_vel_2sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_vel_2sd.csv.gz", sep =
            "")
  )
  who.max.ht.vel.2sd <- fread(who_max_ht_vel_2sd_path)
  who.ht.vel.2sd <- fread(who_ht_vel_2sd_path)
  setkey(who.max.ht.vel.2sd, sex, whoagegrp_ht)
  setkey(who.ht.vel.2sd, sex, whoagegrp_ht)
  who.ht.vel.2sd <-
    as.data.table(dplyr::full_join(who.ht.vel.2sd, who.max.ht.vel.2sd, by =
                                     c('sex', 'whoagegrp_ht')))

  # strip the ".2sd" suffix so the 2SD columns line up with the 3SD names.
  # NOTE(review): ".2sd" is a regex in which '.' matches any character --
  # confirm fixed = TRUE is not needed here
  setnames(who.ht.vel.2sd,
           colnames(who.ht.vel.2sd),
           gsub(
             ".2sd", "", # replace 2sd with nothing, for ease of use in calculations
             gsub('_', '.', colnames(who.ht.vel.2sd))
            ))
  setkey(who.ht.vel.2sd, sex, whoagegrp.ht)
  # keep track of column names in the who growth velocity data
  who.fields.2sd <- colnames(who.ht.vel.2sd)
  who.fields.2sd <- who.fields.2sd[!who.fields.2sd %in% c('sex', 'whoagegrp.ht')]

  # getting zscores ----

  # recategorize linear parameters as 'HEIGHTCM'
  # NOTE: this will be changed in future to consider this difference
  data.all[param == 'LENGTHCM', param := 'HEIGHTCM']

  # calculate z scores
  if (!quietly)
    cat(sprintf("[%s] Calculating z-scores...\n", Sys.time()))
  measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE)
  data.all[, z.orig := measurement.to.z(param, agedays, sex, v)]

  # calculate "standard deviation" scores (same helper; the final TRUE flag
  # switches its output from z-score to SD-score)
  if (!quietly)
    cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time()))
  data.all[, sd.orig := measurement.to.z(param, agedays, sex, v, TRUE)]

  # sort by subjid, param, agedays
  setkey(data.all, subjid, param, agedays)

  # add a new convenience index for bookkeeping
  data.all[, index := 1:.N]

  # enumerate the different exclusion levels
  exclude.levels <- c(
    'Missing',
    'No Change',
    'Include',
    'Exclude-Min-Height-Change',
    'Exclude-Max-Height-Change'
  )

  # Mark missing values (absent measurement or negative age) for exclusion
  data.all[, exclude := factor(with(data.all, ifelse(
    is.na(v) |
      agedays < 0, 'Missing', 'No Change'
  )),
  levels = exclude.levels,
  ordered = TRUE)] # NOTE(review): ordered factor -- presumably so levels can be compared by severity; confirm ordering is actually relied on

  # define field names needed by helper functions
  ewma.fields <- c('ewma.all', 'ewma.before', 'ewma.after')

  # 3.  SD-score recentering: Because the basis of the method is comparing SD-scores over time, we need to account for the fact that
  #     the mean SD-score for the population changes with age.
  # a.	Determine the median cdc*sd for each parameter by year of age (with sexes combined): median*sd.
  # b.	The median*sd should be considered to apply to midyear-age, defined as the age in days with the same value as the integer
  #     portion of (365.25*year + 365.25/2).
  # c.	Linearly interpolate median*sd for each parameter between each midyear-age, naming the interpolated values rc*sd.
  # d.	For ages below the first midyear-age, let rc*sd equal the median*sd for the earliest year.
  #     For ages above the last midyear_age, let rc*sd equal the median*sd for the last year.
  # e.	Subtract rcsd_* from SDorig to create the recentered SD-score.  This recentered SD-score, labeled tbc*sd
  #     (stands for "to be cleaned") will be used for most of the rest of the analyses.
  # f.	In future steps I will sometimes refer to measprev and measnext which refer to the previous or next wt or ht measurement
  #     for which exc_*==0 for the subject and parameter, when the data are sorted by subject, parameter, and agedays. SDprev and SDnext refer to the tbc*sd of the previous or next measurement.

  if (!quietly)
    cat(sprintf("[%s] Re-centering data...\n", Sys.time()))

  # see function definition below for explanation of the re-centering process
  # returns a data table indexed by param, sex, agedays
  if (!is.data.table(sd.recenter)) {
    # ADJUSTCF EDIT - be explicit about levels to keep
    keep.levels <- c(
      "Include",
      "Unit-Error-High",
      "Unit-Error-Low",
      "Unit-Error-Possible",
      "Swapped-Measurements"
    )
    # recentering medians are computed only from measurements that
    # cleangrowth did not exclude outright
    sd.recenter <- data.all[orig.exclude %in% keep.levels, sd_median(param, sex, agedays, sd.orig)]
    # END EDIT
  }
  # add sd.recenter via a keyed join on (param, sex, agedays), and recenter
  setkey(data.all, param, sex, agedays)
  data.all <- sd.recenter[data.all]
  setkey(data.all, subjid, param, agedays)
  data.all[, tbc.sd := sd.orig - sd.median]

  # safety check: treat observations where tbc.sd cannot be calculated as missing
  data.all[is.na(tbc.sd), exclude := 'Missing']

  # start evaluation ----

  if (!quietly)
    cat(sprintf("[%s] Calculating definitely exclude/include...\n", Sys.time()))

  # evaluate heights only, one subject at a time; weight rows never receive
  # an acf_answer and therefore remain NA in the returned data frame
  data.all[param == 'HEIGHTCM', acf_answer := (function(subj.df) {
    # assign some book keeping variables
    #subj.df[, `:=`(subjid = subjid, param='HEIGHTCM',index=1:.N)]
    subj.df[, index := 1:.N]

    # use a closure to discard all the extra fields added to df with each iteration
    subj.df[,
      acf_answer := (function (df) {
        # run steps 15a - 15q (functionalized for ease) twice:
        # first with the 3SD cutoff tables to flag values to definitely exclude
        def_excl <- calc_step_15_no_param(
          copy(df),
          eval_type = "exclude",
          ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
          ewma.exp)

        # then with the tighter 2SD cutoff tables; values not flagged even
        # at 2SD can definitely be included
        def_incl <- calc_step_15_no_param(
          copy(df),
          eval_type = "include",
          ewma.fields, tanner.ht.vel.2sd, who.ht.vel.2sd, exclude.levels,
          ewma.exp)

        # calculate which should definitely be included and excluded.
        # NOTE(review): the include assignment runs last, so "Definitely
        # Include" overwrites "Definitely Exclude" if both conditions hold
        # for the same row -- confirm this precedence is intended
        verdict <- rep("Unknown", length(def_incl))
        verdict[!is.na(def_excl)] <- "Definitely Exclude"
        verdict[is.na(def_incl)] <- "Definitely Include"

        return(verdict)

      })(copy(.SD))]

    setkey(subj.df, index)
    return(subj.df$acf_answer)
  })(copy(.SD)), by = .(subjid), .SDcols = c('sex', 'agedays', 'v', 'tbc.sd', 'exclude', 'orig.exclude')]

  # formulating results: one row per input row, restored to input order via "n"
  acf_answer_df <- data.frame(n = data.all$n, acf_answer = data.all$acf_answer)
  # sort for user ease
  acf_answer_df <- acf_answer_df[order(acf_answer_df$n),]

  return(acf_answer_df)
}

#' adjustcarryforward
#' \code{adjustcarryforward} Uses absolute height velocity to identify values
#' excluded as carried forward values for reinclusion.
#' @param subjid Vector of unique identifiers for each subject in the database.
#' @param param Vector identifying each measurement, may be 'WEIGHTKG', 'HEIGHTCM', or 'LENGTHCM'.
#'   'HEIGHTCM' vs. 'LENGTHCM' only affects z-score calculations between ages 24 to 35 months (730 to 1095 days).
#'   All linear measurements below 731 days of life (age 0-23 months) are interpreted as supine length, and
#'   all linear measurements above 1095 days of life (age 36+ months) are interpreted as standing height.
#'   Note: at the moment, all LENGTHCM will be converted to HEIGHTCM. In the future, the algorithm will be updated to consider this difference.
#' @param agedays Numeric vector containing the age in days at each measurement.
#' @param sex Vector identifying the gender of the subject, may be 'M', 'm', or 0 for males, vs. 'F',
#'  'f' or 1 for females.
#' @param measurement Numeric vector containing the actual measurement data.  Weight must be in
#'   kilograms (kg), and linear measurements (height vs. length) in centimeters (cm).
#' @param orig.exclude Vector of exclusion assessment results from cleangrowth()
#' @param exclude_opt Number from 0 to 3 indicating which option to use to handle strings of carried-forwards:
#'    0. no change.
#'    1. when deciding to exclude values, if we have a string of carried forwards,
#'    drop the most deviant value, and all CFs in the same string, and move on as
#'    normal.
#'    2. when deciding to exclude values, if the most deviant in a
#'    string of carried forwards is flagged, check all the CFs in that
#'    string from 1:N. Exclude all after the first that is flagged for
#'    exclusion when comparing to the Include before and after. Do not
#'    remove things designated as include.
#'    3. when deciding to exclude values, if the most deviant in a
#'    string of carried forwards is flagged, check all the CFs in that
#'    string from 1:N. Exclude all after the first that is flagged for
#'    exclusion when comparing to the Include before and after. Make sure to
#'    remove values designated as include.
#' @param sd.recenter Data frame or table with median SD-scores per day of life
#' @param ewma.exp Exponent to use for weighting measurements in the exponentially weighted moving
#'  average calculations. Defaults to -1.5. This exponent should be negative in order to weight growth
#'  measurements closer to the measurement being evaluated more strongly. Exponents that are further from
#'  zero (e.g. -3) will increase the relative influence of measurements close in time to the measurement
#'  being evaluated compared to using the default exponent.
#' @param ref.data.path Path to reference data. If not supplied, the year 2000
#' Centers for Disease Control (CDC) reference data will be used.
#' @param quietly Determines if function messages are to be displayed and if log files (parallel only)
#' are to be generated. Defaults to TRUE.
#' @param minfactor Sweep variable for computing mindiff.next.ht in 15f, default 0.5
#' @param maxfactor Sweep variable for computing maxdiff.next.ht in 15f, default 2
#' @param banddiff Sweep variable for computing mindiff.next.ht in 15f, default 3
#' @param banddiff_plus Sweep variable for computing maxdiff.next.ht in 15f, default 5.5
#' @param min_ht.exp_under Sweep variable for computing ht.exp in 15f, default 2
#' @param min_ht.exp_over Sweep variable for computing ht.exp in 15f, default 0
#' @param max_ht.exp_under Sweep variable for computing ht.exp in 15f, default 0.33
#' @param max_ht.exp_over Sweep variable for computing ht.exp in 15f, default 1.5
#'
#' @return Re-evaluated exclusion assessments based on height velocity.
#'
#' @export
#' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
#' @rawNamespace import(dplyr, except = c(last, first, summarize, src, between))
#' @import data.table
#' @examples
#' \donttest{
#' # Run on a small subset of given data
#' df <- as.data.frame(syngrowth)
#' df <- df[df$subjid %in% unique(df[, "subjid"])[1:2], ]
#' clean_df <- cbind(df,
#'                   "gcr_result" = cleangrowth(df$subjid,
#'                                              df$param,
#'                                              df$agedays,
#'                                              df$sex,
#'                                              df$measurement))
#'
#' # Adjust carry forward values in cleaned data
#' adj_clean <- adjustcarryforward(subjid = clean_df$subjid,
#'                                 param = clean_df$param,
#'                                 agedays = clean_df$agedays,
#'                                 sex = clean_df$sex,
#'                                 measurement = clean_df$measurement,
#'                                 orig.exclude = clean_df$gcr_result)
#' }
adjustcarryforward <- function(subjid,
                               param,
                               agedays,
                               sex,
                               measurement,
                               orig.exclude,
                               exclude_opt = 0,
                               sd.recenter = NA,
                               ewma.exp = -1.5,
                               ref.data.path = "",
                               quietly = TRUE,
                               minfactor = 0.5,
                               maxfactor = 2,
                               banddiff = 3,
                               banddiff_plus = 5.5,
                               min_ht.exp_under = 2,
                               min_ht.exp_over = 0,
                               max_ht.exp_under = 0.33,
                               max_ht.exp_over = 1.5) {

  # Avoid "undefined global functions/variables" warnings
  v <- sd.orig <- sd.median <- tbc.sd <- agedays.next <- mid.agedays <- min.ht.vel <- NULL
  delta.agedays.next <- ht.exp <- max.ht.vel <- mindiff.next.ht <- temp.exclude <- NULL
  ecf_tmp <- ewma.all <- ewma.before <- ewma.after <- abs.tbc.sd <- whoinc.1.ht <- NULL
  whoinc.2.ht <- whoinc.3.ht <- whoinc.4.ht <- whoinc.6.ht <- max.whoinc.1.ht <- NULL
  max.whoinc.2.ht <- max.whoinc.3.ht <- max.whoinc.4.ht <- max.whoinc.6.ht <- NULL
  whoinc.age.ht <- whoinc.age.ht <- who.mindiff.next.ht <- who.maxdiff.next.ht <- NULL
  dewma.after <- maxdiff.next.ht <- v.prev <- v.next <- delta.prev.ht <- NULL
  mindiff.prev.ht <- delta.next.ht <- maxdiff.prev.ht <- pair <- dewma.after.prev <- NULL
  pair.prev <- dewma.before.next <- pair.next <- bef.g.aftm1 <- aft.g.befp1 <- NULL
  abs.tbc.sd.prev <- abs.tbc.sd.next <- exclude <- n <- dewma.before <- NULL
  tanner.months <- whoagegrp_ht <- whoagegrp.ht <- z.orig <- index <- temp.diff <- NULL

  # check option is valid
  if (!exclude_opt %in% 0:3){
    stop("Invalid exclude_opt. Enter a number from 0 to 3.")
  }

  # organize data into a dataframe along with a line "index" so the original data order can be recovered
  data.all <- data.table(
    line = seq_along(measurement),
    subjid = as.factor(subjid),
    param,
    agedays = as.integer(agedays),
    v = ifelse(measurement == 0, NaN, measurement),
    sex = as.integer(ifelse(
      sex %in% c(0, 'm', 'M'), 0, ifelse(sex %in% c(1, 'f', 'F'), 1, NA)
    )),
    orig.exclude = as.factor(orig.exclude)
  )

  ### ADJUSTCF EDIT
  data.all <- data.all[, n := 1:.N]
  ### ENDEDIT

  data.orig <- data.all

  setkey(data.all, subjid)

  subjid.unique <- unique(data.all$subjid)

  #### ADJUSTCF EDIT ####
  # for this purpose, want to subset dataset down to just "Exclude-Carried-Forward" and "Include" - assume all other measurements are invalid
  # want to remove any carried forward values whose non-carried forward is also excluded
  # data.all <- data.all %>%
  #   mutate(orig.exclude.lag = lag(orig.exclude, n = 1)) %>%
  #   filter(!(
  #     orig.exclude == "Exclude-Carried-Forward" &
  #       grepl("exclude", orig.exclude.lag, ignore.case = TRUE)
  #   )) %>%
  #   filter(orig.exclude %in% c("Exclude-Carried-Forward", "Include")) %>%
  #   select(-orig.exclude.lag)

  # NEW EDIT --
  # remove all the weight measurements
  data.all <- data.all %>%
    filter(param %in% c("HEIGHTCM", "LENGTHCM"))

  # filter to only subjects with possible carried forwards - n is here to merge back
  # if they have all includes, filter them out
  data.all <- data.all %>%
    filter(subjid %in% data.all$subjid[data.all$orig.exclude == "Exclude-Carried-Forward"]) %>%
    as.data.table()

  # here's what we want to filter out -- anything that's not carried forward/include
  # we're also going to include strings of carried forward
  # but we also need to make sure they're not coming from an excluded value

  # start of string to remove: everything that isn't include/excl-cf
  st <- which(!data.all$orig.exclude %in% c("Exclude-Carried-Forward", "Include"))
  # end of string: include or the end of a subject
  subj_end <- length(data.all$subjid)-match(unique(data.all$subjid),rev(data.all$subjid))+1
  end <- c(which(data.all$orig.exclude == "Include"),subj_end)
  end <- unique(sort(end))

  # remove anything between start and ends (including start, not including end)
  to_rem <- unlist(
    lapply(st, function(x){
      to_rem <- c(x:(end[end >= x][1]))
      if (to_rem[length(to_rem)] %in% subj_end){
        # if it's the last value, we want to get rid of that end
        return(to_rem)
      } else {
        # if it's an include, we want to keep it (don't remove)
        return(to_rem[-length(to_rem)])
      }
    })
  )
  to_rem <- unique(to_rem)

  if (length(to_rem) > 0){
    data.all <- data.all[-to_rem,]
  }

  # filter to only subjects with possible carried forwards again
  data.all <- data.all %>%
    filter(subjid %in% data.all$subjid[data.all$orig.exclude == "Exclude-Carried-Forward"]) %>%
    as.data.table()

  ### END EDIT ####

  # load tanner height velocity data. sex variable is defined such that 0=male and 1=female
  # recode column names to match syntactic style ("." rather than "_" in variable names)
  tanner_ht_vel_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/tanner_ht_vel.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "tanner_ht_vel.csv.gz", sep = "")
  )

  tanner.ht.vel <- fread(tanner_ht_vel_path)

  setnames(tanner.ht.vel,
           colnames(tanner.ht.vel),
           gsub('_', '.', colnames(tanner.ht.vel)))
  setkey(tanner.ht.vel, sex, tanner.months)
  # keep track of column names in the tanner data
  tanner.fields <- colnames(tanner.ht.vel)
  tanner.fields <- tanner.fields[!tanner.fields %in% c('sex', 'tanner.months')]

  who_max_ht_vel_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_maxvel_3sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_maxvel_3sd.csv.gz", sep =
            "")
  )

  who_ht_vel_3sd_path <- ifelse(
    ref.data.path == "",
    system.file("extdata/who_ht_vel_3sd.csv.gz", package = "growthcleanr"),
    paste(ref.data.path, "who_ht_vel_3sd.csv.gz", sep =
            "")
  )
  who.max.ht.vel <- fread(who_max_ht_vel_path)
  who.ht.vel <- fread(who_ht_vel_3sd_path)
  setkey(who.max.ht.vel, sex, whoagegrp_ht)
  setkey(who.ht.vel, sex, whoagegrp_ht)
  who.ht.vel <- as.data.table(dplyr::full_join(who.ht.vel, who.max.ht.vel, by =
                                                c('sex', 'whoagegrp_ht')))

  setnames(who.ht.vel, colnames(who.ht.vel), gsub('_', '.', colnames(who.ht.vel)))
  setkey(who.ht.vel, sex, whoagegrp.ht)
  # keep track of column names in the who growth velocity data
  who.fields <- colnames(who.ht.vel)
  who.fields <- who.fields[!who.fields %in% c('sex', 'whoagegrp.ht')]

  # 1.  General principles
  # a.	All steps are done separately for each parameter unless otherwise noted
  # b.	All steps are done sorted by subject, parameter, and age (in days) for nonexcluded and nonmissing values only unless otherwise noted. This is very important.
  #     Sorting needs to be redone with each step to account for excluded and transformed  values.
  # c.	The next value refers to the value with the next highest age for the same parameter and the same subject, and the previous value refers to the value with the
  #     next lowest age for the same parameter and the same subject.
  # d.	You will need to set up a method for keeping track of whether a value is missing or excluded (and in what step). I use variables called exc_* that are =0
  #     if a value is to be included, =1 if missing, and =2 or higher if it is to be excluded, with each number indicating a different step. I also set up
  #     parameter-specific subjid_* variables that are = to subjid for included values and are blank if the value is missing or should be excluded. These subjid_*
  #     variables need to be updated with each step.
  # e.  All steps assume that data are sorted by subjid_*, parameter, and age (in days) for nonexcluded and nonmissing values only
  #     unless otherwise noted. Sorting needs to be redone after any transformations or exclusions to account for excluded and
  #     transformed values.
  # f.  The next value refers to the nonexcluded nonmissing value with the next highest age for the same parameter and the same
  #     subject, and the previous value refers to the nonexcluded nonmissing value with the next lowest age for the same parameter
  #     and the same subject.
  # g.  exc_* should only be replaced with a  higher value if exc_*==0 at the time of replacement, unless otherwise specified.


  # NOTE: in the R code below exclusion is documented as a series of factor levels, where all levels occurring before 'Exclude' in the sequence are considered
  # to be valid measurements.  We use the built in sorting of the data.table object and subsets rather than re-sorting at each step
  # to ensure that only valid measurements are used at the beginning of each step.
  # Also, unlike the Stata code, the measurement parameter (weight vs. height) is recorded as a factor in the data frame, rather than as a variable name

  # 2.  Data set-up
  # a.	I always code sex as 0=Male, 1=Female, so I recoded the variable sex that way and left a variable sexorigcode the way the data was sent to me (1=Female 2=Male)
  # b.	Remove rows that are duplicates for subjid, param, and measurement from further analysis
  #     NOTE: this step is not needed -- handled automatically by "temporary duplicate" step.
  # c.  I generated separate variables for weight (wt) and height (ht), as well as exc_* and subjid_* variables. Set exc_*=0 if value is not missing
  #     and exc_*=1 if value is missing. In all future steps, exc_* should only be changed if it is 0. This helps to keep track of which step excluded a value.
  #     I also kept the measurement variable there and untouched because sometimes wt and ht got transformed to something else.
  # d.	I made tables based on CDC growth curve parameters that include data for each day that I will send separately. The LMS parameters for each day are
  #     cubically interpolated from the values by month available on the CDC website. Create wt and ht z-scores for each value of each parameter (Z: WtZ and HtZ).
  # e.	There are variables in the table labelled cdc_*_csd_pos and cdc_*_csd_neg. For each age and sex, these correspond to ½ of the absolute value of the
  #     median and the value with a z-score of +2 (csd_pos) and -2 (csd_neg). These can be created to generate a score similar to the z-score but with an
  #     important difference. The z-scores created using the LMS method account for skewness in the distribution of the parameters (particularly weight), which
  #     can lead to small changes in z-score with large changes in weight in subjects with very high weight, and relatively large changes in z-score for smaller
  #     changes in weight in subjects with low weights.  The score we will create can be called an SD-score (SDorig: WtSDorig and HtSDorig that is calculated by
  #     dividing the difference between the value and the median by the SD score (use csd_pos if the value is above the median, csd_neg if the value is below the
  #     median). These SD-scores, rather than z-scores, now form the basis for the algorithm.

  # recategorize linear parameters as 'HEIGHTCM'
  # NOTE: this will be changed in future to consider this difference
  data.all[param == 'LENGTHCM', param := 'HEIGHTCM']

  # calculate z scores
  if (!quietly)
    cat(sprintf("[%s] Calculating z-scores...\n", Sys.time()))
  measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE)
  data.all[, z.orig := measurement.to.z(param, agedays, sex, v)]

  # calculate "standard deviation" scores
  if (!quietly)
    cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time()))
  data.all[, sd.orig := measurement.to.z(param, agedays, sex, v, TRUE)]

  # sort by subjid, param, agedays
  setkey(data.all, subjid, param, agedays)

  # add a new convenience index for bookkeeping
  data.all[, index := 1:.N]

  # enumerate the different exclusion levels
  exclude.levels <- c(
    'Missing',
    'No Change',
    'Include',
    'Exclude-Min-Height-Change',
    'Exclude-Max-Height-Change'
  )

  # Mark missing values for exclusion
  data.all[, exclude := factor(with(data.all, ifelse(
    is.na(v) |
      agedays < 0, 'Missing', 'No Change'
  )),
  levels = exclude.levels,
  ordered = TRUE)] # why is this ordered??

  # define field names needed by helper functions
  ewma.fields <- c('ewma.all', 'ewma.before', 'ewma.after')

  # 3.  SD-score recentering: Because the basis of the method is comparing SD-scores over time, we need to account for the fact that
  #     the mean SD-score for the population changes with age.
  # a.	Determine the median cdc*sd for each parameter by year of age (with sexes combined): median*sd.
  # b.	The median*sd should be considered to apply to midyear-age, defined as the age in days with the same value as the integer
  #     portion of (365.25*year + 365.25/2).
  # c.	Linearly interpolate median*sd for each parameter between each midyear-age, naming the interpolated values rc*sd.
  # d.	For ages below the first midyear-age, let rc*sd equal the median*sd for the earliest year.
  #     For ages above the last midyear_age, let rc*sd equal the median*sd for the last year.
  # e.	Subtract rcsd_* from SDorig to create the recentered SD-score.  This recentered SD-score, labeled tbc*sd
  #     (stands for "to be cleaned") will be used for most of the rest of the analyses.
  # f.	In future steps I will sometimes refer to measprev and measnext which refer to the previous or next wt or ht measurement
  #     for which exc_*==0 for the subject and parameter, when the data are sorted by subject, parameter, and agedays. SDprev and SDnext refer to the tbc*sd of the previous or next measurement.

  if (!quietly)
    cat(sprintf("[%s] Re-centering data...\n", Sys.time()))

  # see function definition below for explanation of the re-centering process
  # returns a data table indexed by param, sex, agedays
  # If the caller did not supply a precomputed recentering table, derive one
  # from the data itself.
  if (!is.data.table(sd.recenter)) {
    # ADJUSTCF EDIT - be explicit about levels to keep
    # Only rows whose original cleangrowth exclusion is one of these levels
    # contribute to the recentering medians; carried-forward and other
    # excluded values are deliberately left out of the reference.
    keep.levels <- c(
      "Include",
      "Unit-Error-High",
      "Unit-Error-Low",
      "Unit-Error-Possible",
      "Swapped-Measurements"
    )
    sd.recenter <- data.all[orig.exclude %in% keep.levels, sd_median(param, sex, agedays, sd.orig)]
    # END EDIT
  }
  # add sd.recenter to data, and recenter
  # Keyed join: sd.recenter[data.all] attaches sd.median to every row by
  # (param, sex, agedays); the key is then reset to the per-subject ordering
  # the rest of the algorithm expects.
  setkey(data.all, param, sex, agedays)
  data.all <- sd.recenter[data.all]
  setkey(data.all, subjid, param, agedays)
  data.all[, tbc.sd := sd.orig - sd.median]

  # safety check: treat observations where tbc.sd cannot be calculated as missing
  data.all[is.na(tbc.sd), exclude := 'Missing']



  ######### END DATA PROCESSING #########

  ######### START FLAGGING #########

  # 15.  Exclude heights based on absolute differences in measurement. The key to this step is that once we identify pairs of measurements with implausible
  #      amounts of absolute difference between them, we have to determine which of the two values in the pair is less likely to be representative and should
  #      be excluded. For subjects/parameters with 3 or more measurements, this is done by looking at the dewma_* of each of the 2 values in a pair using a ewma that
  #      excludes the other value in the pair. For subjects/parameters with 2 or more measurements, this is done by looking at the absolute value of the tbc*sd.
  #      The Tanner height velocity reference is used for measurements taken at >2yo, WHO will be used for <2yo. For a few pairs of measurements either could be used;
  #      WHO will be used if difference between ages is < 9 months.
  if (!quietly)
    cat(sprintf(
      "[%s] Exclude heights based on growth velocity...\n",
      Sys.time()
    ))

  # ADJUSTCF EDIT:
  # there are several different methods for handling carried forward values
  # 0. no change
  # 1. when deciding to exclude values, if we have a string of carried forwards,
  #    drop the most deviant value, and all CFs in the same string, and move on as
  #    normal
  # 2. when deciding to exclude values, if the most deviant in a
  # string of carried forwards is flagged, check all the CFs in that
  # string from 1:N. exclude all after the first that is flagged for
  # exclusion when comparing to the Include before and after. do not
  # remove things designated as include.
  # 3. when deciding to exclude values, if the most deviant in a
  # string of carried forwards is flagged, check all the CFs in that
  # string from 1:N. exclude all after the first that is flagged for
  # exclusion when comparing to the Include before and after. make sure
  # remove things designated as include.


  # for (opt in all_opts){
  # Step 15 driver: evaluated per subject (by = subjid below) over height
  # records only. The outer closure iterates to a fixed point, re-running the
  # step-15 evaluation until no new height exclusions appear for the subject.
  data.all[param == 'HEIGHTCM', exclude := (function(subj.df) {
    # assign some book keeping variables
    #subj.df[, `:=`(subjid = subjid, param='HEIGHTCM',index=1:.N)]
    # row index used to restore the original visit order after the rbind/sort
    # performed inside the inner closure
    subj.df[, index := 1:.N]

    # keep the original subject dataframe, since we will always use the original
    # includes to compare to
    subj.df_orig <- copy(subj.df)
    num.height.excluded = 0
    while (TRUE) {
      # use a closure to discard all the extra fields added to df with each iteration
      # "include" protects subsets (used in option 3)
      # Only rows not already marked Exclude-* or Include* are re-evaluated on
      # each pass.
      subj.df[!grepl("Exclude", exclude) & !grepl("Include", exclude),
              exclude := (function (df) {
                # put together the dataframe with original includes for comparisons
                # get all "includes"
                subj.df_orig_incl <- copy(subj.df_orig[orig.exclude == "Include",])
                # now get all the remaining and add them on
                df <- rbind(subj.df_orig_incl,
                            df[orig.exclude != "Include",])
                # sort them by original index
                df <- df[order(index),]

                # do steps 15a - 15q (functionalized for ease)
                # copy() so the helper's temporary columns never leak back into df
                eval_df <- calc_temp_exclusion_15(
                  copy(df),
                  ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
                  ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
                  min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over)

                # r.  If there is only one potential exclusion identified in step 15j for a subject and parameter,
                #     replace exc_ht=15 for that value if it met criteria i, ii, v, or vi  and exc_ht=16 if it met criteria iii, iv, vii, or viii
                # NOTE: these exclusions are assigned in the code above as 'Exclude-Min-Height-Change' and 'Exclude-Max-Height-Change'

                # remove includes from consideration of removal
                # (original Includes are reference points and must never be dropped)
                eval_df[orig.exclude == "Include", temp.diff := NaN]
                eval_df[orig.exclude == "Include", temp.exclude := NA]

                # count the amount of error codes we get
                all_exclude = !is.na(eval_df$temp.exclude)
                num.exclude = sum(all_exclude)

                # if there's only one, all of the groups agree
                # s.  If there is more than one potential exclusion identified in step 14h for a subject and parameter, determine which value has the largest temp_diff and
                #     replace exc_ht=15 for that value if it met criteria i, ii, v, or vi and exc_ht=16 for that value if it met criteria iii,  iv, vii, or viii.

                if (exclude_opt == 0){
                  # option 0: normal
                  # (original cleangrowth behavior: exclude only the single
                  # flagged value, or the most deviant when several are flagged)
                  if (num.exclude == 1){
                    eval_df[all_exclude, exclude := temp.exclude]
                  } else if (num.exclude > 1) {
                    # first order by decreasing temp.diff (where rep=TRUE)
                    # sorting on the logical flag first restricts the choice to
                    # flagged rows; NA/NaN temp.diff sort last
                    worst.row = order(all_exclude, eval_df$temp.diff, decreasing = TRUE)[1]
                    eval_df[worst.row, exclude := temp.exclude]
                  }
                } else if (exclude_opt == 1){
                  # option 1: if there's a carried forward after, we want to exclude that too, until the next include
                  if (num.exclude == 1){
                    wh_exclude <- which(all_exclude)

                    # if it's in a string of carried forwards
                    # (flagged row is CF, is not the last row, and the row after
                    # it is also CF)
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      wh_exclude != nrow(eval_df) &&
                      eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward"
                    ){
                      # find the next include OR the row end
                      # which(...)[1] is NA when no Include follows, handled below
                      next_incl <- which(
                        eval_df$orig.exclude[(wh_exclude+1):nrow(eval_df)] == "Include"
                      )[1]+wh_exclude
                      if (is.na(next_incl)){
                        next_incl <- nrow(eval_df)+1
                      }


                      # mark all the CFs after for exclusion
                      all_exclude[wh_exclude:(next_incl-1)] <- TRUE
                      # also copy all the temp.excludes
                      # (the whole CF string inherits the flagged row's code)
                      eval_df$temp.exclude[(wh_exclude+1):(next_incl-1)] <-
                        eval_df$temp.exclude[wh_exclude]
                    }

                    # exclude all the carried forwards before the next include
                    eval_df[all_exclude, exclude := temp.exclude]
                  } else if (num.exclude > 1) {
                    # first order by decreasing temp.diff (where rep=TRUE)
                    worst.row = order(all_exclude, eval_df$temp.diff, decreasing = TRUE)[1]

                    # option 1: if there's a carried forward after, we want to exclude that too, until the next include
                    # same CF-string expansion as the single-exclusion case,
                    # applied to the most deviant flagged row
                    wh_exclude <- worst.row
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      wh_exclude != nrow(eval_df) &&
                      eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward"
                    ){
                      # find the next include OR the row end
                      next_incl <- which(
                        eval_df$orig.exclude[(wh_exclude+1):nrow(eval_df)] == "Include"
                      )[1]+wh_exclude
                      if (is.na(next_incl)){
                        next_incl <- nrow(eval_df)+1
                      }

                      # mark all the CFs after for exclusion
                      worst.row <- c(worst.row:(next_incl-1))
                      # also copy all the temp.excludes
                      eval_df$temp.exclude[(wh_exclude+1):(next_incl-1)] <-
                        eval_df$temp.exclude[wh_exclude]
                    }

                    eval_df[worst.row, exclude := temp.exclude]
                  }
                } else if (exclude_opt == 2){
                  # 2. when deciding to exclude values, if the most deviant in a
                  # string of carried forwards is flagged, check all the CFs in that
                  # string from 1:N. exclude all after the first that is flagged for
                  # exclusion when comparing to the Include before and after. do not
                  # remove things designated as include.

                  # after you've evaluated them, do you reevaluate the ones you've kept?
                  if (num.exclude == 1){
                    wh_exclude <- which(all_exclude)

                    # guard: flagged row is a CF and has an adjacent CF on
                    # either side, i.e. it sits inside a CF string
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      ((wh_exclude != nrow(eval_df) &&
                        eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward")
                       ||
                       (wh_exclude != 1 &&
                        eval_df[wh_exclude-1, "orig.exclude"] == "Exclude-Carried-Forward")
                      )
                    ){
                      # check all the carried forwards in a string
                      res <- check_cf_string(
                        copy(eval_df), wh_exclude,
                        ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
                        ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
                        min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over
                      )

                      # verdict: Exclude/Include decision for the CF string;
                      # cf_ind/first_incl/next_incl: 1-based positions bounding it
                      verdict <- res$verdict
                      cf_ind <- res$cf_ind
                      next_incl <- res$next_incl
                      first_incl <- res$first_incl

                      if (verdict == "Exclude"){
                        # mark all the CFs after for exclusion
                        all_exclude[(cf_ind-1):(next_incl-1)] <- TRUE
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(cf_ind-1):(next_incl-1)] <-
                          eval_df$temp.exclude[(cf_ind-1)]
                      } else {
                        # if the verdict is include, we need to mark them all for
                        # "exclusion" to remove -- otherwise we end up in a loop
                        all_exclude[(first_incl+1):(next_incl-1)] <- TRUE
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(next_incl-1)] <-
                          "Include"
                      }
                    }

                    # exclude all the carried forwards before the next include
                    eval_df[all_exclude, exclude := temp.exclude]
                  } else if (num.exclude > 1) {
                    # first order by decreasing temp.diff (where rep=TRUE)
                    worst.row = order(all_exclude, eval_df$temp.diff, decreasing = TRUE)[1]

                    # same CF-string check as above, seeded from the most
                    # deviant flagged row
                    wh_exclude <- worst.row
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      ((wh_exclude != nrow(eval_df) &&
                        eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward")
                       ||
                       (wh_exclude != 1 &&
                        eval_df[wh_exclude-1, "orig.exclude"] == "Exclude-Carried-Forward")
                      )
                    ){
                      # check all the carried forwards in a string
                      res <- check_cf_string(
                        copy(eval_df), wh_exclude,
                        ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
                        ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
                        min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over
                      )

                      verdict <- res$verdict
                      cf_ind <- res$cf_ind
                      next_incl <- res$next_incl
                      first_incl <- res$first_incl

                      if (verdict == "Exclude"){
                        # mark all the CFs after for exclusion
                        worst.row <- c((cf_ind-1):(next_incl-1))
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(cf_ind-1):(next_incl-1)] <-
                          eval_df$temp.exclude[(cf_ind-1)]
                      } else {
                        # if the verdict is include, we need to mark them all for
                        # "exclusion" to remove -- otherwise we end up in a loop
                        worst.row <- c((first_incl+1):(next_incl-1))
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(next_incl-1)] <-
                          "Include"
                      }
                    }

                    eval_df[worst.row, exclude := temp.exclude]
                  }
                } else if (exclude_opt == 3){
                  # 3. when deciding to exclude values, if the most deviant in a
                  # string of carried forwards is flagged, check all the CFs in that
                  # string from 1:N. exclude all after the first that is flagged for
                  # exclusion when comparing to the Include before and after. make sure
                  # remove things designated as include.
                  # Differs from option 2 only when the verdict is Exclude: the
                  # CFs before the first flagged one are additionally marked
                  # "Include" so they are settled rather than re-evaluated.

                  # after you've evaluated them, do you reevaluate the ones you've kept?
                  if (num.exclude == 1){
                    wh_exclude <- which(all_exclude)
                    # guard: flagged row is a CF inside a CF string (adjacent CF
                    # on either side)
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      ((wh_exclude != nrow(eval_df) &&
                        eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward")
                       ||
                       (wh_exclude != 1 &&
                        eval_df[wh_exclude-1, "orig.exclude"] == "Exclude-Carried-Forward")
                      )
                    ){
                      # check all the carried forwards in a string
                      res <- check_cf_string(
                        copy(eval_df), wh_exclude,
                        ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
                        ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
                        min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over
                      )

                      verdict <- res$verdict
                      cf_ind <- res$cf_ind
                      next_incl <- res$next_incl
                      first_incl <- res$first_incl

                      if (verdict == "Exclude"){
                        # mark all the CFs after for exclusion
                        all_exclude[(cf_ind-1):(next_incl-1)] <- TRUE
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(cf_ind-1):(next_incl-1)] <-
                          eval_df$temp.exclude[(cf_ind-1)]

                        # we also want to keep all the implicit excludes
                        # mark all the CFs after for "exclusion"
                        all_exclude[(first_incl+1):(cf_ind)] <- TRUE
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(cf_ind)] <-
                          "Include"
                      } else {
                        # if the verdict is include, we need to mark them all for
                        # "exclusion" to remove -- otherwise we end up in a loop
                        all_exclude[(first_incl+1):(next_incl-1)] <- TRUE
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(next_incl-1)] <-
                          "Include"
                      }
                    }

                    # exclude all the carried forwards before the next include
                    eval_df[all_exclude, exclude := temp.exclude]
                  } else if (num.exclude > 1) {
                    # first order by decreasing temp.diff (where rep=TRUE)
                    worst.row = order(all_exclude, eval_df$temp.diff, decreasing = TRUE)[1]

                    wh_exclude <- worst.row
                    if (
                      eval_df[wh_exclude, "orig.exclude"] == "Exclude-Carried-Forward" &&
                      ((wh_exclude != nrow(eval_df) &&
                        eval_df[wh_exclude+1, "orig.exclude"] == "Exclude-Carried-Forward")
                       ||
                       (wh_exclude != 1 &&
                        eval_df[wh_exclude-1, "orig.exclude"] == "Exclude-Carried-Forward")
                      )
                    ){
                      # check all the carried forwards in a string
                      res <- check_cf_string(
                        copy(eval_df), wh_exclude,
                        ewma.fields, tanner.ht.vel, who.ht.vel, exclude.levels,
                        ewma.exp, minfactor, maxfactor, banddiff, banddiff_plus,
                        min_ht.exp_under, min_ht.exp_over, max_ht.exp_under, max_ht.exp_over
                      )

                      verdict <- res$verdict
                      cf_ind <- res$cf_ind
                      next_incl <- res$next_incl
                      first_incl <- res$first_incl

                      if (verdict == "Exclude"){
                        # mark all the CFs after for exclusion
                        worst.row <- c((cf_ind-1):(next_incl-1))
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(cf_ind-1):(next_incl-1)] <-
                          eval_df$temp.exclude[(cf_ind-1)]

                        # we also want to keep all the implicit excludes
                        # mark all the CFs after for "exclusion"
                        worst.row <- c((first_incl+1):(cf_ind), worst.row)
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(cf_ind)] <-
                          "Include"
                      } else {
                        # if the verdict is include, we need to mark them all for
                        # "exclusion" to remove -- otherwise we end up in a loop
                        worst.row <- c((first_incl+1):(next_incl-1))
                        # also copy all the temp.excludes
                        eval_df$temp.exclude[(first_incl+1):(next_incl-1)] <-
                          "Include"
                      }
                    }

                    eval_df[worst.row, exclude := temp.exclude]
                  }
                }

                # the closure returns the (possibly updated) exclude vector,
                # which data.table assigns back to the filtered rows
                return(eval_df$exclude)

              })(copy(.SD))]

      # t.  If there was at least one subject who had a potential exclusion identified in step 15q, repeat steps 15b-15q. If there were no subjects with potential
      #     exclusions identified in step 15q, move on to step 16.
      # Fixed-point check: count the cumulative height exclusions so far; the
      # loop stops on the first pass that adds none.
      newly.excluded = sum(
        subj.df$exclude %in% c(
          'Exclude-Min-Height-Change',
          'Exclude-Max-Height-Change'
        )
      )

      if (newly.excluded > num.height.excluded) {
        num.height.excluded = newly.excluded
      } else {
        break
      }
    }
    # restore original row order before handing the vector back to data.table
    setkey(subj.df, index)
    return(subj.df$exclude)
  })(copy(.SD)), by = .(subjid), .SDcols = c('sex', 'agedays', 'v', 'tbc.sd', 'exclude', 'orig.exclude')]

  # process what came from step 15

  # Map the step-15 flags onto the final adjustcarryforward verdicts. The
  # three updates are order-dependent: later assignments overwrite earlier
  # ones, so non-carried-forward rows always end up "No Change".
  step15.flags <- c('Exclude-Min-Height-Change', 'Exclude-Max-Height-Change')

  # carried-forward values that step 15 did NOT flag are reinstated
  data.all[!(exclude %in% step15.flags) &
             orig.exclude == "Exclude-Carried-Forward",
           exclude := "Include"]
  # values step 15 flagged keep their original carried-forward status
  data.all[exclude %in% step15.flags, exclude := "No Change"]
  # anything that was not originally a carried-forward value is never altered
  data.all[orig.exclude != "Exclude-Carried-Forward", exclude := "No Change"]

  # }

  # formulating results and options
  # Build the result: one row per measurement considered, keyed by the
  # original row id "n", with the verdict computed above.
  acf_df <- data.frame(n = data.all$n)
  acf_df <- cbind(acf_df,
                  data.all[, exclude])
  # rename the appended column (the only one besides "n")
  colnames(acf_df)[-1] <- "adjustcarryforward"

  # return results
  # Rows dropped during preprocessing (present in data.orig but not in
  # data.all) are reported as "Not Considered". rbind.data.frame matches
  # columns by name, so the differing column order of the two frames is safe.
  return(rbind(
    acf_df,
    data.frame(
      filter(data.orig,!n %in% data.all$n) %>% mutate(adjustcarryforward = "Not Considered")  %>% select(adjustcarryforward, n)
    )
  ))
}

# Try the growthcleanr package in your browser
#
# Any scripts or data that you put into this service are public.
#
# growthcleanr documentation built on June 24, 2024, 5:16 p.m.