R/sm-algorithm.R

Defines functions s_mN s_m_p r2b getxxnorm merge_intervals split_interval

#' @title spliting interval i into 2 pieces
#' @description spliting interval i into 2 pieces
#'
#' @param i [INTEGER] interval number, should be less than NR+1
#' @param ni [INTEGER(NR)] current array with interval start point number
#'
#' @noRd
#' @return new array with interval start point number
#'
#'
#' @examples
#' ni = c(1,5,10,15,20)
#' nr = 4
#' ni
#' ni = split_interval(ni,i=2)
#' ni
#' ni = split_interval(ni,i=5)
#' ni
#' ni = split_interval(ni,i=2)
#' ni
#' ni = split_interval(ni,i=1)
#' ni
#' ni = split_interval(ni,i=length(ni)-1)
#' ni
#' ni = split_interval(ni,i=length(ni)-1)
#' ni
#' ni

split_interval <- function(ni, i) {
  stopifnot(i < length(ni)) # Otherwise ni[i+1] would fail.
  k1 <- ni[i]
  k2 <- ni[i + 1]
  jsplit <- floor((k1 + k2) / 2) # Is an index, must be an integer.

  # The following lines are taken from the original code but actually
  # are in error. They are included simply to make the tests produce the
  # identical answers in every case.
  if (jsplit >= k2 - 1) jsplit <- k2 - 2
  if (jsplit <= k1 + 1) jsplit <- k1 + 2

  stopifnot(k1 < jsplit & jsplit < k2) # Verify that the interval has actually been split.
  # Create a new vector with jsplit at position i
  c(ni[1:i], jsplit, ni[(i + 1):length(ni)])
}


#'
#' @title merge_intervals
#' @description merge interval i and i+1
#' @param i interval number of the first segment to merge
#' @param ni current array with interval start point numbers
#'
#' @noRd
#' @details Intervals to be merged


merge_intervals <- function(i, ni) {
  stopifnot(i > 1) # Can't merge the first interval
  stopifnot(i < length(ni)) # Must have an element at n[i+1]
  stopifnot(length(ni) > 2) # Only one interval so can't merge
  c(ni[1:i - 1], ni[(i + 1):length(ni)]) # Just removes element i
}


#'
#' @description Normalize a vector of samples. X and Y  will be normalized so that the first point is (0,0) and the last point is (1,1). Vector x are the input x values, must be in ascending order.
#'
#' @title Normalize a vector of samples.
#' @param x input x-axis array (should be in "chronological" order)
#' @param y input y-axis array
#' @return Output is a list with:
#' \itemize{
#'  \item anormx, the normalization value used to make the last x == 1.
#'  \item anormy, the normalization value used to make the last y == 1.
#'  \item xx,yy  vectors of x,y values interpolated to be the same length as x0.
#'  }
#'
#' @noRd
#' @details A function to normalize a vector of samples
#'

getxxnorm <- function(x, y) {
  anormx <- x[length(x)] - x[1]
  anormy <- y[length(y)] - y[1]
  xx <- (x - x[1]) / anormx
  yy <- (y - y[1]) / anormy

  list(anormx = anormx, anormy = anormy, xx = xx, yy = yy)
}



#' @title computing a norm value for the segment from point k1 to k2-1
#' @description  Computes the norm value for the segment from the point k1 to k2-1
#' @param k1 [INTEGER] start point
#' @param k2 [INTEGER] end point+1
#' @param x [REAL] input x-axis array (predictor)
#' @param y [REAL] input y-axis array (response)
#' @noRd
#' @return norm value

#' @examples
#' ni <- c( 1, 201, 402 )
#' i <- 1
#' k1 <- ni[i]
#' k2 <- ni[i+1]
#'
#'
#' r2b(k1, k2, y=latesummer$temper, x=latesummer$depth)


r2b <- function(k1, k2, x, y) {
  if (k2 - k1 <= 2) {
    r2b <- 0
    #    a = 0  # original code does not produce a or b?
    #    b = 0
  } else {
    is <- k1:(k2 - 1)
    sxx <- sum(x[is] ^ 2)
    sxy <- sum(x[is] * y[is])
    sy <- sum(y[is])
    sx <- sum(x[is])

    n <- k2 - k1

    a <- 0.0
    if (k1 > 1) {
      a <- (n * sxy - sy * sx) / (n * sxx - sx * sx)
    }
    b <- (sy - a * sx) / n
    r2b <- max(abs(y[is] - a * x[is] - b) / sqrt(a ^ 2 + 1))
  }

  return(r2b)
}




#' @title Computing linear segments for a specified error norm value.
#' @description Segments the data in x,y into the intervals given in the output array A.
#' The data in each interval can be linearly fitted within an error, given by r2b(), less than eps.
#' The dynamic arrays in R remove the need to know the number of data points, n, which is just
#' the length of x and y. Similarly Nr is no longer needed and is length(Ni)-1, one less than the length
#' of the output array.
#'
#' @param eps [real] error norm
#' @param x [real] input x-axis array, should be an increasing function of index
#' @param y [real] input y-axis array
#' @noRd
#' @return [integer] array A of indices giving data segments.
#' A[i] start of interval; A[i+1] end of interval, for any i<length(A)

s_m_p <- function(eps, x, y) {
  # This code generates Nr intervals in vector Ni
  # if(is.unsorted(x)==TRUE) stop("x is not an increasing vector")
  #  Nr=2
  #      m=NINT(FLOAT(N)/FLOAT(Nr))
  #        ni(1)=1
  #      DO i=2,Nr
  #        Ni(i)=m*(i-1)+1
  #      end DO
  #      Ni(Nr+1)=N+1 !{last interval}

  # But N4 is fixed at two so we end up with 3 elements in Ni:
  # Ni = {1,m+1,n+1}
  # Clearly, it once spread Nr intervals out uniformly over all the data.
  # We will short-circuit it instead:
  ni <- c(1, round(length(x) / 2) + 1, length(x) + 1)

  # If any interval does not meet the r2b<eps test then split it into two with split_interval().
  # Note that ni will grow in length as this process proceeds.
  i <- 1
  while (i < length(ni)) { # Rinse, repeat over all intervals in ni.
    k1 <- ni[i]
    k2 <- ni[i + 1]
    if (r2b(k1, k2, x, y) > eps) {
      # N.B. We split an interval here so ni gets one element longer
      # N.B. If an interval is added we will have to test the first
      # of the two new intervals so we don't increment i.
      ni <- split_interval(ni, i)
      changed <- TRUE
    }
    else {
      # Prior intervals all meet eps test, go to next one.
      i <- i + 1
    }
  }

  changed <- TRUE # Keep track of whether any intervals were altered...
  while (changed) { # ... because we are not finished until none were.
    changed <- FALSE # Continue the loop only if something changed.

    # All intervals now meet the test but it is possible that there are too many
    # in the sense that there may now be adjacent intervals, which if merged into one,
    # will then still meet the test.
    # Scan for such interval pairs and merge them.

    # Loop i over all possible "middle" intervals, which we may remove.
    # I am not sure at this point why the last interval is not considered for merging,
    # as it would be if i<=length(ni)-1 , which seems more natural to me.

    # Note that ni potentially changes inside the loop.
    i <- 2
    while (i <= length(ni) - 2) {
      # if( 2 <= length(ni)-2 ) {
      #   # This branch had to be added because the Fortran DO loop will not execute
      #   # if the start value is greater than the limit value but R runs backward.
      #   for( i in 2:(length(ni)-2) ) {
      k1 <- ni[i - 1]
      k2 <- ni[i + 1]
      eps1 <- r2b(k1 = k1, k2 = k2, x = x, y = y)
      if (eps1 < eps) {
        if (length(ni) - 1 > 2) { # Are there two or more intervals in the entire set?
          # Yes, so merge the interval we are looking at.
          ni <- merge_intervals(i, ni)
          changed <- TRUE
          # break # exit the loop
        } else {
          # We are here because the last two intervals tested out to merge.
          # So we can just adjust the intervals and bail out entirely because,
          # apparently the entire data set lies on a line within eps!
          # TODO: The original code doesn't seem to do this correctly. It should
          # adjust ni[2] = ni[3] and nr = 1. I think this branch has never been
          # taken in actual operation. Obviously the data set is invalid if it lies
          # entirely on a straight line, right?

          #            ni[1]=1  # TODO: Original code, but this should already be the case.
          return(ni)
        }
      }
      # }
      i <- i + 1
    }

    # If a merge was done then changed==TRUE. In this circumstance, the original
    # code then immediately attempts to merge all intervals again from 2 so we
    # do the same. This procedure seems wasteful because all the prior
    # interval pairs have already been tested for merging and don't need it, yet we
    # are going to test them all over again. For the moment we will continue to
    # do it this way and perhaps TODO change it later so that it just restarts
    # with the just-merged interval. Should get the same results faster.

    if (changed) next # short out the rest of the while(changed) loop.

    # In the original code the following is preceded with a comment:
    #  "to avoid couples"
    # I'm not precisely sure what that is supposed to mean but from the original
    # code I infer:
    #
    # If the end index of any interval is only 1 greater than the start index
    # (i.e., ni[i+1] == 1 + ni[i]) then bump the end index up by one. Due to the
    # loop this change propagates up to the last interval so we can't easily
    # replace the loop with nice vectorized R code.

    for (i in 1:(length(ni) - 1)) {
      if (ni[i + 1] - ni[i] == 1) {
        ni[i + 1] <- ni[i + 1] + 1
        changed <- TRUE
      }
    }

    # TODO: seems like we should restart the while(changed) loop at this point.

    # At this point in the original code stands the comment:
    #  "{"R" algorithm: adjusting the endpoint}"
    # I don't know what the "R algorithm" is so I'm just going to have to wing
    # comments.

    # Scan all adjacent interval pairs.
    for (i in 2:(length(ni) - 1)) {
      # First of pair (ni[i-1],ni[i]), second (ni[i],ni[i+1])
      k1 <- ni[i - 1]
      kmid <- ni[i]
      k2 <- ni[i + 1]
      # Find the error on both intervals and take the max.
      epsm <- max(r2b(k1, kmid, x, y), r2b(kmid, k2, x, y)) # TODO: I don't think this is necessary. We'll find it below anyway.

      # We are going to move the midpoint and find the split that gives the
      # best error.
      j1 <- ni[i] # Keep track of best splitpoint so far.
      # Scan all alternative splitpoints between these two enpoints.
      for (j in (k1 + 2):(k2 - 2)) {
        epsr <- max(r2b(k1, j, x, y), r2b(j, k2, x, y)) # Calculate max error
        if (epsr < epsm) {
          epsm <- epsr # This split is the best so far
          j1 <- j # Keep track of which index it is at.
        }
      }

      if (j1 != ni[i]) { # Did we find a better splitpoint?
        ni[i] <- j1 # Yes, change the split.
        changed <- TRUE
      }

      # Here in the original code stands "if (i.eq.2) epsm1=epsm"
      # but epsm1 is never used so we have ignored it.
    } # for
  } # while(changed)

  # The following statement corresponds to one in the original code but seems
  # unnecessary. The code that manipulates ni should really maintain this condition.
  # Mind you, I haven't actually checked carefully that this is true. Maybe the
  # author added it to solve a problem! TODO: maybe take it out and test sometime.
  ni[length(ni)] <- length(x)

  return(ni) # Q.E.D.
}


#' @title Computing linear segments for a specified error norm value.
#' @description Subroutine to determine the Linear SEGMENTS for a PREDEFINED NUMBER OF SEGMENTS (NR)
#' (Use this program when you want to predefine the number of segments you want to fit
#' to the data) Segments the data in x,y into the intervals given in the output array A.
#' The data in each interval can be linearly fitted within an error, given by r2b(), less than eps.
#' @param nr [integer] number of segments, fixed.
#' @param x [real] input x-axis array, should be an increasing function of index
#' @param y [real] input y-axis array
#' @noRd
#' @return [list(eps=eps,ni=ni)] where:
#' \itemize{
#'  \item eps [real] is the maximum error over all intervals,
#'  \item ni is a [vector of integer, length nr+1] vector Ni of indices giving data segments
#'  \item Ni[i] start of interval; Ni[i+1] end of interval, for any i<length(Ni)
#'  }
#'

# The dynamic arrays in R remove the need to know the number of data points, n, which is just
# the length of x and y.

s_mN <- function(nr, x, y) {

  # Initially spread intervals uniformly out over the data
  # taking care that the last interval includes the last point,
  # which otherwise might be lost due to the rounding error.
  ni <- rep(0, nr + 1)
  m <- round(length(x) / nr)
  ni <- c(m * (0:(nr - 1)) + 1, length(x) + 1)

  # TODO: A more accurate formula might be:
  # ni = round( 1 + (0:nr)*length(x)/nr )

  #  Nr=2
  #      m=NINT(FLOAT(N)/FLOAT(Nr))
  #        ni(1)=1
  #      DO i=2,Nr
  #        Ni(i)=m*(i-1)+1
  #      end DO
  #      Ni(Nr+1)=N+1 !{last interval}

  changed <- TRUE # Keep track of whether any intervals were altered...
  while (changed) { # ... because we are not finished until none were.
    changed <- FALSE # Continue the loop only if something changed.

    # All intervals now meet the test but it is possible that there are too many
    # in the sense that there may now be adjacent intervals, which if merged into one,
    # will then still meet the test.
    # Scan for such interval pairs and merge them.

    # # Loop i over all possible "middle" intervals, which we may remove.
    # # Note that ni potentially changes inside the loop.
    # if( 2 <= length(ni)-2 ) {
    #   # This branch had to be added because the Fortran DO loop will not execute
    #   # if the start value is greater than the limit value but R runs backward.
    #   for( i in 2:(length(ni)-2) ) {
    #     k1=ni[i-1]
    #     k2=ni[i+1]
    #     eps1=r2b( k1=k1, k2=k2, x=x, y=y )
    #     if( eps1<eps ) {
    #       if( length(ni)-1 > 2 ) { # Are there two or more intervals in the entire set?
    #         # Yes, so merge the interval we are looking at.
    #         ni <- merge_intervals(i, ni)
    #         changed = TRUE
    #         break # exit the for loop
    #       } else {
    #         # We are here because the last two intervals tested out to merge.
    #         # So we can just adjust the intervals and bail out entirely because,
    #         # apparently the entire data set lies on a line within eps!
    #         # TODO: The original code doesn't seem to do this correctly. It should
    #         # adjust ni[2] = ni[3] and nr = 1. I think this branch has never been
    #         # taken in actual operation. Obviously the data set is invalid if it lies
    #         # entirely on a straight line, right?
    #
    #         ni[1]=1  # TODO: Original code, but this should already be the case.
    #         return(ni)
    #       }
    #     }
    #   }
    # }

    # # If a merge was done then changed==TRUE. In this circumstance, the original
    # # code then immediately attempts to merge all intervals again from 2 so we
    # # do the same. This procedure seems wasteful because all the prior
    # # interval pairs have already been tested for merging and don't need it, yet we
    # # are going to test them all over again. For the moment we will continue to
    # # do it this way and perhaps TODO change it later so that it just restarts
    # # with the just-merged interval. Should get the same results faster.
    #
    # if( changed ) next   # short out the rest of the while(changed) loop.

    # In the original code the following is preceded with a comment:
    #  "to avoid couples"
    # I'm not precisely sure what that is supposed to mean but from the original
    # code I infer:
    #
    # If the end index of any interval is only 1 greater than the start index
    # (i.e., ni[i+1] == ni[i]) then bump the end index up by one. Due to the
    # loop this change propagates up to the last interval so we can't easily
    # replace the loop with nice vectorized R code.

    for (i in 1:(length(ni) - 1)) {
      if (ni[i + 1] - ni[i] == 1) {
        ni[i + 1] <- ni[i + 1] + 1
        changed <- TRUE
      }
    }

    # TODO: seems like we should restart the while(changed) loop at this point.

    # At this point in the original code stands the comment:
    #  "{"R" algorithm: adjusting of endpoint}"
    # I don't know what the "R algorithm" is so I'm just going to have to wing
    # comments.

    for (i in 2:(length(ni) - 1)) {
      k1 <- ni[i - 1]
      kmid <- ni[i]
      k2 <- ni[i + 1]
      epsm <- max(r2b(k1, kmid, x, y), r2b(kmid, k2, x, y))

      j1 <- ni[i]
      for (j in (k1 + 2):(k2 - 2)) {
        epsr <- max(r2b(k1, j, x, y), r2b(j, k2, x, y))
        if (epsr < epsm) {
          epsm <- epsr
          j1 <- j
        }
      }

      if (j1 != ni[i]) {
        ni[i] <- j1
        changed <- TRUE
      }

      # Here in the original code stands "if (i.eq.2) epsm1=epsm"
      # but epsm1 is never used so we have ignored it.
    } # for
  } # while(changed)

  # The following statement corresponds to one in the original code but seems
  # unnecessary. The code that manipulates ni should really maintain this condition.
  # Mind you, I haven't actually checked carefully that this is true. Maybe the
  # author added it to solve a problem! TODO: maybe take it out and test sometime.
  ni[length(ni)] <- length(x)

  return(list(eps = epsm, ni = ni)) # Q.E.D.
}

Try the rLakeAnalyzer package in your browser

Any scripts or data that you put into this service are public.

rLakeAnalyzer documentation built on March 18, 2018, 1:51 p.m.