NITPicker: Finds the Best Subset of Points to Sample

# Hello, world!
#
# This is an example function named 'hello'
# which ####prints 'Hello, world!'.
#
# You can learn more about package authoring with RStudio at:
#
#   http://r-pkgs.had.co.nz/
#
# Some useful keyboard shortcuts for package authoring:
#
#   Build and Reload Package:  'Cmd + Shift + B'
#   Check Package:             'Cmd + Shift + E'
#  Test Package:              'Cmd + Shift + T'

#' Generate Perturbations
#' 
#' Find curves similar to a set of example curves.  This function takes as input a set of example curves, and uses them to infer a probability distribution of curves.  \code{numPert} curves are sampled from this probability distribution.
#' @param training This is a numerical matrix of training data, where the rows represent different samples, columns represent different time points (or points on a single spatial axis), and the values correspond to measurements
#' @param tp A numerical vector of time points (or spatial coordinates along a single axis)
#' @param iterations a positive integer, representing the maximum number of iterations employed during time warping (see time_warping in fdasrvf library)
#' @param spline a positive integer, representing the degree of the B-spline interpolation when calculating values at the new, evenly spaced knot positions
#' @param knots a positive integer-- for time warping to work optimally, the points must be evenly sampled.  This determines how many points do we evenly sample before conducting time warping
#' @param numPert a positive integer, representing the number of sampled curves to output.
#' @return An fdawarp object (see fdasrvf library)
#' @examples 
#' mat=CanadianWeather$monthlyTemp
#' generated=generatePerturbations(mat, c(1:length(mat[,1])))

generatePerturbations<- function(training, tp, iterations=20, spline=3, knots=100, numPert=20){
    #interpolate points using splines
    newX=seq(min(tp), max(tp), (max(tp)-min(tp))/knots)
    training_interpolated=apply(training, 2, function(i){deBoorWrapper(newX, tp, i, spline)})
    #do time warping
    tw= time_warping(training_interpolated, newX, MaxItr = iterations, showplot=FALSE)
    #sample from the distribution of functions
    gm=gauss_model(tw, n=numPert)
    gm
}

# Helper function: calculates the optimisation matrix value 
optimisationMatrixValue <- function(i, j, edge, max_value, max_link, tp, y, w, multipleGenes=F, iter=20, knots=100, numPerts=1000){
if(!multipleGenes){
    #if single gene, use this method
    optimisationMatrixValueSingleGene(i, j, edge, max_value, max_link, tp, y, w)
}else{
    #take sum over the genes
    sum(sapply(c(1:dim(y)[1]), function(index){
            y1=y[index,] #check orientation
            w1_id=w[[index]]
          
                optimisationMatrixValueSingleGene(i, j, edge, max_value, max_link, tp, y1, w1_id)
                
                }))

}
    }

#Helper function: gets scoreF1
optimisationMatrixValueSingleGene <- function(i, j, edge, max_value, max_link, tp, y, w){
    #case 1: start node
    if(i==0){
        #case 1.5: start node AND end node:
        if(j>length(tp)){0}else{

        scoreF1(c(tp[1], tp), c(y[j], y), rbind(w[j,],w),  tp[1], tp[j], c(1, j+1))

        }
    }else{
        if(edge==1){
            Inf
        }else{
        index=backtrace(max_link, i, edge) 

        #case 2: end node
        if(j>length(tp)){
            if(i==length(tp)){max_value[i, edge-1]}else{
                
                scoreF1(c(tp, tp[j-1]), c(y, y[i]), rbind(w, w[i,]),  tp[i], tp[j-1], c(index, i, j))+max_value[i, edge-1]
                
            #case 3: just intermediate nodes
             }}else{
            scoreF1(tp, y, w,  tp[i], tp[j], c(index, i, j))+max_value[i, edge-1]
            }
        }
    }
   }


#' Find best subset of points for follow-up experiments
#' 
#' findPath is the main function of the NITPicker package-- it finds the best subset of points to sample from a time course (or spatial axis, along a single axis), based on a set of example curves.  
#'
#' @param tp A numerical vector of time points (or spatial coordinates along a single axis)
#' @param y A numerical vector of measurements (of the control)-- this can be set to \code{rep(0, length(tp))} for calculating the shape of the curves only
#' @param training Unless multiple is true, this is a numerical matrix of training data, where the rows represent different samples, columns represent different time points (or points on a single spatial axis), and the values correspond to measurements.  If multiple is true, this is a list of numerical matrices, each representing a different class of curves (for instance, different genes).
#' @param numSubSamples integer that represents the number of time points that will be subsampled
#' @param multiple A boolean that designates whether training is a matrix or a list of matrices
#' @param spline A positive integer representing the spline used to interpolate between knots when generating perturbations.  Note that this does NOT designate the spline used when calculating the L2-error.
#' @param resampleTraining A boolean designating whether the exact training data should be used (False) or whether a probability distribution of curves should be generated and training curves resampled (True).
#' @param iterations A positive integer, representing the maximum number of iterations employed during time warping (see time_warping in fdasrvf library)
#' @param knots A positive integer-- for time warping to work optimally, the points must be evenly sampled.  This determines how many points do we evenly sample before conducting time warping
#' @param numPert a positive integer, representing the number of sampled curves to output.
#'
#' @return An integer vector of the indices of the time points selected to be subsampled.  The actual time points can be found by \code{tp[output]}.  The length of this vector should be \code{numSubSamples}.
findPath <- function(tp, y, training, numSubSamples, multiple=F, spline=1, resampleTraining=T, iter=20, knots=100, numPerts=1000){
    perts=NA
    w=NA
    
    #Usually you would want to use the fdasrvf package to generate new pdf from the training set and sample curves from that
    if(resampleTraining){
        
    #if there are multiple genes, then the procedure must take place on each gene, trained separately
    if(multiple){
        w=lapply(c(1:length(training)), function(index){
            perts=generatePerturbations(training[[index]], tp, iterations=iter, spline=1, knots=knots, numPert=numPerts)
            a=apply(perts$ft, 2, function(i){
                deBoorWrapper(tp, perts$time, i, spline)
            })

            a/sum(a)
        })

    }else{
    perts=generatePerturbations(training, tp, iterations=iter, spline=spline, knots=knots, numPert=numPerts)

    w=apply(perts$ft, 2, function(i){
        deBoorWrapper(tp, perts$time, i, spline)
    })

    }
    }else{
    #Sometimes a different strategy might be used to sample example curves-- 
    #in this case, training and training2 can just be set to the new set of perturbations
        w=training
    }

     min_score=matrix(0, nrow=1+length(tp), ncol=numSubSamples+1)
     min_link=matrix(0, nrow=1+length(tp), ncol=numSubSamples+1)

    # #Make an N x N x E
     for(j in c(1:(1+length(tp)))){
         temp=sapply(c(0:(j-1)), function(i){
             sapply(c(1:(1+numSubSamples)), function(edge){
                 if(edge>(i+1)){Inf}else{
                 optimisationMatrixValue(i, j, edge, min_score, min_link, tp, y, w, multipleGenes=multiple)
                 }})
         })

         print(temp)


         min_link[j,]=apply(temp, 1, function(k){

             which.min(k)[1]-1}
            )

         min_score[j,]=apply(temp, 1, function(k){

          min(k)})


     }

     print(min_link)
     print(min_score)
     backtrace(min_link, length(min_link[,1]), length(min_link[1,]))

     ######If you allow splines for calculating the L2-error you would need to add `step 2`
     ##STEP2: do correction for first 'spline' time points that were subset, by running the algorithm backwards

}


#internal function for cal
scoreF1 <- function(tp, y, w,  start, stop, index, numSubdivisions=500){
     sum(apply(w, 2, function(i){
        integrate(F1, start, stop, tp=tp, g=y, w=i, index=index, subdivisions=numSubdivisions)$value
    }))}



#' L2-error
#'
#'Given two functions y1(t) and y2(t), this function finds the L2-distance between the following two curves:
#'a) y1(t)-y2(t) sampled at all time points (\code{tp}) 
#'b) y1(t)-y2(t) sampled at the time points indexed by \code{index} (\code{tp[index]}).
#'Note that by setting \code{y2} to \code{rep(0,length(tp))}, this function can be used to estimate the L2-error in the shape of \code{y1}. 
#'
#' @param tp A numerical vector of time points (or spatial coordinates along a single axis)
#' @param y1 A numerical vector of measurements (of the control)
#' @param y2 A numerical vector of measurements (of the experimental condition)
#' @param start A numerical value representing the start time (or spatial coordinate) of the integration
#' @param stop A numerical value representing the end time (or spatial coordinate) of the integration
#' @param index A vector of positive integers representing the indices of \code{tp} that we subsample
#' @param numSubdivisions This can be adjusted to ensure the integration doesn't take too long, especially if we aren't overly concerned with rounding errors.
#'
#' @return A numeric value-- the L2 error.
L2 <-function(tp, y1, y2, start, stop, index, numSubdivisions=2000){

   integrate(meanSqr, start, stop, tp=tp, g=y1, w=y2,
             tp2=c(tp[1], tp[index], tp[length(tp)]),
             g2=c(y1[index[1]], y1[index], y1[index[length(index)]]),
             w2=c(y2[index[1]], y2[index], y2[index[length(index)]]),
             spl=1, subdivisions=numSubdivisions)$value
}

#helper function
 meanSqr <-function(x, tp, g, w, tp2, g2, w2, spl){

    temp=deBoorWrapper(x, tp, g, spl)-deBoorWrapper(x, tp, w, spl)-(deBoorWrapper(x, tp2, g2, spl)-deBoorWrapper(x, tp2, w2, spl))
    temp*temp
}
#helper function
F1 <-function(x, tp, g, w, index, spl=1){
    temp=deBoorWrapper(x, tp, g, spl)-deBoorWrapper(x, tp, w, spl)-(deBoorWrapper(x, tp[index], g[index], spl)-deBoorWrapper(x, tp[index], w[index], spl))
    temp*temp
    }
#helper function
deBoorWrapper <- function(x, tp, values, spline){
    if(length(tp)==2 & tp[1]==tp[2]){
        0
    }

    #check input values
    if(spline<0){
        print('spline must be >0')
    }

    sapply(x, function(i){
        k_smaller=which(tp<i)
        k=k_smaller[length(k_smaller)]

        if(i==tp[1]){values[1]}else{
        if(k<spline){
            k_smaller=which(sort(-tp)<(-i))
            k=k_smaller[length(k_smaller)]
            temp=deBoor2(k, -i, sort(-tp), values[length(values):1], spline)
            temp
        }else{
            deBoor2(k, i, tp, sapply(values, function(bl){as.numeric(bl)}), spline)
        }
        }
    })

}

#helper function
deBoor2 <- function(k, x, t, b, p){
    indices=seq(0, p, 1)+ k - p+1
    d=b[indices]

    for (r in indices[0:(length(indices)-1)]){
        temp=indices[which(indices>r)]
        for (j in temp[length(temp):1]){

            alpha = (x - t[r]) / (t[j] - t[r])
                     oldD=d[j-min(indices)+1]

            d[j-min(indices)+1] = (1.0 - alpha) * d[r-min(indices)+1] + alpha * d[j-min(indices)+1]

        }
    }
 d[p+1]
}


#Helper function: 
#finds backtrace from the max_index matrix
backtrace<- function(max_index, index, edge, until=NA){
    if(max_index[index, edge]==0){
        c();
    }else{
        if(is.na(until)){
        c(backtrace(max_index, max_index[index, edge], edge-1),max_index[index, edge])
        }else{
            if(until==0){
                c(max_index[index, edge])
            }else{
                c(backtrace(max_index, max_index[index, edge], edge-1, until=until-1),max_index[index, edge])
            }
        }
    }
}