R/fEMDDetailed.R

#' emdist::emd is quite restrictive. A more liberal alternative.
#'
#' So long as you can calculate a distance matrix between the two datasets, you
#' can use this function to calculate the earth mover's distance between them.
#' This lets you calculate EMD between any two datasets, unlike emdist::emd,
#' which can only compare numeric datasets of up to four dimensions.
#' This additionally gives you the flow weights used when transforming one
#' dataset into the other, so you can make more detailed inferences about which
#' observations contribute the most to the distance, etc.
#' 
#' The output is an object of class lpExtPtr. The lpSolveAPI library has many
#' operations that you can perform on such an object; for instance,
#' get.variables returns the flows of the mapping found by the EMD, which is a
#' useful detail for understanding which observations contribute more to the
#' distance (see the example below).
#'
#' @param SNO1 Index of the observation number from dataset one,
#' e.g. c(1,2,3,4,1,2,3,4,1,2,3,4)
#' @param SNO2 Index of the observation number from dataset two,
#' e.g. c(1,1,1,1,2,2,2,2,3,3,3,3)
#' @param Distance The distance between the data associated with the
#' respective SNO1 and SNO2 values
#' @param dtWeights1 The weight for the respective SNO1 entry, a data.table
#' with two columns - SNO1, Weight
#' @param dtWeights2 The weight for the respective SNO2 entry, a data.table
#' with two columns - SNO2, Weight
#' @examples
#' # Two random datasets of three dimensions
#' library(data.table)
#' a = data.table(matrix(runif(21), ncol = 3))
#' b = data.table(matrix(runif(30), ncol = 3))
#' # adding serial numbers to each observation
#' a[, SNO := .I]
#' b[, SNO := .I]
#' # evaluating distance between all combinations of data in the two datasets
#' a[, k := 'k']
#' b[, k := 'k']
#' dtDistances = merge(a,b,'k',allow.cartesian = T)
#' dtDistances[,
#'    Distance := (
#'       (( V1.x - V1.y) ^ 2) +
#'       (( V2.x - V2.y) ^ 2) +
#'       (( V3.x - V3.y) ^ 2)
#'    ) ^ 0.5
#' ]
#' # getting EMD between these two datasets
#' lprec = fEMDDetailed(
#'    SNO1 = dtDistances[, SNO.x],
#'    SNO2 = dtDistances[, SNO.y],
#'    Distance = dtDistances[, Distance]
#' )
#' fGetEMDFromDetailedEMD(lprec)
#' # This value should be the same as that computed by emdist::emd.
#' # emdist::emd needs the weight of each point, which our function assigns as
#' # equal, so we give each data point a weight of 1/N
#' emdist::emd(
#'    as.matrix(
#'       a[, list(1/.N, V1,V2,V3)]
#'    ),
#'    as.matrix(
#'       b[, list(1/.N, V1,V2,V3)]
#'    )
#' )
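#' # A minimal sketch of inspecting the detailed mapping: get.variables()
#' # returns one flow value per distance row, in the same order as dtDistances
#' # (provided no dummy observations were added to balance the weights), so you
#' # can see which pairs of observations carry the most mass.
#' dtDistances[, Flow := lpSolveAPI::get.variables(lprec)]
#' dtDistances[order(-Flow)][1:5, list(SNO.x, SNO.y, Distance, Flow)]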
#' @import data.table
#' @import lpSolveAPI
#' @export
fEMDDetailed = function(
   SNO1,
   SNO2,
   Distance,
   dtWeights1 = NULL,
   dtWeights2 = NULL
) {
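   # If no weights are supplied, give every observation an equal weight, scaled
   # so that both datasets end up with the same total weight (the smaller
   # dataset's observations each carry proportionally more weight).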

   if ( is.null(dtWeights1) ) {

      nScaleUpFactor = length(unique(SNO1)) / length(unique(SNO2))

      if ( nScaleUpFactor < 1 ) {

         dtWeights1 = data.table(
            SNO1 = unique(SNO1),
            Weight = 1 / nScaleUpFactor
         )

         dtWeights2 = data.table(
            SNO2 = unique(SNO2),
            Weight = 1
         )

      } else {

         dtWeights1 = data.table(
            SNO1 = unique(SNO1),
            Weight = 1
         )

         dtWeights2 = data.table(
            SNO2 = unique(SNO2),
            Weight = nScaleUpFactor
         )

      }

   }

   setDT(dtWeights1)
   setDT(dtWeights2)
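
   # If the total weights of the two sides still differ (possible with
   # user-supplied weights), add a dummy observation on the lighter side to
   # absorb the surplus. The dummy is paired with every observation on the
   # other side at a prohibitively large distance.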

   if ( dtWeights2[, sum(Weight)] > dtWeights1[, sum(Weight)] ) {

      warning('Adding dummy SNO1 to balance weights with SNO2')

      NewSNO = dtWeights1[, max(SNO1) + 1]

      dtWeights1 = rbind(
         dtWeights1,
         data.table(
            SNO1 = NewSNO,
            Weight = dtWeights2[, sum(Weight)] - dtWeights1[, sum(Weight)]
         )
      )

      SNO1 = c(
         SNO1,
         rep(NewSNO, nrow(dtWeights2))
      )

      SNO2 = c(
         SNO2,
         dtWeights2[, SNO2]
      )

      Distance = c(
         Distance,
         rep(pmax(10, max(Distance)) ^ 5, nrow(dtWeights2))
      )

   } else if ( dtWeights1[, sum(Weight)] > dtWeights2[, sum(Weight)] ) {

      warning('Adding dummy SNO2 to balance weights with SNO1')

      NewSNO = dtWeights2[, max(SNO2) + 1]

      dtWeights2 = rbind(
         dtWeights2,
         data.table(
            SNO2 = NewSNO,
            Weight = dtWeights1[, sum(Weight)] - dtWeights2[, sum(Weight)]
         )
      )

      SNO2 = c(
         SNO2,
         rep(NewSNO, nrow(dtWeights1))
      )

      SNO1 = c(
         SNO1,
         dtWeights1[, SNO1]
      )

      Distance = c(
         Distance,
         rep(100 * max(Distance), nrow(dtWeights1))
      )

   }

   # dtMatrix = dtMatrix[, list(SNO1, SNO2, Distance)]
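
   # Each row of dtMatrix is one (SNO1, SNO2) pair; the LP gets one decision
   # variable per row, representing how much mass flows from that SNO1
   # observation to that SNO2 observation.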
   dtMatrix = data.table(SNO1, SNO2, Distance)

   lprec = make.lp(
      nrow = dtMatrix[, length(unique(SNO1))] + dtMatrix[, length(unique(SNO2))] + nrow(dtMatrix), # constraints
      ncol = nrow(dtMatrix) # objective function
   )
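
   # Constraint set 1: the total flow out of each SNO1 observation must equal
   # that observation's weight.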


   for ( iSNO1 in dtMatrix[, unique(SNO1) ]) {

      add.constraint(
         lprec,
         xt = rep(
            1,
            dtMatrix[, sum(iSNO1 == SNO1)]
         ),
         type = c("="),
         rhs = dtWeights1[SNO1 == iSNO1, Weight],
         indices = dtMatrix[, which(iSNO1 == SNO1)]
      )

   }
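
   # Constraint set 2: the total flow into each SNO2 observation must equal
   # that observation's weight.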

   for ( iSNO2 in dtMatrix[, unique(SNO2) ]) {

      add.constraint(
         lprec,
         xt = rep(
            1,
            dtMatrix[, sum(iSNO2 == SNO2)]
         ),
         type = c("="),
         rhs = dtWeights2[SNO2 == iSNO2, Weight],
         indices = dtMatrix[, which(iSNO2 == SNO2)]
      )

   }
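
   # Constraint set 3: each individual flow must be non-negative.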

   for ( iRow in dtMatrix[, seq(.N) ]) {

      add.constraint(
         lprec,
         xt = 1,
         type = c(">="),
         rhs = 0,
         indices = iRow
      )

   }
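
   # The flows are continuous variables; the objective minimised is the total
   # flow-weighted distance, i.e. the cost of the optimal transport plan.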

   set.type(lprec, c(1:nrow(dtMatrix)), type = c("real"))

   set.objfn(
      lprec,
      dtMatrix[, Distance]
   )
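
   # Solve the transportation problem and return the solved model; use
   # fGetEMDFromDetailedEMD() to read off the EMD, or lpSolveAPI accessors such
   # as get.variables() to inspect the individual flows.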

   solve(lprec)

   lprec

}