R/computeDuplicity.R

Defines functions computeDuplicity

Documented in computeDuplicity

#'@title Computes the duplicity probability for each device.
#'
#'@description Computes the duplicity probability for each device using three 
#'different methods: pairs, 1-to-1 and trajectory. The theory behind these 
#'methods is described in detail in \href{https://webgate.ec.europa.eu/fpfis/mwikis/essnetbigdata/images/f/fb/WPI_Deliverable_I3_A_proposed_production_framework_with_mobile_network_data_2020_05_31_draft.pdf}{WPI
#'   Deliverable 3} and in the paper \emph{An end-to-end statistical process 
#'   with mobile network data for Official Statistics}.
#'
#'@param method The method used to compute the duplicity probability. It could 
#'have one of the following values: "pairs", "1to1", "trajectory".
#'
#'@param gridFileName The name of the file with the grid parameters. This file 
#'could be the one generated by the simulation software or can be created with 
#'any text editor. The grid file generated by the simulation software has
#'the following columns: \code{Origin X, Origin Y, X Tile Dim, Y Tile Dim, No Tiles X, No Tiles Y}. 
#'We are interested only in the number of rows and columns and the tile size on 
#'OX and OY axes. Therefore, the file provided as input to this function should 
#'have at least the following 4 columns: \code{No Tiles X} ,\code{No Tiles Y}, 
#'\code{X Tile Dim} and \code{Y Tile Dim}.
#'
#'@param eventsFileName The name of the file with the network events to be used. 
#'Depending on the parameter \code{simulatedData} it could be a .csv file coming 
#'from the simulation software or from a real MNO. In case the file comes from 
#'the simulation software it contains following columns: \code{time, antennaID, eventCode, deviceID,
#'  x, y, tile} Only the first 4 columns are used, the rest are ignored.
#'
#'@param signalFileName The name of the .csv file that contains the signal 
#'strength/quality for each tile in the grid. Depending on the parameter 
#'\code{simulatedData} it could be a .csv file coming from the simulation 
#'software or from a real MNO. In case the file comes from the simulation 
#'software the data are organized as a matrix with the number of rows equals to 
#'the number of antennas and the the following columns: \code{Antenna ID, Tile 0, Tile 1, ... Tile
#'(N-1)}. On the first column there are the antenna IDs and on the rest of the 
#'columns the corresponding signal strength/quality for each tile in the grid.
#'
#'@param cellsFileName The name of the file where the coverage areas of antennas 
#'are to be found. It is a should be a .csv file with two values on each row: 
#'the antenna ID and a WKT string representing a polygon (i.e. it should start 
#'with the word POLYGON) which is the coverage area of the corresponding antenna. 
#'This area is also called the antenna cell.
#'
#'@param simulatedData If TRUE the input data are provided by the simulation 
#'software, otherwise real data is used. 
#'
#'@param simulationFileName The name of the file used to define a simulation 
#'scenario. It is the file that was provided as input for the simulation software. 
#'This file is required only if simulatedData is TRUE. 
#'
#'@param netParams This parameter is required if simulatedData is FALSE, i.e. 
#'the \pkg{deduplication} package uses real mobile network data. In this case, 
#'\code{netParam} is a list with two elements: \code{conn_threshold} and 
#'\code{prob_sec_mobile_phone}. \code{conn_threshold} is the minimum value of 
#'the signal strength/quality that can be used to connect a mobile device to an
#'antenna. If the signal in a region is below this value the region is 
#'considered out of the coverage area. \code{prob_sec_mobile_phone} is the 
#'probability of a person to have two mobile devices.
#'
#'@param path The path where the files with the posterior location probabilities
#'for each device are to be found. This parameter is needed only if the 
#'"trajectory" method is used.
#'
#'@param gamma This value is used only for the "trajectory" method and is the 
#'factor used in the compariosn between the mode of Delta X and Delta Y and the 
#'dispersion radius. The default value is 0.5.
#'
#'@param aprioriProbModel This parameter is used to initialize the apriori 
#'probabilities for the HMM model for each device. The default value is NULL 
#'which means that by default the models are initialized with the steady state.
#'
#'@param aprioriJointModel This parameter is used to initialize the apriori 
#'probabilities for the joint HMM models for each pair of two devices. The 
#'default value is NULL which means that by default the models are initialized 
#'with the steady state.
#'
#'@param lambda This parameter is used in combination with the "1-to-1" method. 
#'If it is NULL (which is the default value) the computations follow the 
#'description given in \href{https://webgate.ec.europa.eu/fpfis/mwikis/essnetbigdata/images/f/fb/WPI_Deliverable_I3_A_proposed_production_framework_with_mobile_network_data_2020_05_31_draft.pdf}{WPI
#'   Deliverable 3}, otherwise the computations proceed as they are described in the 
#'   \emph{An end-to-end statistical process with mobile network data for Official Statistics} paper.
#'
#'@param handoverType The handover mechanism used by the mobile network. It could 
#'have two values: "strength" or "quality". The default value is "strength". 
#'This parameter is used to compute the emission probabilities for the HMM
#'models using the values from the signal file. If this file contains the signal 
#'strength then the \code{handoverType} should be set to "strength" and if the 
#'file contains the values of the signal quality then the parameter should be set to
#'"quality".
#'
#'@param emissionModel This parameter dictates how the emission probabilities 
#'for the HMM models are computed. It can have two values: "RSS" and "SDM". 
#'Normally, the emission probabilities are computed using the signal values from 
#'the signal file assuming that if the \code{handoverType} is set to "strength" 
#'then the signal file contains the strength values and if the \code{handoverType}
#'is set to "quality" the signal file contains the quality of the signal. For 
#'demonstrative purposes the package supports unusual combinations. If \code{handoverType}
#' is set to "strength", the signal file contains the signal strength but the 
#' \code{emissionModel} is set to "SDM" the signal strength values are transformed 
#' to signal quality and then the computation of the emission probabilities uses 
#' these transformed values. If \code{handoverType} is set to "quality", the signal 
#' file contains the signal quality but the \code{emissionModel} is set to "RSS", 
#' the signal quality values are transformed to signal strength and then the 
#' computation of the emission probablities uses these transformed values.
#'
#'@param antennaFileName The name of the xml file containing the technical 
#'parameters of the antennas needed to transform the signal quality into signal 
#'strength and the other way around. This is an input file of the simulation 
#'software. An example file is included in this package. The default value is 
#'NULL because this file is only needed to make unusual combinations between 
#'\code{handoverType} and the way the emission probabilities are computed.
#'
#'@param prefix The file name prefix for the files with posterior location 
#'probabilities.
#'
#'@return a data.table object with two columns: \code{deviceID} and \code{dupP}. 
#'On the first column there are deviceIDs and on the second column the 
#'corresponding duplicity probability, i.e. the probability that a device is in 
#'a 2-to-1 correspondence with its holder.
#'
#'
#'@export
computeDuplicity <- function(method, gridFileName, eventsFileName, signalFileName,  antennaCellsFileName = NULL, simulatedData = TRUE,  simulationFileName, netParams = NULL, path = NULL, gamma = 0.5, aprioriProbModel = NULL, aprioriProbJointModel = NULL, lambda = NULL, handoverType = 'strength', emissionModel = NULL, antennaFileName = NULL, prefix = NULL) {
  
  out_duplicity <- NULL
  tryCatch ({
    if(method != 'trajectory' & method != "1to1" & method != 'pairs')
      stop(paste(method, " method unknown!"))
    
    if(!file.exists(gridFileName))
      stop(paste0(gridFileName, " doesn't exist"))
    
    if(!file.exists(eventsFileName))
      stop(paste0(eventsFileName, " doesn't exist"))
    
    if(!file.exists(signalFileName))
      stop(paste0(signalFileName, " doesn't exist"))
    
    if(method == "pairs")
      if(!file.exists(antennaCellsFileName))
        stop(paste0(antennaCellsFileName, " doesn't exist. If you use \"pairs\" method you should provide a file with antenna cells."))
    
    if(simulatedData == TRUE)
      if (is.null(simulationFileName) | !file.exists(simulationFileName))
        stop(paste0(simulationFileName, " dooesn't exist. If you use simulated data you should provide the simulation input file."))
    
    if(simulatedData == FALSE & is.null(netParams))
      stop("In case of using real data you should provide a list with two params: the minimum value of the signal detectable by mobile devices and
         the probability of a person to have two mobile devices")
    
    gridParams <-readGridParams(gridFileName)
    events <- readEvents(eventsFileName)
    if(simulatedData == TRUE)
      simParams <-readSimulationParams(simulationFileName)
    else
      simParams <- netParams
    
    
    devices <- getDeviceIDs(events)
    connections <- getConnections(events)
    emissionProbs <- getEmissionProbs(gridParams$nrow, gridParams$ncol, signalFileName, simParams$conn_threshold, handoverType, simulatedData = TRUE, emissionModel, antennaFileName)
    jointEmissionProbs <- getEmissionProbsJointModel(emissionProbs)
    if(is.null(aprioriProbModel)) {
      model <- getGenericModel(gridParams$nrow, gridParams$ncol, emissionProbs)
    }
    else {
      model <- getGenericModel(gridParams$nrow, gridParams$ncol, emissionProbs, initSteady = FALSE, aprioriProb = aprioriProbModel)
    }
    if(is.null(aprioriProbJointModel)) {
      modelJ <- getJointModel(gridParams$nrow, gridParams$ncol, jointEmissionProbs)
    } else {
      modelJ <- getJointModel(gridParams$nrow, gridParams$ncol, jointEmissionProbs, initSteady = FALSE, aprioriJointProb = aprioriProbJointModel)
    }

    if(method == "pairs" | method == "trajectory") {
      coverarea <- readCells(antennaCellsFileName)
      antennaNeigh <- antennaNeighbours(coverarea)
      P1a <- aprioriDuplicityProb(simParams$prob_sec_mobile_phone, length(devices))
      pairs4dup<-computePairs(connections, length(devices), oneToOne = FALSE, antennaNeighbors = antennaNeigh)
      if (method == "pairs") {
        ll <- fitModels(length(devices), model,connections)
        out_duplicity <- computeDuplicityBayesian(method, devices, pairs4dup, modelJ, ll, P1 = P1a)
      }
      else {
        T<-sort(unique(events[,1][[1]]))
        out_duplicity <-computeDuplicityTrajectory(path, prefix, devices, gridParams, pairs4dup, P1 = P1a , T, gamma = gamma)
      }
    }
    else if(method == "1to1"){
      pairs4dup<-computePairs(connections, length(devices), oneToOne = TRUE)
      ll <- fitModels(length(devices), model,connections)
      if(!is.null(lambda)) {
        out_duplicity <- computeDuplicityBayesian(method, devices, pairs4dup, modelJ, ll, P1 = NULL, Pii = NULL, init = TRUE, lambda = lambda)
      }
      else {
        Piia <- aprioriOneDeviceProb(simParams$prob_sec_mobile_phone, length(devices))
        out_duplicity <- computeDuplicityBayesian(method, devices, pairs4dup, modelJ, ll, P1 = NULL, Pii = Piia)
      }
    }
  }, error = function(err){
    print(err)
    rm(list=setdiff(ls(), "out_duplicity"))
    
  },
  finally = {
    rm(list=setdiff(ls(), "out_duplicity"))
  })
  
  return(out_duplicity)
}
bogdanoancea/deduplication documentation built on Dec. 2, 2020, 11:22 p.m.