R/sims2Parser.R

Defines functions sims2Parser

Documented in sims2Parser

#' @export
#' @title Special purpose parsing function for SIMS2 aggregate data with duplicates
#'
#' @description sims2Parser will parse a semi-compliant DHIS2  CSV file and transform it into a standard data
#' frame which can be used in subsequent DATIM validation routines. The difference with d2Parser is that
#' an extra (non-standard) field will be introduced to record the SIMS visit. This will in turn be
#' used to deduplicate visits which occur at the same site + mechanism + date combination. This function
#' will automatically decollide these types of visits 
#'
#' @param filename Location of the payload to be imported. Should be a valid SIMS2 aggregate file. See the separate spec for details
#' @param dataElementIdScheme Should be one of either code, name, shortName or id. The default is "code". If this paramater is "id",
#' then the Data elements are assumed to be already specififed as UIDs.
#' @param orgUnitIdScheme Should be one of either code, name, shortName or id. If this paramater is "id",
#' then the organisation units are assumed to be already specififed as UIDs
#' @param idScheme Remapping scheme for category option combos
#' @param invalidData Exclude any (NA or missing) data from the parsed file?
#' @param hasHeader TRUE by default. Should be set to FALSE if the file does not contain header information.
#' @param isoPeriod period to be used for date shift boundaries. If not provided, no boundaries are set.
#' 
#' @return Returns a data frame of  "dataElement","period","orgUnit","categoryOptionCombo","attributeOptionCombo","value"
#'
#' @note function(filename="/home/me/foo.csv",dataElementIdScheme="code",orgUnitIdScheme="code",idScheme="id",invalidData=FALSE)
#' Note that all values will be returned as characters.
#'
sims2Parser <-
  function(filename,
           dataElementIdScheme = "code",
           orgUnitIdScheme = "id",
           idScheme = "id",
           invalidData = FALSE,
           hasHeader=TRUE,
           isoPeriod=NA) {
    
    header <-
      c(
        "dataElement",
        "period",
        "orgUnit",
        "categoryOptionCombo",
        "attributeOptionCombo",
        "value",
        "assessmentid"
      )
    
    #We need to be a global user.
    organisationUnit <- getOption("organisationUnit")
    assertthat::assert_that(organisationUnit =="ybg3MO3hcf4")
    data <- read.csv(filename,na="",stringsAsFactors = FALSE, header=hasHeader,quote='"',row.names = NULL,sep=",")
    #TODO: Centralize all of this between the SIMS parser and general parser
    #Number of lines in the file number equal the number of records
    if (nrow(data) == length(readLines(filename))-as.numeric(hasHeader)) {
    warning("Number of records does not equal the number of lines. You may have empty lines or line breaks!")
    }
    #Ensure we have the correct number of columns
    data<-data[,1:length(header)]
    #Get number of columns and assign the header
    names(data)<-header
    #Data element, period and orgunit must be specified
    missing_required<-!complete.cases(data[,1:3])
    if (sum(missing_required) > 0) { 
      msg<-paste0("File contains rows with missing required fields in rows ", 
                  paste(which(missing_required == TRUE),sep="",collapse = ","),". These rows will be excluded.")
      warning(msg)
    }
    data<-data[!missing_required ,] 
    
    data <- data[, header[header %in% names(data)]]
    if (orgUnitIdScheme != "id") {
      data$orgUnit <-
        remapOUs(
          data$orgUnit,
          organisationUnit,
          mode_in = orgUnitIdScheme,
          mode_out = "id"
        )
    }

    #Filter out any thing which does not correspond to the orgUnit mapping scheme
    ou_non_match<-unique(data$orgUnit)[!(unique(data$orgUnit) %in% getOrganisationUnitMap()$id)]
    if ( length(ou_non_match) > 0 ) {
      msg<-paste0("The following orgunits are not valid and will be removed",paste(ou_non_match,sep="",collapse=","))
      warning(msg)
      data<-data[!(data$orgUnit %in% ou_non_match),]
    }
    
    
    if (dataElementIdScheme != "id") {
      data$dataElement <-
        remapDEs(
          data$dataElement,
          mode_in = dataElementIdScheme,
          mode_out = "id"
        )
    }

    de_non_match<-unique(data$dataElement)[!(unique(data$dataElement) %in% getDataElementMap()$id)]
    if ( length(de_non_match) > 0 )  {
      msg<-paste0("The following data elements are not valid and will be removed: ",paste(de_non_match,sep="",collapse=" , "))
      warning(msg)
      data<-data[!(data$dataElement %in% de_non_match), ]  }
    
    

    if (idScheme != "id") {
      data$attributeOptionCombo <- remapMechs(
        data$attributeOptionCombo,
        organisationUnit = organisationUnit,
        mode_in = idScheme,
        mode_out = "id"
      )
    }

    mechs_non_match<-unique(data$attributeOptionCombo)[!(unique(data$attributeOptionCombo) %in% getMechanismsMap()$id)]
    if ( length(mechs_non_match) > 0 ) {
      msg<-paste0("The following mechanisms are not valid and will be removed: ",paste(mechs_non_match,sep="",collapse=" , "))
      warning(msg)
      data<-data[!(data$attributeOptionCombo %in% mechs_non_match), ] 
    }
    
    #Data frame needs to be completely flattened to characters
    data <- plyr::colwise(as.character)(data)
    
    invalid <-
      function(x) {
        sapply(x, function(x) {
          is.na(x) || missing(x) || x == "" || x=="NULL"
        })
      }
    invalid.rows <-
      apply(apply(data, 2, invalid), 1, sum) != 0 #Anything which is not complete.
    if ( sum(invalid.rows) ) {
      msg<-paste(sum(invalid.rows),
                 " rows are incomplete. Please check your file to ensure its correct.")
      warning(msg)
    }
    
    if (!invalidData) {
      data <- data[!invalid.rows, ]
    }
    
    #TODO: End centralization here. 
    #TODO: Functionalize this with dateShifter

    #if period is provided, use it for boundaries
    if(!is.na(isoPeriod)){
      period<-getPeriodFromISO(isoPeriod);
    } else {
      period<-NA
    }
    
    #Start to shift the data
    data_shifted<-data[0,]
    assessments<-unique(data[,c("period","orgUnit","attributeOptionCombo","assessmentid")])
    #Are there any assessment ids which occur on different dates?
    #This should not be possible
    if ( sum(duplicated(assessments$assessmentid)) != 0 ) {stop("Duplicate assessment IDS were found.")}
    assessments_ou_acoc<-aggregate(. ~ orgUnit + attributeOptionCombo,data=assessments[,-4],length)
    #Possible collisions
    assessments_ou_acoc_dups<-assessments_ou_acoc[assessments_ou_acoc$period > 1,]
    asessments_collisions<-assessments[0,]
    if (nrow(assessments_ou_acoc_dups)> 0 ) {
    for (i in 1:nrow(assessments_ou_acoc_dups)) {
      foo<-assessments_ou_acoc_dups[i,]
      bar<-assessments[assessments$orgUnit==foo$orgUnit & assessments$attributeOptionCombo==foo$attributeOptionCombo,]
      #Are there any duplicated dates?
      
      if (sum(duplicated(bar$period)) > 0) {
        dates<-as.Date(unique(strptime(bar$period,"%Y%m%d",tz = "UTC")))
        start_date<-min(dates)
        end_date<-max(dates)
        #We need a minumum pool of dates
        if ( (start_date - end_date) < nrow(bar) ) {
          end_date<-start_date + nrow(bar)
        }
        #make sure end date does not go beyond boundaries
        if(!is.na(period)){
          if(end_date > period$endDate){
            start_date = start_date - (end_date - period$endDate)
          }
          if(start_date < period$startDate){
            warning("Shifting results in periods outside of the defined isoPeriod")
          }
        }

        possible_dates<-seq(start_date,end_date,by="day")
        #Remove any dates which are already used
        possible_dates<-possible_dates[!(possible_dates %in% dates)]
        
        duplicated_dates<-which(duplicated(bar$period))
        for (j in 1:length(duplicated_dates)) {
          this_date<-as.Date(bar$period[duplicated_dates[j]],"%Y%m%d")
          #Which date is closest?
          date_distance<-methods::as(abs(possible_dates-this_date),"integer")
          replacement_date_n<-which(date_distance == min(date_distance))
          
          bar$period[duplicated_dates[j]]<-format(possible_dates[replacement_date_n],"%Y%m%d")
          #Remove it from the pool
          possible_dates<-possible_dates[-replacement_date_n]
        }
      }
      asessments_collisions<-rbind(asessments_collisions,bar) }
    #Non-collisions
    
    data_clear<-merge(data,assessments_ou_acoc[assessments_ou_acoc$period == 1,c("orgUnit","attributeOptionCombo")],
                      by=c("orgUnit","attributeOptionCombo"))
    data_not_clear<-merge(data,asessments_collisions,
                          by=c("orgUnit","attributeOptionCombo","assessmentid"))
    data_not_clear<-data_not_clear[,c("orgUnit","attributeOptionCombo","dataElement","period.y","categoryOptionCombo","value","assessmentid")]
    names(data_not_clear)<-names(data_clear)
    data_shifted<-rbind(data_clear,data_not_clear)
    assertthat::assert_that(nrow(data) == nrow(data_shifted))
    data_shifted$comment<-data_shifted$assessmentid
    data_shifted$storedby<-NA
    data_shifted$timestamp<-NA
  } else { data_shifted<-data 
    data_shifted$storedby<-NA
    data_shifted$timestamp<-NA
    data_shifted$comment<-data_shifted$assessmentid
    }
    
    header_final <-
      c(
        "dataElement",
        "period",
        "orgUnit",
        "categoryOptionCombo",
        "attributeOptionCombo",
        "value",
        "storedby",
        "timestamp",
        "comment"
      )
    data_shifted <- data_shifted[, header_final[header_final %in% names(data_shifted)]]
    return(data_shifted)
  }
jason-p-pickering/datim-validation documentation built on Nov. 8, 2018, 3:30 p.m.