AresIndexer: Observational Data Research Indexing Package

Documented in buildNetworkIndex

# @file BuildNetworkIndex.R
#
#
# Copyright 2021 Observational Health Data Sciences and Informatics
#
# This file is part of AresIndexer
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Build Network Index
#'
#' @details Generates a network index containing data source history across releases including performance, and data quality metrics.
#'
#' @param sourceFolders A vector of folder locations that contain the files
#' exported from Achilles in the ARES Option format (Achilles::exportAO)
#' to be included in the network data quality index.
#'
#' @param outputFolder The location of the Ares data folder.
#'
#' @return The JSON object that is written to the ARES data folder (outputFolder parameter).
#'
#'
#' @importFrom data.table data.table
#'
#' @export
buildNetworkIndex <- function(sourceFolders, outputFolder) {
	index <- {}
	index$sources <- list()

	sourceCount <- 0

	writeLines("Generating export query index")
	AresIndexer::buildExportQueryIndex(outputFolder)
	networkPerformanceIndex <- data.frame()

	# iterate on sources
	for (sourceFolder in sourceFolders) {
		writeLines(paste("processing source folder: ", sourceFolder))
	  source <- {}

		sourceCount <- sourceCount + 1
		releaseCount <- 0

		source$releases <- data.table()
		releaseFolders <- list.dirs(sourceFolder, recursive = F)
		releaseFolders <- sort(releaseFolders, decreasing = T)

		# as part of building the network index - build the history index for this source
		writeLines(paste("processing data source history index: ", sourceFolder))
		dataSourceHistoryIndex <- AresIndexer::buildDataSourceHistoryIndex(sourceFolder)
		write(jsonlite::toJSON(dataSourceHistoryIndex), file.path(sourceFolder,"data-source-history-index.json"))

		writeLines(paste("processing network performance index", sourceFolder))
		networkPerformanceIndex <- rbind(networkPerformanceIndex, buildNetworkPerformanceIndex(sourceFolder))


		releaseIntervalData <- data.frame()

		# iterate on source releases
		for (releaseFolder in releaseFolders) {
		  writeLines(paste("processing release folder: ", releaseFolder))
			dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")
      personResultsFile <- file.path(releaseFolder, "person.json")
      observationPeriodResultsFile <- file.path(releaseFolder, "observationperiod.json")

			# add data quality details
			if (file.exists(dataQualityResultsFile)) {
				dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)

				# add person results
				if (file.exists(personResultsFile)) {
				  personResults <- jsonlite::fromJSON(personResultsFile)
				  count_person <- sum(personResults$BIRTH_YEAR_DATA$COUNT_PERSON)
				} else {
				  writeLines(paste("missing person results file ", personResultsFile))
				}

				# add observation period results
				if (file.exists(observationPeriodResultsFile)) {
				  observationPeriodResults <- jsonlite::fromJSON(observationPeriodResultsFile)
				  obs_period_start <- min(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
				  obs_period_end <- max(observationPeriodResults$OBSERVED_BY_MONTH$MONTH_YEAR)
				} else {
				  writeLines(paste("missing observation period results file ", observationPeriodResultsFile))
				}

				source$cdm_source_name <- dataQualityResults$Metadata$cdmSourceName
				source$cdm_source_abbreviation <- dataQualityResults$Metadata$cdmSourceAbbreviation
				source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation)
				source$cdm_holder <- dataQualityResults$Metadata$cdmHolder
				source$source_description <- dataQualityResults$Metadata$sourceDescription

        source$releases <- rbind(
          source$releases,
          list(
            release_name = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d"),
            release_id = format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d"),
            cdm_version = dataQualityResults$Metadata$cdmVersion,
            vocabulary_version = dataQualityResults$Metadata$vocabularyVersion,
            dqd_version = dataQualityResults$Metadata$dqdVersion,
            count_data_quality_issues = dataQualityResults$Overview$countOverallFailed,
            count_data_quality_checks = dataQualityResults$Overview$countTotal,
            dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"),
            count_person = count_person,
            obs_period_start = format(lubridate::ym(obs_period_start),"%Y-%m"),
            obs_period_end = format(lubridate::ym(obs_period_end),"%Y-%m")
          )
        )
			} else {
				writeLines(paste("missing data quality result file ",dataQualityResultsFile))
			}

      cdmSourceFile <- file.path(releaseFolder, "cdmsource.csv")
      if (file.exists(cdmSourceFile)) {
        cdmSourceData <- read.csv(cdmSourceFile)
        releaseIntervalData <- rbind(releaseIntervalData, cdmSourceData)
      }
		}

		averageUpdateIntervalDays <- "n/a"
		if (nrow(releaseIntervalData) > 1 ) {
  		processedIndex <- releaseIntervalData %>%
  		  mutate(DAYS_ELAPSED = as.Date(CDM_RELEASE_DATE) - lag(as.Date(CDM_RELEASE_DATE))) %>%
  		  filter(!is.na(DAYS_ELAPSED))

  		averageUpdateIntervalDays <- round(as.numeric(abs(mean(processedIndex$DAYS_ELAPSED)), units="days"))
		}

		source$releases <- source$releases[order(-dqd_execution_date)]
		source$count_releases <- nrow(source$releases)
		source$average_update_interval_days <- averageUpdateIntervalDays
		index$sources[[sourceCount]] <- source
	}

	indexJson <- jsonlite::toJSON(index,auto_unbox = T)
	write(indexJson, file.path(outputFolder,"index.json"))
	if(file.exists(file.path(outputFolder, "network-performance.csv"))) {
	  file.remove(file.path(outputFolder, "network-performance.csv"))
	}
	data.table::fwrite(networkPerformanceIndex, file=paste0(outputFolder, "/network-performance.csv"))
  invisible(indexJson)
}