R/meta.R

# Contains functions for manipulating the metadata file


#' Compute a Checksum of the Metadata
#'
#' To quickly compare whether the metadata of two datatrack projects is the same,
#' we can compute a checksum which ignores columns such as the date.
#'
#' @param ignore.cols character vector of metadata column names to exclude from the hash
#' @param algo string name of the hash algorithm, passed to \code{digest::digest}
#' @return string the checksum of the metadata dataframe
#' @details
#' After removing the columns that don't matter for meta comparisons
#' (e.g. the date, system and callstack columns of the metadata), performs a hash of the dataframe and returns it.
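#' @examples
#' # A minimal usage sketch (illustrative only): assumes a datatrack project has been
#' # configured so that ReadMeta() can locate meta.csv, and that 'digest' is installed.
#' \dontrun{
#' checksum <- GetChecksum()
#' # a different algorithm or ignore set gives a different hash
#' GetChecksum(ignore.cols = c('date', 'callstack', 'system', 'annotations'), algo = 'md5')
#' }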
#' @export
GetChecksum <- function (ignore.cols = c('date', 'callstack', 'system'), algo = 'sha1') {
    if (!requireNamespace("digest", quietly = TRUE)) {
        stop("Package \"digest\" needed to generate checksum. Please install it",
             call. = FALSE)
    }
    meta <- ReadMeta()
    # logical indexing avoids dropping every column when none of ignore.cols are present
    meta <- meta[, !(names(meta) %in% ignore.cols), drop = FALSE]
    return(digest::digest(meta, algo = algo))
}


#' Reads the csv of metadata from disk
#' @details
#' If the file doesn't exist, returns an empty metadata dataframe with the correct columns
#' @return data.frame
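#' @examples
#' # Illustrative sketch: assumes the project configuration can be loaded by .LoadConfig()
#' \dontrun{
#' meta <- ReadMeta()
#' meta[, c("name", "version", "date", "file.exists")]
#' }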
#' @export
ReadMeta <- function () {
    .LoadConfig()
    path <- file.path(pkg.env$meta.dir, 'meta.csv')
    if (file.exists(path)) {
        meta <- read.csv(path, stringsAsFactors=FALSE)
    } else {
        return(.CreateEmptyMeta())
    }
    #meta <- FixMeta(meta)
    meta <- .VerifyMeta(meta)
    return(meta)
}

#' Fix the structure of the metadata dataframe
#'
#' Restores the structure of the metadata dataframe, which might be
#' necessary if, during a change to datatrack (e.g. a new version of the package),
#' extra columns are added, removed or reordered.
#' @param meta data.frame the existing metadata
#' @return data.frame the corrected metadata
#' @details
#' An empty correct meta dataframe is created, then each of the existing
#' columns is matched to a correct column where possible. If no match for an existing
#' column is found, it is discarded with a warning. If no match is found for a correct
#' column, it is left blank.
#'
#' In specific cases of upgrades, transformations might need to be made to the existing columns. If this
#' happens this function should be extended appropriately. For example, if the csv column is removed and a 'class' column
#' is added instead, then a specific transformation from existing csv = 1 rows to class = 'csv' needs to be coded.
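#' @examples
#' # Hedged sketch of a manual repair after upgrading datatrack; the "correct" column
#' # set is whatever .CreateEmptyMeta() currently produces.
#' \dontrun{
#' old.meta <- ReadMeta()
#' fixed.meta <- FixMeta(old.meta)
#' }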
#' @export
FixMeta <- function (meta) {

    .LoadConfig()

    correct.meta <- .CreateEmptyMeta()
    correct.names <- colnames(correct.meta)
    existing.names <- colnames(meta)
    if (length(correct.names) == length(existing.names) && all(correct.names == existing.names)) {
        return(meta)
    }

    warning('Existing meta dataframe is invalid. Attempting to fix it ...')

    .Report('Correct meta columns: ', paste(correct.names, collapse = ","), '. Existing meta columns: ',  paste(existing.names, collapse = ","))
    .Report('Missing columns: ', paste(setdiff(correct.names, existing.names), collapse = ","))
    .Report('Extra columns: ', paste(setdiff(existing.names, correct.names), collapse = ","))

    matching.names <- intersect(existing.names,correct.names)
    corrected.meta <- as.data.frame(matrix(NA, ncol = length(correct.names), nrow = nrow(meta)))
    colnames(corrected.meta) <- correct.names
    corrected.meta[,matching.names] <- meta[,matching.names]

    # if there are some columns from the existing meta that we couldn't find a place for in the
    # correct meta, save the existing meta with a different file name as a backup
    if (length(matching.names) < length(existing.names)) {
        .ArchiveMeta(meta, fn.note = "before_fixMeta")
        warning('Because some columns in the existing meta data.frame cannot be placed in the correct meta dataframe, the existing meta data frame will be saved to a backup')
    }

    return(corrected.meta)
}


#' Given a single row of metadata as a data.frame,
#' converts it to a list and parses the JSON-encoded values into lists
#' @param meta.df data.frame a single row of the metadata
#' @return list
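#' @examples
#' # Illustrative only: expand the first metadata row; assumes the project
#' # metadata contains at least one row.
#' \dontrun{
#' meta <- ReadMeta()
#' .ExpandMeta(meta[1, ])
#' }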
.ExpandMeta <- function (meta.df) {
    meta.list <- list()
    meta.list$version <- meta.df$version
    meta.list$params <- rjson::fromJSON(as.character(meta.df$params))
    meta.list$dependencies <- rjson::fromJSON(as.character(meta.df$dependencies))
    meta.list$date <- meta.df$date
    meta.list$name <- meta.df$name
    return(meta.list)
}

#' Discovers the correct value for the csv column based on the existence of a csv file
#'
#' @param meta data.frame optional; if omitted, will read from disk
#' @return data.frame
#' @details
#' For a metadata table that has a corrupted csv column or missing values, this will look at the files that
#' exist on disk and use that information to complete the csv column. This is mainly useful for bringing old
#' datatrack projects up to date with changes to datatrack.
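#' @examples
#' # Sketch: rebuild the csv column from the files present on disk
#' # (mainly useful when bringing an old datatrack project up to date)
#' \dontrun{
#' repaired <- .RepairCsvColumn()
#' }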
.RepairCsvColumn <- function (meta = NULL) {
    if (is.null(meta)) {
        meta <- ReadMeta()
    }
    
    file.paths.csv <- .DataobjectPath(meta$name, meta$version, rep(1, nrow(meta)))
    file.paths.object <- .DataobjectPath(meta$name, meta$version, rep(0, nrow(meta)))
    
    object.exists <- file.exists(file.paths.object)
    csv.exists <- file.exists(file.paths.csv)
    both.exist <- object.exists & csv.exists
    
    meta$csv[object.exists] <- 0
    meta$csv[csv.exists] <- 1
    
    #print(data.frame(name = meta$name, v = meta$version, csv.exists = csv.exists, csv.paths = file.paths.csv))
    
    
    if (sum(both.exist) > 0) {
        .Report("some conflicting data files were found. The following files exist for both .object and .csv. Defaulting to use csv.")
        conflicting <- paste(meta$name[both.exist], meta$version[both.exist], sep = ":v", collapse = " \n")
        .Report(conflicting)
    }
    
    return(meta)
    
}


#' Perform some meta verification
#'
#' Updates the "file.exists" column of the meta csv and sorts the meta data frame into the correct order
#'
#' @param meta data.frame optional; if omitted, will read from disk
#' @return data.frame
#' @details
#' Checks whether each dataobject's file exists on disk.
#' Files may get deleted, however this should not necessarily
#' mean the meta row should be deleted, since it can still be
#' used to show information about dependencies.
#' Also writes the meta again at the end in case anything changed.
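#' @examples
#' # Sketch: refresh the file.exists flags and re-sort, writing the result back to meta.csv
#' \dontrun{
#' meta <- .VerifyMeta()
#' }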
.VerifyMeta <- function (meta = NULL) {

    if (is.null(meta)) {
        meta <- ReadMeta()
    }
    
    meta <- .RepairCsvColumn(meta)

    file.paths <- .DataobjectPath(meta$name, meta$version, meta$csv)
    files.exist <- file.exists(file.paths) | file.exists(.ZipPath(file.paths))
    meta$file.exists <- as.integer(files.exist)

    if (!"col.names" %in% colnames(meta)) {
        meta$col.names = '';
    }
    if (!"callstack" %in% colnames(meta)) {
        meta$callstack = '';
    }
    meta <- .SortMeta(meta)
    .WriteMeta(meta)
    return(meta)
}

#' saves the metadata csv
#' @param meta data.frame
.WriteMeta <- function (meta) {
    path <- file.path(pkg.env$meta.dir, 'meta.csv')
    meta <- .SortMeta(meta)
    write.csv(meta, path, row.names = FALSE)
}

#' Sort the metadata
#'
#' Sorts the metadata into a consistent order
#' @param meta dataframe
#' @return data.frame
#' @details
#' There are some situations where we need the metadata to be in a consistent order,
#' for example when comparing metadata in tests.
#' In most cases sorting on meta$date would be sufficient, however there may be duplicate datetimes if
#' the data objects are generated in quick succession, so name and version are used as tie-breakers.
.SortMeta <- function (meta) {
    meta <- meta[order(meta$date, meta$name, meta$version, decreasing = TRUE), ]  
    return(meta)
}

#' Returns the metadata row for the given name/version pair
#'
#' @param name string
#' @param version int
#' @return list
#'
#' @details
#' Reads the metadata, filters to the correct row, converts to list,
#' then converts json encoded values to lists
#' 
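#' @examples
#' # Hypothetical names: retrieve the metadata for version 2 of a dataobject called "cleaned.data"
#' \dontrun{
#' info <- GetMeta("cleaned.data", 2)
#' info$params        # decoded from JSON into a list
#' info$dependencies  # decoded from JSON into a list
#' }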
#' @export
GetMeta <- function (name, version) {
  return(.GetMeta(name, version))
}

#' Returns the metadata row for the given name/version pair
#'
#' @param name string
#' @param version int
#' @return list
#'
#' @details
#' Reads the metadata, filters to the correct row, converts to list,
#' then converts json encoded values to lists
.GetMeta <- function (name, version) {
    meta <- ReadMeta()
    meta <- meta[meta$name == name & meta$version == version,]
    meta <- as.list(meta)
    meta$params <- rjson::fromJSON(meta$params)
    meta$dependencies <- rjson::fromJSON(meta$dependencies)
    return(meta)
}


#' Archive the metadata
#'
#' Saves a backup copy of the metadata dataframe to the archive directory.
#' TODO: every time the meta is written, make a copy of the old meta (1 per day)
#'
#' @param meta data.frame
#' @param fn.note character if supplied will insert the string into the filename
#' @return string the filename of the archived copy
#' @details
#' The archived version of the metadata will be saved in the archive dir within
#' the meta directory. The filename will have the form:
#' meta.bak.<fn.note>.<datetime>.csv
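#' @examples
#' # Sketch: back up the current metadata before a manual edit; produces a file such as
#' # archive/meta.bak.before_manual_edit.2019-01-01_12-00-00.csv
#' \dontrun{
#' .ArchiveMeta(ReadMeta(), fn.note = "before_manual_edit")
#' }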
.ArchiveMeta <- function (meta, fn.note = '') {

    if (fn.note != '') {
        fn.note = paste0(fn.note, '.')
    }
    fn <- paste0('meta.bak.', fn.note, format(Sys.time(), "%Y-%m-%d_%H-%M-%S"), '.csv')
    archive.dir <- file.path(pkg.env$meta.dir, 'archive')
    if (!file.exists(archive.dir)) {
        dir.create(archive.dir)
    }
    path <- file.path(archive.dir, fn)
    write.csv(meta, path, row.names = FALSE)
    .Report('Meta archived to file: ', fn)
    return(fn)

}

#' Creates an empty metadata dataframe with the correct columns
#' @return data.frame
.CreateEmptyMeta <- function () {

    # create a dummy meta row to get all the right columns
    # then remove the row to get an empty data frame
    dummy.meta <- .MakeMetaRow("", 0)
    return(dummy.meta[c(),])
}

#' Makes a 1-row data frame for the metadata of a saved dataobject
#' @param name string
#' @param v.num int
#' @param params list
#' @param dependencies list
#' @param date string
#' @param col.names character vector
#' @param callstack character vector
#' @param csv int whether it is saved as a csv or an R object
#' @param annotations mixed list or character
#' @return data.frame
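#' @examples
#' # Illustrative values only; params and dependencies are stored JSON-encoded in the row
#' \dontrun{
#' row <- .MakeMetaRow("my.dataobject", 1,
#'                     params = list(threshold = 0.5),
#'                     dependencies = list(),
#'                     csv = 1)
#' }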
.MakeMetaRow <- function (name,
                          v.num,
                          params = list(),
                          dependencies = list(),
                          date = NA,
                          col.names = NULL,
                          callstack = NULL,
                          csv = 0,
                          annotations = NULL) {

    if (is.na(date)) {
        date <- .DateTime()
    }

    sysinfo <- rjson::toJSON(.SessionInfo())

    row <- data.frame(name = name,
                      version = v.num,
                      params = .toCsvValue(params),
                      dependencies = .toCsvValue(dependencies),
                      date = date, file.exists = NA,
                      col.names = .toCsvValue(col.names),
                      callstack = .toCsvValue(callstack),
                      csv = csv,
                      annotations = .toCsvValue(annotations),
                      system = sysinfo)
    return(row)
}